In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
df = pd.read_csv('googleplaystore.csv')

In [3]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
rows_to_shift = df[df['Last Updated'] == '1.0.19'].index
df.iloc[rows_to_shift, 1: -1] = df.iloc[rows_to_shift, 1: -1].shift(1, axis=1)
# shifted columns by 1 to the right

In [5]:
df['Last Updated'] = pd.to_datetime(df['Last Updated'], format='%B %d, %Y')

In [6]:
df[df['Type'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
9148,Command & Conquer: Rivals,FAMILY,,0,Varies with device,0,,0,Everyone 10+,Strategy,2018-06-28,Varies with device,Varies with device


In [7]:
df.isnull().sum() / len(df) * 100

App                0.000000
Category           0.009224
Rating            13.596532
Reviews            0.000000
Size               0.000000
Installs           0.000000
Type               0.009224
Price              0.000000
Content Rating     0.000000
Genres             0.009224
Last Updated       0.000000
Current Ver        0.073794
Android Ver        0.027673
dtype: float64

In [8]:
# df.fillna(-9999, inplace=True)

In [9]:
df['Size'].value_counts()

Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
                      ... 
961k                     1
103k                     1
816k                     1
144k                     1
544k                     1
Name: Size, Length: 461, dtype: int64

In [10]:
def cleaning_sizes():
  df['Size'].replace('Varies with device', '0', inplace=True)
  df['Size'] = df['Size'].apply(lambda x: x.replace('M', '000000'))
  df['Size'] = df['Size'].apply(lambda x: x.replace('k', '000'))
  df['Size'] = df['Size'].apply(lambda x: x.replace('+', ''))
  df['Size'] = df['Size'].apply(lambda x: x.replace(',', ''))

In [11]:
cleaning_sizes()

In [12]:
df['Size'].value_counts()

0           1695
11000000     198
12000000     196
14000000     194
13000000     191
            ... 
953000         1
898000         1
219000         1
97000          1
801000         1
Name: Size, Length: 461, dtype: int64

In [13]:
df['Size'] = df['Size'].astype(float)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             10841 non-null  object        
 1   Category        10840 non-null  object        
 2   Rating          9367 non-null   object        
 3   Reviews         10841 non-null  object        
 4   Size            10841 non-null  float64       
 5   Installs        10841 non-null  object        
 6   Type            10840 non-null  object        
 7   Price           10841 non-null  object        
 8   Content Rating  10841 non-null  object        
 9   Genres          10840 non-null  object        
 10  Last Updated    10841 non-null  datetime64[ns]
 11  Current Ver     10833 non-null  object        
 12  Android Ver     10838 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(11)
memory usage: 1.1+ MB


In [15]:
def clean_reviews():
  df['Reviews']=df['Reviews'].astype(str)
  df['Reviews'] = df['Reviews'].apply(lambda x: x.replace('M', '000000'))
  df['Reviews'] = df['Reviews'].apply(lambda x: x.replace('.', ''))
  df['Reviews']=df['Reviews'].astype(float)

clean_reviews()

In [16]:
df['Installs'] = df['Installs'].apply(lambda x: x.replace(',', ''))
df['Installs'] = df['Installs'].apply(lambda x: x.replace('+', ''))
df['Installs'] = df['Installs'].apply(lambda x: x.replace('Free', '0'))

In [17]:
df['Installs'] = df['Installs'].astype(float)

In [18]:
df['Price'].value_counts()

0          10041
$0.99        148
$2.99        129
$1.99         73
$4.99         72
           ...  
$2.59          1
$28.99         1
$2.50          1
$379.99        1
$1.26          1
Name: Price, Length: 92, dtype: int64

In [19]:
df['Price'] = df['Price'].apply(lambda x: x.replace('$', ''))
df['Price'] = df['Price'].apply(lambda x: x.replace('Everyone', '-1'))

In [20]:
df['Price'] = df['Price'].astype(float)

In [21]:
df['Content Rating'].value_counts()

Everyone           8715
Teen               1208
Mature 17+          499
Everyone 10+        414
Adults only 18+       3
Unrated               2
Name: Content Rating, dtype: int64

In [22]:
df['Content Rating'] = df['Content Rating'].replace(-9999, 'Unrated')

In [23]:
df['Last Updated'].value_counts()

2018-08-03    326
2018-08-02    304
2018-07-31    294
2018-08-01    285
2018-07-30    211
             ... 
2015-11-02      1
2015-09-13      1
2015-06-26      1
2014-10-30      1
2013-09-27      1
Name: Last Updated, Length: 1377, dtype: int64

In [24]:
df['Last Updated'].astype(str)

0        2018-01-07
1        2018-01-15
2        2018-08-01
3        2018-06-08
4        2018-06-20
            ...    
10836    2017-07-25
10837    2018-07-06
10838    2017-01-20
10839    2015-01-19
10840    2018-07-25
Name: Last Updated, Length: 10841, dtype: object

In [25]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19000000.0,10000.0,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14000000.0,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000.0,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25000000.0,50000000.0,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000.0,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             10841 non-null  object        
 1   Category        10840 non-null  object        
 2   Rating          9367 non-null   object        
 3   Reviews         10841 non-null  float64       
 4   Size            10841 non-null  float64       
 5   Installs        10841 non-null  float64       
 6   Type            10840 non-null  object        
 7   Price           10841 non-null  float64       
 8   Content Rating  10841 non-null  object        
 9   Genres          10840 non-null  object        
 10  Last Updated    10841 non-null  datetime64[ns]
 11  Current Ver     10833 non-null  object        
 12  Android Ver     10838 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(8)
memory usage: 1.1+ MB


In [27]:
df['Last Updated'].dt.day_name()

0           Sunday
1           Monday
2        Wednesday
3           Friday
4        Wednesday
           ...    
10836      Tuesday
10837       Friday
10838       Friday
10839       Monday
10840    Wednesday
Name: Last Updated, Length: 10841, dtype: object

In [29]:
df[df['App'] == 'ROBLOX']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1653,ROBLOX,GAME,4.5,4447388.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1701,ROBLOX,GAME,4.5,4447346.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1748,ROBLOX,GAME,4.5,4448791.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1841,ROBLOX,GAME,4.5,4449882.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1870,ROBLOX,GAME,4.5,4449910.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
2016,ROBLOX,FAMILY,4.5,4449910.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
2088,ROBLOX,FAMILY,4.5,4450855.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
2206,ROBLOX,FAMILY,4.5,4450890.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
4527,ROBLOX,FAMILY,4.5,4443407.0,67000000.0,100000000.0,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up


In [30]:
df.drop_duplicates(subset=['App'], keep='first', inplace=True)

In [31]:
len(df['App'])

9660

In [32]:
df['Genres'].value_counts()

Tools                                826
Entertainment                        561
Education                            510
Business                             420
Medical                              395
                                    ... 
Travel & Local;Action & Adventure      1
Casual;Music & Video                   1
Health & Fitness;Education             1
Role Playing;Brain Games               1
Music & Audio;Music & Video            1
Name: Genres, Length: 118, dtype: int64

In [33]:
df['Genres'] = df['Genres'].str.split(';') # split the string into a list

In [34]:
df[df['Genres'].map(type) == float]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,,1.9,190.0,3.0,1000.0,Free,0.0,Everyone,,2018-02-11,1.0.19,


In [35]:
df['Genres'].loc[10472] = 'Art & Design'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [36]:
df['Genres'].loc[10472]

'Art & Design'

In [37]:
df['Genres'] = df['Genres'].apply(lambda x: x[0])

In [38]:
df['Genres'].value_counts()

Tools                      827
Entertainment              592
Education                  580
Business                   420
Medical                    395
Personalization            376
Productivity               374
Lifestyle                  370
Finance                    345
Sports                     335
Communication              316
Action                     311
Health & Fitness           290
Photography                281
News & Magazines           254
Social                     239
Casual                     225
Books & Reference          225
Travel & Local             219
Simulation                 206
Shopping                   202
Arcade                     199
Dating                     171
Video Players & Editors    165
Puzzle                     142
Maps & Navigation          131
Role Playing               115
Food & Drink               112
Racing                     108
Educational                102
Strategy                    99
Auto & Vehicles             85
Librarie

In [39]:
df.loc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                              NaN
Rating                                                1.9
Reviews                                             190.0
Size                                                  3.0
Installs                                           1000.0
Type                                                 Free
Price                                                 0.0
Content Rating                                   Everyone
Genres                                                  A
Last Updated                          2018-02-11 00:00:00
Current Ver                                        1.0.19
Android Ver                                           NaN
Name: 10472, dtype: object

In [40]:
df['Rating'] = df['Rating'].astype(float) # set the type to float (inplace = TRUE)

In [43]:
df['Rating'] = df['Rating'].fillna(0) 
# fill the NaN values with 0 because the rating is between 1 and 5, not 0 and 5
# we can seperate out the 0 ratings because they are not useful, but still require the other columns to be filled

In [44]:
df['Rating'].value_counts()

0.0    1463
4.3     897
4.4     895
4.5     848
4.2     810
4.6     683
4.1     621
4.0     513
4.7     442
3.9     359
3.8     286
5.0     271
3.7     224
4.8     221
3.6     167
3.5     156
3.4     126
3.3     100
4.9      85
3.0      81
3.1      69
3.2      63
2.9      45
2.8      40
2.6      24
2.7      23
2.3      20
2.5      20
2.4      19
1.0      16
2.2      14
2.0      12
1.9      12
1.7       8
1.8       8
2.1       8
1.6       4
1.5       3
1.4       3
1.2       1
Name: Rating, dtype: int64

In [45]:
df['Rating'].isnull().sum()

0

In [48]:
df.isnull().sum()

App               0
Category          1
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64

In [51]:
df['Category'].loc[10472] = 'ART_AND_DESIGN'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


# Refactoring our Code


In [55]:
df['Type'].loc[9148] = 'Free'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [56]:
df['Type'].loc[9148]

'Free'

In [59]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64

In [62]:
df[df['Current Ver'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
15,Learn To Draw Kawaii Characters,ART_AND_DESIGN,3.2,55.0,2.7,5000.0,Free,0.0,Everyone,Art & Design,2018-06-06,,4.2 and up
1553,Market Update Helper,LIBRARIES_AND_DEMO,4.1,20145.0,11000.0,1000000.0,Free,0.0,Everyone,Libraries & Demo,2013-02-12,,1.5 and up
6322,Virtual DJ Sound Mixer,TOOLS,4.2,4010.0,8.7,500000.0,Free,0.0,Everyone,Tools,2017-05-10,,4.0 and up
6803,BT Master,FAMILY,0.0,0.0,222000.0,100.0,Free,0.0,Everyone,Education,2016-11-06,,1.6 and up
7333,Dots puzzle,FAMILY,4.0,179.0,14000000.0,50000.0,Paid,0.99,Everyone,Puzzle,2018-04-18,,4.0 and up
7407,Calculate My IQ,FAMILY,0.0,44.0,7.2,10000.0,Free,0.0,Everyone,Entertainment,2017-04-03,,2.3 and up
7730,UFO-CQ,TOOLS,0.0,1.0,237000.0,10.0,Paid,0.99,Everyone,Tools,2016-07-04,,2.0 and up
10342,La Fe de Jesus,BOOKS_AND_REFERENCE,0.0,8.0,658000.0,1000.0,Free,0.0,Everyone,Books & Reference,2017-01-31,,3.0 and up


In [67]:
df['Current Ver'] = df['Current Ver'].fillna('0')

In [68]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       3
dtype: int64

In [71]:
df['Android Ver'] = df['Android Ver'].fillna('0')

In [72]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

# Export Data

In [73]:
# df.to_csv('googleplaystore_cleaned.csv', index=False)