### Setup

In [31]:
import numpy as np
import pandas as pd
import re

apps = pd.read_csv('~/greenfox/data-science/week-01/googleplaystore.csv')

### Cleaning DataFrame

In [32]:
apps = apps.drop_duplicates()

apps.loc[apps['Reviews'] == '3.0M']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [33]:
apps.loc[10472] = apps.loc[10472].shift(periods=1, axis=0, freq=None)
apps.loc[10472, 'App'] = 'Life Made WI-Fi Touchscreen Photo Frame'
apps.loc[10472, 'Category'] = np.nan

In [34]:
apps['Reviews'] = apps['Reviews'].astype('int64')
apps.dtypes

App               object
Category          object
Rating            object
Reviews            int64
Size              object
Installs          object
Type              object
Price             object
Content Rating    object
Genres            object
Last Updated      object
Current Ver       object
Android Ver       object
dtype: object

In [35]:
min = apps.Reviews.min()
print(min)
apps.loc[apps.Reviews == min, ['App', 'Reviews']].head()

0


Unnamed: 0,App,Reviews
478,Truth or Dare Pro,0
479,"Private Dating, Hide App- Blue for PrivacyHider",0
621,Dating White Girls,0
623,Geeks Dating,0
627,CAM5678 Video Chat,0


In [36]:
apps['Rating'] = apps['Rating'].astype('float64')
print(apps['Rating'].mean())
print(apps['Rating'].median())
print(apps['Rating'].mode())

4.187619475992361
4.3
0    4.4
dtype: float64


### Duplicates present after removal

In [37]:
apps.loc[apps.App == 'ROBLOX']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1653,ROBLOX,GAME,4.5,4447388,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
1701,ROBLOX,GAME,4.5,4447346,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
1748,ROBLOX,GAME,4.5,4448791,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
1841,ROBLOX,GAME,4.5,4449882,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
1870,ROBLOX,GAME,4.5,4449910,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
2016,ROBLOX,FAMILY,4.5,4449910,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
2088,ROBLOX,FAMILY,4.5,4450855,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
2206,ROBLOX,FAMILY,4.5,4450890,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
4527,ROBLOX,FAMILY,4.5,4443407,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up


### Conversion of Size

In [38]:
regex = re.compile(r'^(\d+)[.,]?\d*([km])$', re.IGNORECASE)
regex.match(apps['Size'][0])

order = ['b', 'k', 'm', 'gb', 'tb', 'pb']

In [39]:
def convert_sizes(row):    
    if regex.match(str(row.Size)):
        grp_1 = regex.match(row.Size).group(1)
        grp_2 = regex.match(row.Size).group(2)
        row.Size = (int(float(grp_1) * (1024**order.index(grp_2.lower()))))
    else:
        row.Size = np.nan
    
    return row

apps = apps.apply(convert_sizes, axis='columns')

In [40]:
apps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19922944.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14680064.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8388608.0,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,26214400.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2097152.0,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [41]:
print(apps.Size.mean())
print(apps.Size.median())
print(apps.Size.mode())

22130396.52173913
13631488.0
0    3145728.0
dtype: float64


### Most popular genres

In [43]:
apps.Genres.str.get_dummies(sep=';').sum().sort_values(ascending=False).head()

Tools            843
Education        673
Entertainment    628
Business         427
Medical          408
dtype: int64