## The goal for this project is to analyze data to help "developers" understand what type of apps are likely to attract more users.

In [1]:
import pandas as pd
import numpy as np
googleapps = pd.read_csv('./resources/googleplaystore.csv')

In [2]:
googleapps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [3]:
googleapps['Category'].value_counts()

FAMILY                 1972
GAME                   1144
TOOLS                   843
MEDICAL                 463
BUSINESS                460
PRODUCTIVITY            424
PERSONALIZATION         392
COMMUNICATION           387
SPORTS                  384
LIFESTYLE               382
FINANCE                 366
HEALTH_AND_FITNESS      341
PHOTOGRAPHY             335
SOCIAL                  295
NEWS_AND_MAGAZINES      283
SHOPPING                260
TRAVEL_AND_LOCAL        258
DATING                  234
BOOKS_AND_REFERENCE     231
VIDEO_PLAYERS           175
EDUCATION               156
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     137
FOOD_AND_DRINK          127
HOUSE_AND_HOME           88
AUTO_AND_VEHICLES        85
LIBRARIES_AND_DEMO       85
WEATHER                  82
ART_AND_DESIGN           65
EVENTS                   64
PARENTING                60
COMICS                   60
BEAUTY                   53
1.9                       1
Name: Category, dtype: int64

In [4]:
googleapps.Price.value_counts()

0         10040
$0.99       148
$2.99       129
$1.99        73
$4.99        72
          ...  
$15.99        1
$25.99        1
$4.29         1
$2.95         1
$1.96         1
Name: Price, Length: 93, dtype: int64

### Pt. - IIa Investigate the 1.9 in "Category" column

In [5]:
googleapps.loc[10470:10474]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10470,Jazz Wi-Fi,COMMUNICATION,3.4,49,4.0M,"10,000+",Free,0,Everyone,Communication,"February 10, 2017",0.1,2.3 and up
10471,Xposed Wi-Fi-Pwd,PERSONALIZATION,3.5,1042,404k,"100,000+",Free,0,Everyone,Personalization,"August 5, 2014",3.0.0,4.0.3 and up
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,
10473,osmino Wi-Fi: free WiFi,TOOLS,4.2,134203,4.1M,"10,000,000+",Free,0,Everyone,Tools,"August 7, 2018",6.06.14,4.4 and up
10474,Sat-Fi Voice,COMMUNICATION,3.4,37,14M,"1,000+",Free,0,Everyone,Communication,"November 21, 2014",2.2.1.5,2.2 and up


### ...it is missing a category data. Let us delete the row entirely.

In [6]:
googleapps.drop([10472], inplace=True)

In [7]:
googleapps.loc[10470:10474]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10470,Jazz Wi-Fi,COMMUNICATION,3.4,49,4.0M,"10,000+",Free,0,Everyone,Communication,"February 10, 2017",0.1,2.3 and up
10471,Xposed Wi-Fi-Pwd,PERSONALIZATION,3.5,1042,404k,"100,000+",Free,0,Everyone,Personalization,"August 5, 2014",3.0.0,4.0.3 and up
10473,osmino Wi-Fi: free WiFi,TOOLS,4.2,134203,4.1M,"10,000,000+",Free,0,Everyone,Tools,"August 7, 2018",6.06.14,4.4 and up
10474,Sat-Fi Voice,COMMUNICATION,3.4,37,14M,"1,000+",Free,0,Everyone,Communication,"November 21, 2014",2.2.1.5,2.2 and up


### Pt. IIb - Searching for duplicates

In [8]:
googleapps['App'].duplicated().value_counts()

False    9659
True     1181
Name: App, dtype: int64

In [9]:
duplicate_gApps = googleapps[googleapps['App'].duplicated()].copy()

In [10]:
duplicate_gApps['App'].value_counts()

ROBLOX                                               8
CBS Sports App - Scores, News, Stats & Watch Live    7
8 Ball Pool                                          6
ESPN                                                 6
Duolingo: Learn Languages Free                       6
                                                    ..
Digit Save Money Automatically                       1
All Social Networks                                  1
Google Translate                                     1
PBS KIDS Video                                       1
Fruits Bomb                                          1
Name: App, Length: 798, dtype: int64

In [11]:
duplicate_gApps[duplicate_gApps.App == 'Instagram']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2604,Instagram,SOCIAL,4.5,66577446,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
2611,Instagram,SOCIAL,4.5,66577313,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
3909,Instagram,SOCIAL,4.5,66509917,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device


In [12]:
googleapps[googleapps.App.str.contains('Facebook')].head(8)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
201,Facebook Pages Manager,BUSINESS,4.0,1279184,Varies with device,"50,000,000+",Free,0,Everyone,Business,"August 2, 2018",Varies with device,Varies with device
215,Facebook Ads Manager,BUSINESS,4.1,19023,Varies with device,"1,000,000+",Free,0,Everyone,Business,"August 1, 2018",99.0.0.35.75,4.1 and up
2544,Facebook,SOCIAL,4.1,78158306,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device
2546,Facebook Lite,SOCIAL,4.3,8606259,Varies with device,"500,000,000+",Free,0,Teen,Social,"August 1, 2018",Varies with device,Varies with device
2561,Who Viewed My Facebook Profile - Stalkers Visi...,SOCIAL,4.6,271445,9.9M,"5,000,000+",Free,0,Everyone,Social,"June 24, 2018",4.1.1,4.0.3 and up
2572,Facebook Local,SOCIAL,4.2,4751,Varies with device,"1,000,000+",Free,0,Teen,Social,"June 30, 2018",13.0,4.1 and up
2577,HTC Social Plugin - Facebook,SOCIAL,3.6,13223,2.8M,"10,000,000+",Free,0,Mature 17+,Social,"June 8, 2016",8.00.752746,4.4 and up
3943,Facebook,SOCIAL,4.1,78128208,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device


In [13]:
print(googleapps.loc[3943])

App                         Facebook
Category                      SOCIAL
Rating                           4.1
Reviews                     78128208
Size              Varies with device
Installs              1,000,000,000+
Type                            Free
Price                              0
Content Rating                  Teen
Genres                        Social
Last Updated          August 3, 2018
Current Ver       Varies with device
Android Ver       Varies with device
Name: 3943, dtype: object


In [14]:
googleapps.loc[2544]

App                         Facebook
Category                      SOCIAL
Rating                           4.1
Reviews                     78158306
Size              Varies with device
Installs              1,000,000,000+
Type                            Free
Price                              0
Content Rating                  Teen
Genres                        Social
Last Updated          August 3, 2018
Current Ver       Varies with device
Android Ver       Varies with device
Name: 2544, dtype: object

### Pt. IIb - Pulling a sample of the duplicates to test our methodology – remove all duplicates except the one with the most amount of ratings

#### Let us remind ourselves how many duplicates there are 🤔

In [15]:
googleapps['App'].duplicated().value_counts()

False    9659
True     1181
Name: App, dtype: int64

### Pt. IIc - Testing our methodolody for removing duplicates

In [16]:
ig = googleapps[googleapps.App == 'Instagram'].copy()

In [17]:
ig.duplicated(subset='App')

2545    False
2604     True
2611     True
3909     True
dtype: bool

In [18]:
ig

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2545,Instagram,SOCIAL,4.5,66577313,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
2604,Instagram,SOCIAL,4.5,66577446,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
2611,Instagram,SOCIAL,4.5,66577313,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
3909,Instagram,SOCIAL,4.5,66509917,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device


In [19]:
ig.sort_values(by=['Reviews'], inplace=True, ascending=False) #sort values by Reviews so the highest number of values for a particular set of duplicates rises to the top.
ig

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2604,Instagram,SOCIAL,4.5,66577446,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
2545,Instagram,SOCIAL,4.5,66577313,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
2611,Instagram,SOCIAL,4.5,66577313,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
3909,Instagram,SOCIAL,4.5,66509917,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device


In [20]:
ig.drop_duplicates(subset=['App'],keep='first', inplace=True) #Then drop all duplicates except the very first one.

In [21]:
ig

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2604,Instagram,SOCIAL,4.5,66577446,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device


### Pt. IIIa - Applying our methodology to the dataframe

In [22]:
googleapps.sort_values(by=['Reviews'], inplace=True, ascending=False)

In [23]:
googleapps.drop_duplicates(subset=['App'], keep='first', inplace=True)

#### Now let us check how many duplicates there are after applying our methodology

In [24]:
googleapps['App'].duplicated().value_counts()

False    9659
Name: App, dtype: int64

#### Huzzah! 🙌 zero duplicates! But let us run some tests just to be sure 

In [25]:
googleapps[googleapps.App == 'ROBLOX']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2206,ROBLOX,FAMILY,4.5,4450890,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up


In [26]:
googleapps.App.value_counts()

Escaping the Prison                                1
HAWK – Force of an Arcade Shooter. Shoot 'em up    1
Gboard - the Google Keyboard                       1
Financial Calculator India                         1
Cathy AH                                           1
                                                  ..
Guardians of Ancora                                1
ZOOKEEPER DX TouchEdition                          1
PulsePoint AED                                     1
Word Link                                          1
Fame Boom for Real Followers, Likes                1
Name: App, Length: 9659, dtype: int64

### Pt. IVa - More cleaning: filtering out non-English characters

In [27]:
googleapps[googleapps['App'].map(lambda x: x.isascii())]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2989,GollerCepte Live Score,SPORTS,4.2,9992,31M,"1,000,000+",Free,0,Everyone,Sports,"May 23, 2018",6.5,4.1 and up
4970,Ad Block REMOVER - NEED ROOT,TOOLS,3.3,999,91k,"100,000+",Free,0,Everyone,Tools,"December 17, 2013",3.2,2.2 and up
2723,SnipSnap Coupon App,SHOPPING,4.2,9975,18M,"1,000,000+",Free,0,Everyone,Shopping,"January 22, 2018",1.4,4.3 and up
3079,US Open Tennis Championships 2018,SPORTS,4.0,9971,33M,"1,000,000+",Free,0,Everyone,Sports,"June 5, 2018",7.1,5.0 and up
3229,DreamTrips,TRAVEL_AND_LOCAL,4.7,9971,22M,"500,000+",Free,0,Teen,Travel & Local,"August 6, 2018",1.28.1,5.0 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2465,ear super hearing,MEDICAL,,0,1.4M,5+,Free,0,Everyone,Medical,"July 31, 2018",1.0 Super Ear Hearing,4.0 and up
9925,Reisedealz.eu,SOCIAL,,0,10M,10+,Free,0,Everyone,Social,"September 24, 2015",1.0,4.0 and up
9928,EU Whoiswho,BUSINESS,,0,2.7M,10+,Free,0,Everyone,Business,"December 1, 2016",0.0.1,4.0 and up
9929,EU Brazil Green Business Forum,PRODUCTIVITY,,0,8.7M,10+,Free,0,Everyone,Productivity,"April 18, 2017",1.7.1,2.1 and up


In [28]:
googleapps[googleapps['App'].str.contains('爱奇')]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver


In [29]:
len(googleapps)

9659

#### Now that we have only data with English characters, we will filter the data once more to look at free apps versus paid apps

### Pt. Va - Isolating free apps

In [30]:
free_gapps = googleapps[googleapps['Price'] == 0].copy()

In [31]:
free_gapps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver


In [32]:
googleapps.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

### Pt. Vb - oops the Price column is an object data type, we'll need too change it to a float

In [33]:
googleapps['Price'] = googleapps['Price'].str.replace('$', '').astype(float) #replace dollar signs with nothing, then switch dtype to floats

In [34]:
free_gapps = googleapps[googleapps['Price'] == 0].copy()
free_gapps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2989,GollerCepte Live Score,SPORTS,4.2,9992,31M,"1,000,000+",Free,0.0,Everyone,Sports,"May 23, 2018",6.5,4.1 and up
4970,Ad Block REMOVER - NEED ROOT,TOOLS,3.3,999,91k,"100,000+",Free,0.0,Everyone,Tools,"December 17, 2013",3.2,2.2 and up
2723,SnipSnap Coupon App,SHOPPING,4.2,9975,18M,"1,000,000+",Free,0.0,Everyone,Shopping,"January 22, 2018",1.4,4.3 and up
3079,US Open Tennis Championships 2018,SPORTS,4.0,9971,33M,"1,000,000+",Free,0.0,Everyone,Sports,"June 5, 2018",7.1,5.0 and up
3229,DreamTrips,TRAVEL_AND_LOCAL,4.7,9971,22M,"500,000+",Free,0.0,Teen,Travel & Local,"August 6, 2018",1.28.1,5.0 and up


#### How many free apps are there in the data?

In [35]:
free_gapps['Price'].value_counts()

0.0    8903
Name: Price, dtype: int64

#### and how many free apps are there in the original dataframe?

In [36]:
googleapps['Price'].value_counts()

0.00      8903
0.99       145
2.99       124
1.99        73
4.99        70
          ... 
299.99       1
154.99       1
1.59         1
46.99        1
400.00       1
Name: Price, Length: 92, dtype: int64

#### Sweet, our math checks out 😎

### Pt. VIa - analyzing free apps on the Google Play store to see which Categories and Genres are the most common

In [37]:
free_gapps['Category'].value_counts().head()

FAMILY       1693
GAME          861
TOOLS         750
BUSINESS      408
LIFESTYLE     350
Name: Category, dtype: int64

In [38]:
free_gapps['Genres'].value_counts().head()

Tools            749
Entertainment    542
Education        480
Business         408
Lifestyle        349
Name: Genres, dtype: int64

In [39]:
free_gapps_category = free_gapps.groupby(['Category'])['App'].agg(total='count').reset_index()
free_gapps_category.sort_values('total', ascending=False).head()

Unnamed: 0,Category,total
11,FAMILY,1693
14,GAME,861
29,TOOLS,750
4,BUSINESS,408
18,LIFESTYLE,350


In [40]:
free_gapps['Reviews'] = free_gapps['Reviews'].astype(float)

In [55]:
free_gapps[free_gapps['Category'].str.contains('family|gaming|tools', case=False)].sort_values('Reviews', ascending=False).head(15)[['App','Category','Size','Rating','Reviews','Installs']]

Unnamed: 0,App,Category,Size,Rating,Reviews,Installs
4005,Clean Master- Space Cleaner & Antivirus,TOOLS,Varies with device,4.7,42916526.0,"500,000,000+"
7536,"Security Master - Antivirus, VPN, AppLock, Boo...",TOOLS,Varies with device,4.7,24900999.0,"500,000,000+"
4568,"360 Security - Free Antivirus, Booster, Cleaner",TOOLS,Varies with device,4.6,16771865.0,"100,000,000+"
8896,DU Battery Saver - Battery Charger & Battery Life,TOOLS,14M,4.5,13479633.0,"100,000,000+"
8894,Cache Cleaner-DU Speed Booster (booster & clea...,TOOLS,15M,4.5,12759815.0,"100,000,000+"
2050,Minion Rush: Despicable Me Official Game,FAMILY,Varies with device,4.5,10216997.0,"100,000,000+"
3975,Hay Day,FAMILY,94M,4.5,10053186.0,"100,000,000+"
7550,Battery Doctor-Battery Life Saver & Battery Co...,TOOLS,17M,4.5,8190074.0,"100,000,000+"
3234,Google,TOOLS,Varies with device,4.4,8033493.0,"1,000,000,000+"
3255,SHAREit - Transfer & Share,TOOLS,17M,4.6,7790693.0,"500,000,000+"


### Conclusion

#### In closing, based on the data analysis that we have just done and the table right above, I recommend an app profile that is either family, gaming or tool related.