# Google Play Store Apps Analysis

This project seeks to perfomr exploratory data analysis (EDA) on the Google Play Store Apps dataset. 

- Dataset: https://www.kaggle.com/lava18/google-play-store-apps


In [1]:
# Imports

import pandas as pd
import numpy as np
import os

In [116]:
data = pd.read_csv('data/googleplaystore.csv')

#### 1. Top 5 Rows of the dataset

In [117]:
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


#### 2. Last 3 Rows of the dataset

In [118]:
data.tail(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device
10840,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,4.5,398307,19M,"10,000,000+",Free,0,Everyone,Lifestyle,"July 25, 2018",Varies with device,Varies with device


#### 3. Find Shape of Our Dataset (Number of Rows & Number of Columns)


In [119]:
data.shape

(10841, 13)

#### 4. Get Information About Our Dataset 


In [120]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


#### 5. Get Overall Statistics 


In [121]:
data.describe(include='all')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,9367.0,10841.0,10841,10841,10840,10841.0,10840,10841,10841,10833,10838
unique,9660,34,,6002.0,462,22,3,93.0,6,120,1378,2832,33
top,ROBLOX,FAMILY,,0.0,Varies with device,"1,000,000+",Free,0.0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,,596.0,1695,1579,10039,10040.0,8714,842,326,1459,2451
mean,,,4.193338,,,,,,,,,,
std,,,0.537431,,,,,,,,,,
min,,,1.0,,,,,,,,,,
25%,,,4.0,,,,,,,,,,
50%,,,4.3,,,,,,,,,,
75%,,,4.5,,,,,,,,,,


#### 6. Total Number of App Titles Containing "Astrology"


In [122]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [125]:
astrology_apps = len(data[data['App'].str.contains('Astrology', case=False)])

astrology_apps

3

In [142]:
data[data['App'].str.contains('Astrology', case=False)]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1570,Horoscopes – Daily Zodiac Horoscope and Astrology,LIFESTYLE,4.6,161143.0,11M,"10,000,000+",Free,0,Everyone 10+,Lifestyle,"June 25, 2018",5.2.4(881),4.0.3 and up
1592,သိင်္ Astrology - Min Thein Kha BayDin,LIFESTYLE,4.7,2225.0,15M,"100,000+",Free,0,Everyone,Lifestyle,"July 26, 2018",4.2.1,4.0.3 and up
10840,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,4.5,398307.0,19M,"10,000,000+",Free,0,Everyone,Lifestyle,"July 25, 2018",Varies with device,Varies with device


#### 7. Find Average App Rating


In [126]:
data['Rating'].mean()

4.193338315362448

#### 8.  Find Total Number of Unique Categories


In [127]:
data['Category'].nunique()

34

#### 9. Top 10 Categories Getting The Highest Average Rating?


In [128]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [129]:
highest_rating = data.groupby('Category')['Rating'].mean().sort_values(ascending=False)
highest_rating[:10]

Category
1.9                    19.000000
EVENTS                  4.435556
EDUCATION               4.389032
ART_AND_DESIGN          4.358065
BOOKS_AND_REFERENCE     4.346067
PERSONALIZATION         4.335987
PARENTING               4.300000
GAME                    4.286326
BEAUTY                  4.278571
HEALTH_AND_FITNESS      4.277104
Name: Rating, dtype: float64

#### 10. Find Total Number of Apps with 5 Star Ratings


In [130]:
data.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [131]:
five_star_apps = len(data[data['Rating']==5.0])
five_star_apps

274

#### 11. Find Average Value of Reviews


In [132]:
data['Reviews'].dtype

dtype('O')

In [48]:
data['Reviews'] = data['Reviews'].astype('float')

ValueError: could not convert string to float: '3.0M'

In [133]:
# remove 'M' from row with Reviews of 3.0M

data[data['Reviews']=='3.0M']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [134]:
data['Reviews'].replace('3.0M', 3.0, inplace=True)

In [135]:
data['Reviews'] = data['Reviews'].astype('float')

In [136]:
data['Reviews'].mean()

444111.9265750392

#### 12. Find Total Number of Free and Paid Apps


In [137]:
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [138]:
data['Type'].unique()

array(['Free', 'Paid', nan, '0'], dtype=object)

In [139]:
# Free Apps

free_apps = data['Type']=='Free'

len(data[free_apps])

10039

In [140]:
# Paid Apps

paid_apps = data['Type']=='Paid'

len(data[paid_apps])

800

#### 13.  Which App Has the Most Reviews?


In [143]:
data[data['Reviews'].max()==data['Reviews']]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2544,Facebook,SOCIAL,4.1,78158306.0,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device


#### 14. Top 5 Apps with the Highest Reviews


In [144]:
index = data['Reviews'].sort_values(ascending=False).head().index

In [145]:
data.iloc[index]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2544,Facebook,SOCIAL,4.1,78158306.0,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device
3943,Facebook,SOCIAL,4.1,78128208.0,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device
381,WhatsApp Messenger,COMMUNICATION,4.4,69119316.0,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device
336,WhatsApp Messenger,COMMUNICATION,4.4,69119316.0,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device
3904,WhatsApp Messenger,COMMUNICATION,4.4,69109672.0,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device


#### 15. Find Average Rating of Free and Paid Apps


In [146]:
data.groupby('Type')['Rating'].mean()

Type
0       19.000000
Free     4.186203
Paid     4.266615
Name: Rating, dtype: float64

#### 16. Top  5 Apps with the Most Installs


In [101]:
data['Installs'].unique()

array(['10000', '500000', '5000000', '50000000', '100000', '50000',
       '1000000', '10000000', '5000', '100000000', '1000000000', '1000',
       '500000000', '50', '100', '500', '10', '1', '5', '0', 'Free'],
      dtype=object)

In [147]:
data['Installs_1'] = data['Installs']

data[data['Installs_1']=='Free']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Installs_1
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,,Free


In [148]:
# Replace 'Free' string '0'

data['Installs_1'] = data['Installs_1'].str.replace('Free', '0')

In [90]:
# Error due to Installs values containing string elements

data['Installs_1'] = data['Installs_1'].astype('int64')

ValueError: could not convert string to float: '10,000+'

In [149]:
# Remove ',' and '+' string from values

data['Installs_1'] = data['Installs_1'].str.replace(',','')

data['Installs_1'] = data['Installs_1'].str.removesuffix('+')

In [150]:
data['Installs_1'] = data['Installs_1'].astype('int64')

In [151]:
index = data['Installs_1'].sort_values(ascending=False).head().index

In [152]:
data.iloc[index]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Installs_1
3896,Subway Surfers,GAME,4.5,27711703.0,76M,"1,000,000,000+",Free,0,Everyone 10+,Arcade,"July 12, 2018",1.90.0,4.1 and up,1000000000
3943,Facebook,SOCIAL,4.1,78128208.0,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device,1000000000
335,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56642847.0,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 1, 2018",Varies with device,Varies with device,1000000000
3523,Google Drive,PRODUCTIVITY,4.4,2731211.0,Varies with device,"1,000,000,000+",Free,0,Everyone,Productivity,"August 6, 2018",Varies with device,Varies with device,1000000000
3565,Google Drive,PRODUCTIVITY,4.4,2731211.0,Varies with device,"1,000,000,000+",Free,0,Everyone,Productivity,"August 6, 2018",Varies with device,Varies with device,1000000000
