In [1]:
# import pandas library
import pandas as pd

In [2]:
# loading dataset
apps = pd.read_csv('./data/google-play-store-apps/googleplaystore.csv')
apps.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


# Overview

In [3]:
# number of rows and columns
apps.shape

(10841, 13)

In [4]:
# print columns
apps.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

# Cleanup (remove na and duplicate)

In [5]:
# check missing data
apps.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [6]:
apps.shape

(10841, 13)

In [7]:
# drop missing data
apps.dropna(how='any', inplace=True)
apps.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

In [13]:
# check number or data rows after drop na
apps.shape

(8886, 13)

In [14]:
# check number of duplicated
apps.duplicated().sum()

0

In [15]:
# drop duplicated
apps.drop_duplicates(inplace=True)
apps.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [16]:
# check data table after drop duplicated
apps.duplicated().sum()

0

# Rating

In [20]:
apps.Rating.describe()

count    8886.000000
mean        4.187959
std         0.522428
min         1.000000
25%         4.000000
50%         4.300000
75%         4.500000
max         5.000000
Name: Rating, dtype: float64

In [17]:
# most install apps
apps.sort_values(by='Installs', inplace=True, ascending=False)
apps.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4041,Temple Run 2,GAME,4.3,8116142,62M,"500,000,000+",Free,0,Everyone,Action,"July 5, 2018",1.49.1,4.0 and up
4122,LINE: Free Calls & Messages,COMMUNICATION,4.2,10790092,Varies with device,"500,000,000+",Free,0,Everyone,Communication,"July 26, 2018",Varies with device,Varies with device
4147,Gboard - the Google Keyboard,TOOLS,4.2,1855262,Varies with device,"500,000,000+",Free,0,Everyone,Tools,"July 31, 2018",Varies with device,Varies with device


In [33]:
# most expensive apps
# apps.sort_values(by='Price', inplace=True, ascending=True)
# apps.head(10)
apps_notfree = apps[apps['Type']=='Paid']
apps_notfree.sort_values(by='Price', inplace=True, ascending=False)
apps_notfree.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4938,Pocket AC,PHOTOGRAPHY,4.8,130,4.4M,"1,000+",Paid,$9.99,Everyone,Photography,"June 27, 2018",3.1.1,4.1 and up
6205,Baldur's Gate II,FAMILY,4.3,5442,16M,"50,000+",Paid,$9.99,Teen,Role Playing,"June 28, 2018",2.5.16.6,3.0 and up
8721,DRAGON QUEST III,FAMILY,4.2,1661,63M,"10,000+",Paid,$9.99,Everyone,Role Playing,"March 9, 2018",1.0.5,4.0 and up
8733,CHRONO TRIGGER (Upgrade Ver.),FAMILY,3.8,11250,6.3M,"100,000+",Paid,$9.99,Everyone 10+,Role Playing,"June 12, 2018",2.0.4,4.2 and up
7798,Jeppesen CR Flight Computer,MAPS_AND_NAVIGATION,3.3,3,26M,100+,Paid,$9.99,Everyone,Maps & Navigation,"October 5, 2016",1.0.8,4.4 and up
7893,Abdominal CT Sectional Walker,MEDICAL,,2,23M,100+,Paid,$9.99,Everyone,Medical,"July 8, 2016",1.1,4.0 and up
7898,Chest CT Sectional Walker,MEDICAL,,2,20M,100+,Paid,$9.99,Everyone,Medical,"July 9, 2016",1.1,4.0 and up
10006,XCOM®: Enemy Within,FAMILY,4.2,13752,21M,"100,000+",Paid,$9.99,Mature 17+,Strategy,"October 24, 2017",1.7.0,4.0 and up
2295,InfantRisk Center HCP,MEDICAL,2.6,41,14M,"1,000+",Paid,$9.99,Everyone,Medical,"May 6, 2015",1.3.4,2.3.3 and up
2401,Diabetes & Diet Tracker,MEDICAL,4.6,395,19M,"1,000+",Paid,$9.99,Everyone,Medical,"July 16, 2018",6.5.1,5.0 and up


In [34]:
# convert price to number
def price_to_number(x):
    return float(x.replace('$',''))

apps_notfree['Price'] = apps_notfree['Price'].apply(price_to_number)
apps_notfree['Price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


4938      9.99
6205      9.99
8721      9.99
8733      9.99
7798      9.99
7893      9.99
7898      9.99
10006     9.99
2295      9.99
2401      9.99
2289      9.99
2378      9.99
4697      9.99
9409      9.99
5475      9.99
2246      9.99
6180      9.99
9294      9.99
2259      9.99
3441      9.99
1838      9.99
2243      9.00
2366      9.00
9730     89.99
4085      8.99
6809      8.99
8364      8.99
10645     8.99
8211      8.99
5909      8.49
         ...  
5005      0.99
4988      0.99
4983      0.99
9592      0.99
7384      0.99
4971      0.99
7390      0.99
4842      0.99
7465      0.99
7466      0.99
4779      0.99
5257      0.99
5260      0.99
5263      0.99
5777      0.99
5834      0.99
9165      0.99
9170      0.99
9206      0.99
5812      0.99
5804      0.99
5767      0.99
5265      0.99
5660      0.99
9325      0.99
5557      0.99
5480      0.99
7377      0.99
8225      0.99
5269      0.99
Name: Price, Length: 800, dtype: float64

In [35]:
apps_notfree.sort_values(by='Price', inplace=True, ascending=False)
apps_notfree.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4367,I'm Rich - Trump Edition,LIFESTYLE,3.6,275,7.3M,"10,000+",Paid,400.0,Everyone,Lifestyle,"May 3, 2018",1.0.1,4.1 and up
5364,I am rich (Most expensive app),FINANCE,4.1,129,2.7M,"1,000+",Paid,399.99,Teen,Finance,"December 6, 2017",2,4.0.3 and up
5351,I am rich,LIFESTYLE,3.8,3547,1.8M,"100,000+",Paid,399.99,Everyone,Lifestyle,"January 12, 2018",2.0,4.0.3 and up


In [5]:
comments = pd.read_csv('./data/google-play-store-apps/googleplaystore_user_reviews.csv')
comments.head(3)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,


In [11]:
# number of category
apps.Category.unique

<bound method Series.unique of 10472                    1.9
7756            PRODUCTIVITY
6494                   TOOLS
6593                 MEDICAL
6604               LIFESTYLE
7258                 FINANCE
6605      HEALTH_AND_FITNESS
7245               LIFESTYLE
7754                  SPORTS
10697                   GAME
6646          FOOD_AND_DRINK
7243               LIFESTYLE
7514                   TOOLS
9705                  FAMILY
7731                BUSINESS
10690        PERSONALIZATION
6484      HEALTH_AND_FITNESS
7799                  FAMILY
7100               LIFESTYLE
9701               LIFESTYLE
6467                  FAMILY
7506                   TOOLS
9810            PRODUCTIVITY
6726                  FAMILY
7805                   TOOLS
6727           COMMUNICATION
9899               LIFESTYLE
7122      HEALTH_AND_FITNESS
7035                 MEDICAL
7127             PHOTOGRAPHY
                ...         
10746                  TOOLS
10748          COMMUNICATION
10751       

In [9]:
# best apps by rating
apps.sort_values(by='Rating', inplace=True, ascending=False)
apps.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,
7756,iReadMe,PRODUCTIVITY,5.0,8,22M,100+,Free,0,Everyone,Productivity,"March 6, 2018",1.5,4.4 and up
6494,BM speed test,TOOLS,5.0,1,3.7M,10+,Free,0,Everyone,Tools,"July 13, 2018",2.0,4.4 and up
