# Predicting the success rate of a Google Play Store App based on ratings

## Importing the necessary libraries

In [None]:
import numpy as np 
import pandas as pd 

## Importing the dataset

In [None]:
from google.colab import files
uploaded = files.upload()

# Read the comma-separated dataset file into the working DataFrame.
df = pd.read_csv("GooglePlayStore_Apps.csv", delimiter=',')

# Store the DataFrame back into a CSV file of the same name.
df.to_csv('GooglePlayStore_Apps.csv', index=None)

In [None]:
df.head() # Show the first five rows to get a feel for what the dataset looks like.

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Rating_Text,Review_Text,Size,Type,URL
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,1,"Looks like a good app but I can't get into it,...",19M,Free,https://play.google.com/store/apps/details?id=...
1,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,2,"The only thing I hate about this app is ,whene...",19M,Free,https://play.google.com/store/apps/details?id=...
2,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,1,It's a very bad app after I installed the app ...,19M,Free,https://play.google.com/store/apps/details?id=...
3,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,4,"It's really fun to use, but it keeps pestering...",19M,Free,https://play.google.com/store/apps/details?id=...
4,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,4,Resourceful tool to install and carry along yo...,19M,Free,https://play.google.com/store/apps/details?id=...


In [None]:
df.shape  # (number of rows, number of columns)

(180437, 17)

It contains 180437 rows and 17 columns.

## Data Processing

In [None]:
# Cast every review to str so TextBlob always receives text
# (missing reviews become the literal string 'nan').
df['Review_Text'] = df['Review_Text'].astype(str)

#Finding the polarity and subjectivity of user reviews

from textblob import TextBlob

# Score every review once and build a single frame from plain tuples.
# The previous version created one pd.Series per row inside apply(), which is
# very slow on a dataset of this size; the resulting values are the same.
review_scores = pd.DataFrame(
    [tuple(TextBlob(text).sentiment) for text in df['Review_Text']],
    index=df.index,
    columns=['Polarity', 'Subjectivity'],
)
df[['Polarity', 'Subjectivity']] = review_scores

In [None]:
# Preview the frame again; it now also carries the Polarity and Subjectivity columns.
df.head()

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Rating_Text,Review_Text,Size,Type,URL,Polarity,Subjectivity
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,1,"Looks like a good app but I can't get into it,...",19M,Free,https://play.google.com/store/apps/details?id=...,0.55,0.55
1,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,2,"The only thing I hate about this app is ,whene...",19M,Free,https://play.google.com/store/apps/details?id=...,-0.266667,0.633333
2,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,1,It's a very bad app after I installed the app ...,19M,Free,https://play.google.com/store/apps/details?id=...,-0.405,0.433333
3,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,4,"It's really fun to use, but it keeps pestering...",19M,Free,https://play.google.com/store/apps/details?id=...,0.3,0.2
4,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,"10,000+","January 7, 2018",683,4.2,50,19,94,120,400,4,Resourceful tool to install and carry along yo...,19M,Free,https://play.google.com/store/apps/details?id=...,0.6,0.9


In [None]:
#Cleaning and converting into integer
# 'Installs' looks like "10,000+" and the count columns may carry thousands
# separators. Strip ',' and '+' and cast to int. Going through astype(str)
# means the cell also works when pandas already parsed a column as numbers and
# when the cell is re-run (the old per-element slicing i[:] / i[:-1] raised a
# TypeError on non-string values and blindly dropped the last character).
count_columns = [
    'Installs', 'No_of_Ratings',
    'Rating_1', 'Rating_2', 'Rating_3', 'Rating_4', 'Rating_5',
]
for column in count_columns:
    df[column] = (
        df[column]
        .astype(str)
        .str.replace(r'[,+]', '', regex=True)
        .astype(int)
    )

df.head()

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Rating_Text,Review_Text,Size,Type,URL,Polarity,Subjectivity
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,1,"Looks like a good app but I can't get into it,...",19M,Free,https://play.google.com/store/apps/details?id=...,0.55,0.55
1,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,2,"The only thing I hate about this app is ,whene...",19M,Free,https://play.google.com/store/apps/details?id=...,-0.266667,0.633333
2,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,1,It's a very bad app after I installed the app ...,19M,Free,https://play.google.com/store/apps/details?id=...,-0.405,0.433333
3,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,4,"It's really fun to use, but it keeps pestering...",19M,Free,https://play.google.com/store/apps/details?id=...,0.3,0.2
4,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,4,Resourceful tool to install and carry along yo...,19M,Free,https://play.google.com/store/apps/details?id=...,0.6,0.9


In [None]:
#Grouping similar content rating values
df['Content_Rating'] = df['Content_Rating'].astype(str)

# Any rating text that contains one of these labels is collapsed to the label
# itself. The labels are applied in this order, so a value matching several of
# them keeps the first match.
rating_labels = ['Everyone', 'Teen', 'Mature 17+', 'Adults', 'Unrated']
for label in rating_labels:
    # regex=False: match the label literally. As a regular expression the '+'
    # in 'Mature 17+' is a quantifier, so the old call only matched by accident.
    matches = df['Content_Rating'].str.contains(label, regex=False)
    df.loc[matches, 'Content_Rating'] = label

df.head()

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Rating_Text,Review_Text,Size,Type,URL,Polarity,Subjectivity
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,1,"Looks like a good app but I can't get into it,...",19M,Free,https://play.google.com/store/apps/details?id=...,0.55,0.55
1,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,2,"The only thing I hate about this app is ,whene...",19M,Free,https://play.google.com/store/apps/details?id=...,-0.266667,0.633333
2,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,1,It's a very bad app after I installed the app ...,19M,Free,https://play.google.com/store/apps/details?id=...,-0.405,0.433333
3,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,4,"It's really fun to use, but it keeps pestering...",19M,Free,https://play.google.com/store/apps/details?id=...,0.3,0.2
4,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,4,Resourceful tool to install and carry along yo...,19M,Free,https://play.google.com/store/apps/details?id=...,0.6,0.9


In [None]:
#Calculating the median and mean values of all reviews

# (new column, source column, aggregation) - each aggregate is computed per app
# and broadcast back onto every review row of that app via transform().
per_app_aggregates = [
    ('Sentiment_Median', 'Polarity', 'median'),
    ('Subjectivity_Mean', 'Subjectivity', 'mean'),
    ('Rating_Mean', 'Rating_Text', 'mean'),
]
for new_column, source_column, how in per_app_aggregates:
    df[new_column] = df.groupby('App')[source_column].transform(how)

df.head()

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Rating_Text,Review_Text,Size,Type,URL,Polarity,Subjectivity,Sentiment_Median,Subjectivity_Mean,Rating_Mean
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,1,"Looks like a good app but I can't get into it,...",19M,Free,https://play.google.com/store/apps/details?id=...,0.55,0.55,0.0,0.369962,3.35
1,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,2,"The only thing I hate about this app is ,whene...",19M,Free,https://play.google.com/store/apps/details?id=...,-0.266667,0.633333,0.0,0.369962,3.35
2,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,1,It's a very bad app after I installed the app ...,19M,Free,https://play.google.com/store/apps/details?id=...,-0.405,0.433333,0.0,0.369962,3.35
3,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,4,"It's really fun to use, but it keeps pestering...",19M,Free,https://play.google.com/store/apps/details?id=...,0.3,0.2,0.0,0.369962,3.35
4,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,4,Resourceful tool to install and carry along yo...,19M,Free,https://play.google.com/store/apps/details?id=...,0.6,0.9,0.0,0.369962,3.35


In [None]:
#Keeping only the first row of each app since the others are no longer required
# The per-app aggregates are repeated on every row of an app, so the first row
# already carries them. Rebind the result instead of using inplace=True so the
# cell does not mutate the frame object in place.

df = df.drop_duplicates(subset='App', keep="first")
df.head()

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Rating_Text,Review_Text,Size,Type,URL,Polarity,Subjectivity,Sentiment_Median,Subjectivity_Mean,Rating_Mean
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,1,"Looks like a good app but I can't get into it,...",19M,Free,https://play.google.com/store/apps/details?id=...,0.55,0.55,0.0,0.369962,3.35
40,Coloring book moana,Art & Design,Everyone,1000000,"January 15, 2018",1780,3.9,338,69,120,146,1107,3,To many adds in a kids game knowing that if yo...,14M,Free,https://play.google.com/store/apps/details?id=...,-0.16,0.34,0.0,0.546181,3.225
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,Everyone,5000000,"November 1, 2018",119937,4.7,2986,1255,6131,12838,96727,5,I installed and uninstalled several launcher a...,8.9M,Free,https://play.google.com/store/apps/details?id=...,0.32,0.42,0.179167,0.405565,4.275
120,Sketch - Draw & Paint,Art & Design,Teen,100000000,"November 6, 2018",238616,4.5,13024,4688,13383,29660,177861,1,5 stars when I can export photos at the same q...,32M,Free,https://play.google.com/store/apps/details?id=...,0.0,0.125,0.227976,0.445565,3.975
160,Pixel Draw - Number Art Coloring Book,Art & Design,Everyone,500000,"September 22, 2018",2127,4.4,206,39,75,229,1578,4,I havent had any notificacations yet thats why...,3.6M,Free,https://play.google.com/store/apps/details?id=...,0.65,0.775,0.416667,0.507247,4.7


In [None]:
#Dropping the columns that are no longer required as we have used the Polarity and subjectivity to calculate the median and mean values
review_level_columns = ['Rating_Text', 'Review_Text', 'Polarity', 'Subjectivity', 'URL']
df = df.drop(columns=review_level_columns)
df.head()

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Size,Type,Sentiment_Median,Subjectivity_Mean,Rating_Mean
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,19M,Free,0.0,0.369962,3.35
40,Coloring book moana,Art & Design,Everyone,1000000,"January 15, 2018",1780,3.9,338,69,120,146,1107,14M,Free,0.0,0.546181,3.225
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,Everyone,5000000,"November 1, 2018",119937,4.7,2986,1255,6131,12838,96727,8.9M,Free,0.179167,0.405565,4.275
120,Sketch - Draw & Paint,Art & Design,Teen,100000000,"November 6, 2018",238616,4.5,13024,4688,13383,29660,177861,32M,Free,0.227976,0.445565,3.975
160,Pixel Draw - Number Art Coloring Book,Art & Design,Everyone,500000,"September 22, 2018",2127,4.4,206,39,75,229,1578,3.6M,Free,0.416667,0.507247,4.7


### Making the necessary changes to the column values so that the dataset is consistent

In [None]:
#converting size to bytes

# Rewrite the unit suffixes as exponents: 'k' becomes 'e+3' and 'M' becomes 'e+6'.
size_text = df['Size'].str.replace('k', 'e+3')
df['Size'] = size_text.str.replace('M', 'e+6')

In [None]:
def is_convertable(v):
    """Return True if ``v`` can be converted with ``float()``, else False.

    Both ``ValueError`` (e.g. a non-numeric string) and ``TypeError``
    (e.g. ``None`` or a list) count as "not convertible", so the helper
    never raises on unexpected cell values.
    """
    try:
        float(v)
        return True
    except (ValueError, TypeError):
        return False
    
# Boolean flag per row: does the Size value parse as a float?
# NOTE(review): `temp` is not referenced again in the visible notebook.
temp = df['Size'].apply(is_convertable)

In [None]:
# Treat the placeholder text 'Varies with device' as a missing size.
df['Size'] = df['Size'].replace('Varies with device', np.nan)

In [None]:
# Parse the Size values into numbers.
df['Size'] = pd.to_numeric(df['Size'])

In [None]:
# Fill the missing sizes (the former 'Varies with device' entries) with the mean size.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated in recent pandas and does
# not update `df` when Copy-on-Write is enabled.
df['Size'] = df['Size'].fillna(df['Size'].mean())

In [None]:
# Scale by 0.000001 to express the size in megabytes again.
# Running this cell more than once scales the column repeatedly.
df['Size'] = df['Size'] * 0.000001

In [None]:
# Preview the frame after the Size column has been converted to numbers.
df.head()

Unnamed: 0,App,Category,Content_Rating,Installs,Last_Updated,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Size,Type,Sentiment_Median,Subjectivity_Mean,Rating_Mean
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,Everyone,10000,"January 7, 2018",683,4.2,50,19,94,120,400,19.0,Free,0.0,0.369962,3.35
40,Coloring book moana,Art & Design,Everyone,1000000,"January 15, 2018",1780,3.9,338,69,120,146,1107,14.0,Free,0.0,0.546181,3.225
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,Everyone,5000000,"November 1, 2018",119937,4.7,2986,1255,6131,12838,96727,8.9,Free,0.179167,0.405565,4.275
120,Sketch - Draw & Paint,Art & Design,Teen,100000000,"November 6, 2018",238616,4.5,13024,4688,13383,29660,177861,32.0,Free,0.227976,0.445565,3.975
160,Pixel Draw - Number Art Coloring Book,Art & Design,Everyone,500000,"September 22, 2018",2127,4.4,206,39,75,229,1578,3.6,Free,0.416667,0.507247,4.7


In [None]:
#label encoding

# For each column: cast to pandas' category dtype, then store the integer
# category codes in a sibling column named '<column>.cat'.
# NOTE(review): for Last_Updated the codes follow the ordering of the date
# *strings*, not chronological order (in the preview below "January 15, 2018"
# gets a lower code than "January 7, 2018") - confirm this is intended.
for column in ["Category", "Content_Rating", "Type", "Last_Updated"]:
    df[column] = df[column].astype('category')
    df[column + ".cat"] = df[column].cat.codes


In [None]:
# The original text columns are replaced by their '.cat' code columns.
encoded_sources = ['Category', 'Content_Rating', 'Type', 'Last_Updated']
df = df.drop(columns=encoded_sources)
df.head()

Unnamed: 0,App,Installs,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Size,Sentiment_Median,Subjectivity_Mean,Rating_Mean,Category.cat,Content_Rating.cat,Type.cat,Last_Updated.cat
0,Photo Editor & Candy Camera & Grid & ScrapBook,10000,683,4.2,50,19,94,120,400,19.0,0.0,0.369962,3.35,3,1,0,393
40,Coloring book moana,1000000,1780,3.9,338,69,120,146,1107,14.0,0.0,0.546181,3.225,3,1,0,337
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",5000000,119937,4.7,2986,1255,6131,12838,96727,8.9,0.179167,0.405565,4.275,3,1,0,735
120,Sketch - Draw & Paint,100000000,238616,4.5,13024,4688,13383,29660,177861,32.0,0.227976,0.445565,3.975,3,3,0,804
160,Pixel Draw - Number Art Coloring Book,500000,2127,4.4,206,39,75,229,1578,3.6,0.416667,0.507247,4.7,3,1,0,969


## Defining the success equation for the app to be classified as successful

In [None]:
#Defining target class i.e success
# An app is marked successful (Success = 1) when at least 75% of its ratings
# are 4 or 5 stars and it has at least 50000 installs. Other rows are not
# assigned here.
high_rating_share = (df['Rating_4'] + df['Rating_5']) / df['No_of_Ratings']
is_successful = (high_rating_share >= 0.75) & (df['Installs'] >= 50000)
df.loc[is_successful, 'Success'] = 1

In [None]:
# Rows that did not meet the success criterion have no value yet; label them 0.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated in recent pandas and does
# not update `df` when Copy-on-Write is enabled.
df['Success'] = df['Success'].fillna(0)
df.head(10)

Unnamed: 0,App,Installs,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Size,Sentiment_Median,Subjectivity_Mean,Rating_Mean,Category.cat,Content_Rating.cat,Type.cat,Last_Updated.cat,Success
0,Photo Editor & Candy Camera & Grid & ScrapBook,10000,683,4.2,50,19,94,120,400,19.0,0.0,0.369962,3.35,3,1,0,393,0.0
40,Coloring book moana,1000000,1780,3.9,338,69,120,146,1107,14.0,0.0,0.546181,3.225,3,1,0,337,0.0
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",5000000,119937,4.7,2986,1255,6131,12838,96727,8.9,0.179167,0.405565,4.275,3,1,0,735,1.0
120,Sketch - Draw & Paint,100000000,238616,4.5,13024,4688,13383,29660,177861,32.0,0.227976,0.445565,3.975,3,3,0,804,1.0
160,Pixel Draw - Number Art Coloring Book,500000,2127,4.4,206,39,75,229,1578,3.6,0.416667,0.507247,4.7,3,1,0,969,1.0
200,DIY Simple Paper Flower,5000,18,4.3,1,2,0,3,12,4.0,0.6,0.8,4.6,17,1,0,5,0.0
205,I Smoke Effect Photo Editor 2017 (New),500000,1298,4.2,154,38,86,98,922,8.7,0.0,0.477333,3.075,32,2,0,1010,1.0
245,Infinite Painter,1000000,38955,4.1,4450,1839,3503,6100,23063,25.20571,0.305152,0.496305,3.8,3,1,0,816,0.0
285,Garden Coloring Book,1000000,14835,4.4,915,406,1061,2374,10079,33.0,0.174336,0.586526,3.825,3,1,0,962,1.0
325,Kids Paint Free - Drawing Fun,50000,179,4.5,11,7,8,8,145,2.3,0.513958,0.643693,4.7,3,1,0,1005,1.0


In [None]:
df['Type.cat'].value_counts() # Number of apps per type code: free apps are coded 0, paid apps 1

0    5111
1     257
Name: Type.cat, dtype: int64

In [None]:
df['Content_Rating.cat'].value_counts() # Number of apps per content-rating code; 'Everyone' is encoded as 1

1    4454
3     655
2     254
5       3
4       1
0       1
Name: Content_Rating.cat, dtype: int64

In [None]:
len(df['Success'].unique()) # Number of distinct target values: binary class, 1 = successful, 0 = not successful

2

In [None]:
# Preview the frame, which now includes the Success target column.
df.head()

Unnamed: 0,App,Installs,No_of_Ratings,Rating,Rating_1,Rating_2,Rating_3,Rating_4,Rating_5,Size,Sentiment_Median,Subjectivity_Mean,Rating_Mean,Category.cat,Content_Rating.cat,Type.cat,Last_Updated.cat,Success
0,Photo Editor & Candy Camera & Grid & ScrapBook,10000,683,4.2,50,19,94,120,400,19.0,0.0,0.369962,3.35,3,1,0,393,0.0
40,Coloring book moana,1000000,1780,3.9,338,69,120,146,1107,14.0,0.0,0.546181,3.225,3,1,0,337,0.0
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",5000000,119937,4.7,2986,1255,6131,12838,96727,8.9,0.179167,0.405565,4.275,3,1,0,735,1.0
120,Sketch - Draw & Paint,100000000,238616,4.5,13024,4688,13383,29660,177861,32.0,0.227976,0.445565,3.975,3,3,0,804,1.0
160,Pixel Draw - Number Art Coloring Book,500000,2127,4.4,206,39,75,229,1578,3.6,0.416667,0.507247,4.7,3,1,0,969,1.0


In [None]:
#dropping the attributes not used for prediction
unused_attributes = [
    'Installs', 'Rating',
    'Rating_1', 'Rating_2', 'Rating_3', 'Rating_4', 'Rating_5',
]
df = df.drop(columns=unused_attributes)

In [None]:
# Preview the columns that remain after the drop.
df.head()

Unnamed: 0,App,No_of_Ratings,Size,Sentiment_Median,Subjectivity_Mean,Rating_Mean,Category.cat,Content_Rating.cat,Type.cat,Last_Updated.cat,Success
0,Photo Editor & Candy Camera & Grid & ScrapBook,683,19.0,0.0,0.369962,3.35,3,1,0,393,0.0
40,Coloring book moana,1780,14.0,0.0,0.546181,3.225,3,1,0,337,0.0
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",119937,8.9,0.179167,0.405565,4.275,3,1,0,735,1.0
120,Sketch - Draw & Paint,238616,32.0,0.227976,0.445565,3.975,3,3,0,804,1.0
160,Pixel Draw - Number Art Coloring Book,2127,3.6,0.416667,0.507247,4.7,3,1,0,969,1.0


In [None]:
# Number of whitespace-separated tokens in the app name.
# str.split() without an argument splits on runs of whitespace, so repeated,
# leading or trailing spaces no longer add empty "words" to the count the way
# split(' ') did.
df['Word_in_AppName'] = df['App'].astype(str).str.split().str.len()
df.head()

Unnamed: 0,App,No_of_Ratings,Size,Sentiment_Median,Subjectivity_Mean,Rating_Mean,Category.cat,Content_Rating.cat,Type.cat,Last_Updated.cat,Success,Word_in_AppName
0,Photo Editor & Candy Camera & Grid & ScrapBook,683,19.0,0.0,0.369962,3.35,3,1,0,393,0.0,9
40,Coloring book moana,1780,14.0,0.0,0.546181,3.225,3,1,0,337,0.0,3
80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",119937,8.9,0.179167,0.405565,4.275,3,1,0,735,1.0,10
120,Sketch - Draw & Paint,238616,32.0,0.227976,0.445565,3.975,3,3,0,804,1.0,5
160,Pixel Draw - Number Art Coloring Book,2127,3.6,0.416667,0.507247,4.7,3,1,0,969,1.0,7


In [None]:
# Checkpoint: persist the processed frame to disk.
# No `index` argument is passed, and the row index shows up as an
# 'Unnamed: 0' column when the file is read back in.
df.to_csv('GooglePlayStore_data_processed.csv')

## Working with the new processed data to apply the machine learning algorithms and generate the output

In [None]:
# The modelling part works from the processed checkpoint file rather than the
# frame built above, so the two stages do not interfere with each other.
# Read the file through the same relative path the checkpoint was written to,
# instead of a hard-coded absolute '/content/...' path that only exists when
# the working directory is /content.
df = pd.read_csv('GooglePlayStore_data_processed.csv')

In [None]:
# Inspect the reloaded checkpoint.
df.head()

Unnamed: 0.1,Unnamed: 0,App,No_of_Ratings,Size,Sentiment_Median,Subjectivity_Mean,Rating_Mean,Category.cat,Content_Rating.cat,Type.cat,Last_Updated.cat,Success,Word_in_AppName
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,683,19.0,0.0,0.369962,3.35,3,1,0,393,0.0,9
1,40,Coloring book moana,1780,14.0,0.0,0.546181,3.225,3,1,0,337,0.0,3
2,80,"U Launcher Lite – FREE Live Cool Themes, Hide ...",119937,8.9,0.179167,0.405565,4.275,3,1,0,735,1.0,10
3,120,Sketch - Draw & Paint,238616,32.0,0.227976,0.445565,3.975,3,3,0,804,1.0,5
4,160,Pixel Draw - Number Art Coloring Book,2127,3.6,0.416667,0.507247,4.7,3,1,0,969,1.0,7


In [None]:
# 'Unnamed: 0' holds the row index that was saved with the checkpoint; remove it.
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,App,No_of_Ratings,Size,Sentiment_Median,Subjectivity_Mean,Rating_Mean,Category.cat,Content_Rating.cat,Type.cat,Last_Updated.cat,Success,Word_in_AppName
0,Photo Editor & Candy Camera & Grid & ScrapBook,683,19.0,0.0,0.369962,3.35,3,1,0,393,0.0,9
1,Coloring book moana,1780,14.0,0.0,0.546181,3.225,3,1,0,337,0.0,3
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",119937,8.9,0.179167,0.405565,4.275,3,1,0,735,1.0,10
3,Sketch - Draw & Paint,238616,32.0,0.227976,0.445565,3.975,3,3,0,804,1.0,5
4,Pixel Draw - Number Art Coloring Book,2127,3.6,0.416667,0.507247,4.7,3,1,0,969,1.0,7


## Defining X and Y value, splitting data for training and testing

In [None]:
from sklearn import tree
import numpy as np
import pandas as pd


# Features: every column except the target ('Success') and the app name.
# `columns=` is used because passing the axis positionally (df.drop([...], 1))
# is no longer accepted by pandas 2.x.
X = np.array(df.drop(columns=['Success', 'App']))
# Target: the binary Success label.
y = np.array(df['Success'])

### Splitting the dataset into the Training set and Test set

---



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# Report the shape of each part of the split.
for label, part in (
    ("X_train : ", X_train),
    ("X_test : ", X_test),
    ("y_train : ", y_train),
    ("y_test : ", y_test),
):
    print(label, part.shape)

X_train :  (3757, 10)
X_test :  (1611, 10)
y_train :  (3757,)
y_test :  (1611,)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Implementing the machine learning algorithms and computing the best model

#### Random Forest Classifier model

In [None]:
#Training the Random Forest Classifier model on the Training set

from sklearn.ensemble import RandomForestClassifier

# 10 trees, entropy split criterion, fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10, criterion='entropy', random_state=0
)
classifier.fit(X_train, y_train)

# Print predicted labels (left column) next to the true test labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[1. 1.]
 [1. 1.]
 [1. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 1.]]


In [None]:
#Predicting the Test set results
y_pred = classifier.predict(X_test)
# Left column: predicted label, right column: true label.
print(np.column_stack((y_pred, y_test)))

[[1. 1.]
 [1. 1.]
 [1. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 1.]]


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix on the test set; each row sums to the number of true
# samples of that class (compare with the `support` column of the report).
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression of the cell, so the test accuracy is shown as the cell output.
accuracy_score(y_test, y_pred)

[[412 151]
 [164 884]]


0.8044692737430168

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test set; no target_names are
# passed, so the classes are labelled 0.0 and 1.0.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.72      0.73      0.72       563
         1.0       0.85      0.84      0.85      1048

    accuracy                           0.80      1611
   macro avg       0.78      0.79      0.79      1611
weighted avg       0.81      0.80      0.80      1611



In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training set.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
mean_accuracy_pct = accuracies.mean() * 100
std_accuracy_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

Accuracy: 80.44 %
Standard Deviation: 2.03 %


#### K-Nearest Neighbors (K-NN)

In [None]:
#Training the K-NN model on the Training set
from sklearn.neighbors import KNeighborsClassifier
# 5 neighbours, Minkowski metric with p = 2 (i.e. Euclidean distance).
# `classifier` is rebound here, replacing the previously fitted model.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [None]:
#Predicting the Test set results
y_pred = classifier.predict(X_test)
# Left column: predicted label, right column: true label.
print(np.column_stack((y_pred, y_test)))

[[1. 1.]
 [1. 1.]
 [1. 1.]
 ...
 [1. 1.]
 [1. 1.]
 [0. 1.]]


In [None]:
#Making the confusion matrix

from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the K-NN predictions on the test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression of the cell, so the test accuracy is shown as the cell output.
accuracy_score(y_test, y_pred)

[[249 314]
 [185 863]]


0.6902545003103663

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test set; no target_names are
# passed, so the classes are labelled 0.0 and 1.0.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.57      0.44      0.50       563
         1.0       0.73      0.82      0.78      1048

    accuracy                           0.69      1611
   macro avg       0.65      0.63      0.64      1611
weighted avg       0.68      0.69      0.68      1611



In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training set.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
mean_accuracy_pct = accuracies.mean() * 100
std_accuracy_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

Accuracy: 71.01 %
Standard Deviation: 1.19 %


#### Kernel SVM

In [None]:
#Training the Kernel SVM model on the Training set
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel and a fixed seed.
# `classifier` is rebound here, replacing the previously fitted model.
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(random_state=0)

In [None]:
#Predicting the Test set results
y_pred = classifier.predict(X_test)
# Left column: predicted label, right column: true label.
print(np.column_stack((y_pred, y_test)))

[[1. 1.]
 [1. 1.]
 [1. 1.]
 ...
 [1. 1.]
 [1. 1.]
 [0. 1.]]


In [None]:
#Making the confusion matrix

from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the kernel SVM predictions on the test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression of the cell, so the test accuracy is shown as the cell output.
accuracy_score(y_test, y_pred)

[[233 330]
 [109 939]]


0.7274984481688392

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test set; no target_names are
# passed, so the classes are labelled 0.0 and 1.0.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.68      0.41      0.51       563
         1.0       0.74      0.90      0.81      1048

    accuracy                           0.73      1611
   macro avg       0.71      0.65      0.66      1611
weighted avg       0.72      0.73      0.71      1611



In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training set.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
mean_accuracy_pct = accuracies.mean() * 100
std_accuracy_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

Accuracy: 72.93 %
Standard Deviation: 2.08 %


#### XG Boost


In [None]:
#Training XGBoost on the Training set
from xgboost import XGBClassifier
# Constructed without arguments, i.e. with the library's default settings.
# `classifier` is rebound here, replacing the previously fitted model.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

XGBClassifier()

In [None]:
#Predicting the Test set results
y_pred = classifier.predict(X_test)
# Left column: predicted label, right column: true label.
print(np.column_stack((y_pred, y_test)))

[[1. 1.]
 [1. 1.]
 [1. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [None]:
#Making the confusion matrix

from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the XGBoost predictions on the test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression of the cell, so the test accuracy is shown as the cell output.
accuracy_score(y_test, y_pred)

[[394 169]
 [133 915]]


0.8125387957790192

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test set; no target_names are
# passed, so the classes are labelled 0.0 and 1.0.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.75      0.70      0.72       563
         1.0       0.84      0.87      0.86      1048

    accuracy                           0.81      1611
   macro avg       0.80      0.79      0.79      1611
weighted avg       0.81      0.81      0.81      1611



In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current classifier on the training set.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
mean_accuracy_pct = accuracies.mean() * 100
std_accuracy_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

Accuracy: 82.51 %
Standard Deviation: 1.77 %
