In [256]:
#importing library
import pandas as pd
import numpy as np
import plotly.express as ps
import re
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [257]:
#loading the data
df=pd.read_csv('../input/google-play-store-apps/googleplaystore.csv')
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [258]:
#getting information aboput the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


# Preprocessing

In [259]:
unneeded_column=['App','Current Ver','Android Ver']
df=df.drop(unneeded_column,axis=1)

In [260]:
#checking null values
df.isna().sum()

Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
dtype: int64

In [261]:
#checking the shape
df.shape

(10841, 10)

In [262]:
df['Rating']=df['Rating'].fillna(df['Rating'].mean())


In [263]:
#getting unique
def get_unique(df,columns):
    return {column: list(df[column].unique()) for column in columns}

#getting categorical column
def get_categorical_column(df):
    return [column for column in df.columns if df.dtypes[column]=='object']
    

In [264]:
#calling the function to get categorical data
get_categorical_column(df)

['Category',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated']

In [265]:
get_unique(df,['Installs'])

{'Installs': ['10,000+',
  '500,000+',
  '5,000,000+',
  '50,000,000+',
  '100,000+',
  '50,000+',
  '1,000,000+',
  '10,000,000+',
  '5,000+',
  '100,000,000+',
  '1,000,000,000+',
  '1,000+',
  '500,000,000+',
  '50+',
  '100+',
  '500+',
  '10+',
  '1+',
  '5+',
  '0+',
  '0',
  'Free']}

In [266]:
df[df['Category']=='1.9']

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated
10472,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19


In [267]:
#droping the rows with index values
df=df.drop(10472,axis=0).reset_index(drop=True)

In [268]:
#changing the column to float
df['Reviews']=df['Reviews'].astype(np.float)

In [269]:
(df['Size']=='Varies with device').sum()

1695

In [270]:
#converting the Size column into float
df['Size']=df['Size'].apply(lambda x: np.NaN if x=='Varies with device' else x)

In [271]:
df['Size']=df['Size'].apply(lambda x: np.float(x.replace('M',""))*1e6 if type(x)!=float and 'M' in x else x)

In [272]:
df['Size']=df['Size'].apply(lambda x: np.float(x.replace('k',""))*1e6 if type(x)!=float and 'k' in x else x)

In [273]:
df['Size'].astype(np.float)

0        19000000.0
1        14000000.0
2         8700000.0
3        25000000.0
4         2800000.0
            ...    
10835    53000000.0
10836     3600000.0
10837     9500000.0
10838           NaN
10839    19000000.0
Name: Size, Length: 10840, dtype: float64

In [274]:
df

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated
0,ART_AND_DESIGN,4.100000,159.0,19000000.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018"
1,ART_AND_DESIGN,3.900000,967.0,14000000.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018"
2,ART_AND_DESIGN,4.700000,87510.0,8700000.0,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018"
3,ART_AND_DESIGN,4.500000,215644.0,25000000.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018"
4,ART_AND_DESIGN,4.300000,967.0,2800000.0,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018"
...,...,...,...,...,...,...,...,...,...,...
10835,FAMILY,4.500000,38.0,53000000.0,"5,000+",Free,0,Everyone,Education,"July 25, 2017"
10836,FAMILY,5.000000,4.0,3600000.0,100+,Free,0,Everyone,Education,"July 6, 2018"
10837,MEDICAL,4.193338,3.0,9500000.0,"1,000+",Free,0,Everyone,Medical,"January 20, 2017"
10838,BOOKS_AND_REFERENCE,4.500000,114.0,,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015"


In [275]:
df['Installs']=df['Installs'].apply(lambda x:np.float(x.replace(',','').replace('+','')))

In [276]:
#converting Price column to float
df['Price']=df['Price'].apply(lambda x:np.float(x.replace('$','')))

In [277]:
get_unique(df,get_categorical_column(df))

{'Category': ['ART_AND_DESIGN',
  'AUTO_AND_VEHICLES',
  'BEAUTY',
  'BOOKS_AND_REFERENCE',
  'BUSINESS',
  'COMICS',
  'COMMUNICATION',
  'DATING',
  'EDUCATION',
  'ENTERTAINMENT',
  'EVENTS',
  'FINANCE',
  'FOOD_AND_DRINK',
  'HEALTH_AND_FITNESS',
  'HOUSE_AND_HOME',
  'LIBRARIES_AND_DEMO',
  'LIFESTYLE',
  'GAME',
  'FAMILY',
  'MEDICAL',
  'SOCIAL',
  'SHOPPING',
  'PHOTOGRAPHY',
  'SPORTS',
  'TRAVEL_AND_LOCAL',
  'TOOLS',
  'PERSONALIZATION',
  'PRODUCTIVITY',
  'PARENTING',
  'WEATHER',
  'VIDEO_PLAYERS',
  'NEWS_AND_MAGAZINES',
  'MAPS_AND_NAVIGATION'],
 'Type': ['Free', 'Paid', nan],
 'Content Rating': ['Everyone',
  'Teen',
  'Everyone 10+',
  'Mature 17+',
  'Adults only 18+',
  'Unrated'],
 'Genres': ['Art & Design',
  'Art & Design;Pretend Play',
  'Art & Design;Creativity',
  'Art & Design;Action & Adventure',
  'Auto & Vehicles',
  'Beauty',
  'Books & Reference',
  'Business',
  'Comics',
  'Comics;Creativity',
  'Communication',
  'Dating',
  'Education;Education',
 

In [278]:
df[df['Type'].isna()]

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated
9148,FAMILY,4.193338,0.0,,0.0,,0.0,Everyone 10+,Strategy,"June 28, 2018"


In [279]:
df=df.drop(9148,axis=0).reset_index(drop=True)

In [280]:
df.isna().sum()

Category             0
Rating               0
Reviews              0
Size              1694
Installs             0
Type                 0
Price                0
Content Rating       0
Genres               0
Last Updated         0
dtype: int64

In [281]:
#filling na value with mean in rating column
df['Rating']=df['Rating'].fillna(df['Rating'].mean())

In [282]:
#filling na values with mean in size column
df['Size']=df['Size'].fillna(df['Size'].mean())

In [283]:
#checking na values
df.isna().sum()

Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
dtype: int64

In [284]:
df

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated
0,ART_AND_DESIGN,4.100000,159.0,1.900000e+07,10000.0,Free,0.0,Everyone,Art & Design,"January 7, 2018"
1,ART_AND_DESIGN,3.900000,967.0,1.400000e+07,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018"
2,ART_AND_DESIGN,4.700000,87510.0,8.700000e+06,5000000.0,Free,0.0,Everyone,Art & Design,"August 1, 2018"
3,ART_AND_DESIGN,4.500000,215644.0,2.500000e+07,50000000.0,Free,0.0,Teen,Art & Design,"June 8, 2018"
4,ART_AND_DESIGN,4.300000,967.0,2.800000e+06,100000.0,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018"
...,...,...,...,...,...,...,...,...,...,...
10834,FAMILY,4.500000,38.0,5.300000e+07,5000.0,Free,0.0,Everyone,Education,"July 25, 2017"
10835,FAMILY,5.000000,4.0,3.600000e+06,100.0,Free,0.0,Everyone,Education,"July 6, 2018"
10836,MEDICAL,4.193338,3.0,9.500000e+06,1000.0,Free,0.0,Everyone,Medical,"January 20, 2017"
10837,BOOKS_AND_REFERENCE,4.500000,114.0,3.703544e+07,1000.0,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015"


In [285]:
#extracing month
df['Month']=df['Last Updated'].apply(lambda x: re.search('^[^\s]+',x).group(0))


In [286]:
#extracting year
df['Year']=df['Last Updated'].apply(lambda x: re.search('[^\s]+$',x).group(0))

In [287]:
df=df.drop('Last Updated',axis=1)

In [288]:
#changing data type to float
df['Year']=df['Year'].apply(lambda x: np.float(x))

In [289]:
df

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Month,Year
0,ART_AND_DESIGN,4.100000,159.0,1.900000e+07,10000.0,Free,0.0,Everyone,Art & Design,January,2018.0
1,ART_AND_DESIGN,3.900000,967.0,1.400000e+07,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,January,2018.0
2,ART_AND_DESIGN,4.700000,87510.0,8.700000e+06,5000000.0,Free,0.0,Everyone,Art & Design,August,2018.0
3,ART_AND_DESIGN,4.500000,215644.0,2.500000e+07,50000000.0,Free,0.0,Teen,Art & Design,June,2018.0
4,ART_AND_DESIGN,4.300000,967.0,2.800000e+06,100000.0,Free,0.0,Everyone,Art & Design;Creativity,June,2018.0
...,...,...,...,...,...,...,...,...,...,...,...
10834,FAMILY,4.500000,38.0,5.300000e+07,5000.0,Free,0.0,Everyone,Education,July,2017.0
10835,FAMILY,5.000000,4.0,3.600000e+06,100.0,Free,0.0,Everyone,Education,July,2018.0
10836,MEDICAL,4.193338,3.0,9.500000e+06,1000.0,Free,0.0,Everyone,Medical,January,2017.0
10837,BOOKS_AND_REFERENCE,4.500000,114.0,3.703544e+07,1000.0,Free,0.0,Mature 17+,Books & Reference,January,2015.0


In [290]:
#labeling category column with label encoder
label_encoder=LabelEncoder()
df['Category']=label_encoder.fit_transform(df['Category'])


In [291]:
category_mapping={index:label for index,label in enumerate(label_encoder.classes_)}
category_mapping

{0: 'ART_AND_DESIGN',
 1: 'AUTO_AND_VEHICLES',
 2: 'BEAUTY',
 3: 'BOOKS_AND_REFERENCE',
 4: 'BUSINESS',
 5: 'COMICS',
 6: 'COMMUNICATION',
 7: 'DATING',
 8: 'EDUCATION',
 9: 'ENTERTAINMENT',
 10: 'EVENTS',
 11: 'FAMILY',
 12: 'FINANCE',
 13: 'FOOD_AND_DRINK',
 14: 'GAME',
 15: 'HEALTH_AND_FITNESS',
 16: 'HOUSE_AND_HOME',
 17: 'LIBRARIES_AND_DEMO',
 18: 'LIFESTYLE',
 19: 'MAPS_AND_NAVIGATION',
 20: 'MEDICAL',
 21: 'NEWS_AND_MAGAZINES',
 22: 'PARENTING',
 23: 'PERSONALIZATION',
 24: 'PHOTOGRAPHY',
 25: 'PRODUCTIVITY',
 26: 'SHOPPING',
 27: 'SOCIAL',
 28: 'SPORTS',
 29: 'TOOLS',
 30: 'TRAVEL_AND_LOCAL',
 31: 'VIDEO_PLAYERS',
 32: 'WEATHER'}

In [292]:
def binary_encoder(df,column,positive_value):
    df=df.copy()
    df[column]=df[column].apply(lambda x:1 if x==positive_value else 0)
    return df

def ordinal_encode(df,column,ordering):
    df=df.copy()
    df[column]=df[column].apply(lambda x:ordering.index(x))
    return df

In [293]:
 install_ordering=[  '0',
  '0+',
  '1+',
  '5+',
  '10+',
  '50+',
  '100+',
  '500+',
  '1,000+',
  '5,000+',
    '10,000+',
       '50,000+',
       '100,000+',
       '500,000+',
      '1,000,000+',
       '5,000,000+',
  '10,000,000+',
  '50,000,000+',
  '100,000,000+',
       
  '500,000,000+',
       '1,000,000,000+'
  ]


In [294]:
print(install_ordering)

['0', '0+', '1+', '5+', '10+', '50+', '100+', '500+', '1,000+', '5,000+', '10,000+', '50,000+', '100,000+', '500,000+', '1,000,000+', '5,000,000+', '10,000,000+', '50,000,000+', '100,000,000+', '500,000,000+', '1,000,000,000+']


In [295]:
df['Content Rating'].unique()

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
       'Adults only 18+', 'Unrated'], dtype=object)

In [296]:
rating_order=['Everyone','Everyone 10+','Teen','Mature 17+',
             'Adults only 18+','Unrated']

In [297]:
df['Month'].unique()

array(['January', 'August', 'June', 'March', 'April', 'September', 'July',
       'October', 'November', 'May', 'December', 'February'], dtype=object)

In [298]:
month_ordering=['January','February','March','April',
               'May','June','July','August','September',
                'October','November','December']

In [299]:
df['Type'].unique()

array(['Free', 'Paid'], dtype=object)

In [300]:
df=binary_encode(df,'Type','Paid')

NameError: name 'binary_encode' is not defined

In [None]:
df=ordinal_encode(df,'Month',month_ordering)

In [None]:
df=ordinal_encode(df,'Content Rating',rating_order)

In [None]:
df['Installs'].unique()

In [None]:
df

In [None]:
df=df.drop('Genres',axis=1)

In [None]:
df

In [None]:
y=df['Category']
x=df.drop('Category',axis=1)

In [None]:
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)

In [None]:
x

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)

In [None]:
x.shape

In [None]:
inputs=tf.keras.Input(shape=(9,))
x=tf.keras.layers.Dense(64,activation='relu')(inputs)
x=tf.keras.layers.Dense(64,activation='relu')(x)
outputs=tf.keras.layers.Dense(33,activation='softmax')(x)
model=tf.keras.Model(inputs=inputs,outputs=outputs)
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
batch_size=64
epochs=100
history=model.fit(
x_train,
y_train,
validation_split=0.2,
batch_size=batch_size,
epochs=epochs,
callbacks=[tf.keras.callbacks.ReduceLROnPlateau()],
verbose=0)

In [301]:
model.evaluate(x_test,y_test)



[2.70094633102417, 0.24938499927520752]