In [3]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\tasnu\anaconda3\envs\pythondata\lib\site-packages (0.0)


In [4]:
# install joblib. This will be used to save our model. 
# Restart the kernel after installing 
!pip install joblib




## Import Libraries

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import mplcursors
import mpld3

## Read the CSV and Perform Basic Data Cleaning

In [6]:
# Read the file
df = pd.read_csv("googleplaystore.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [7]:
# Check the raw data type
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9360 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             9360 non-null   object 
 1   Category        9360 non-null   object 
 2   Rating          9360 non-null   float64
 3   Reviews         9360 non-null   object 
 4   Size            9360 non-null   object 
 5   Installs        9360 non-null   object 
 6   Type            9360 non-null   object 
 7   Price           9360 non-null   object 
 8   Content Rating  9360 non-null   object 
 9   Genres          9360 non-null   object 
 10  Last Updated    9360 non-null   object 
 11  Current Ver     9360 non-null   object 
 12  Android Ver     9360 non-null   object 
dtypes: float64(1), object(12)
memory usage: 1023.8+ KB


## Perform intense data cleaing 

In [8]:
# Get names of indexes for which column Size has value "Varies with device"
indexNames = df[ df['Size'] == 'Varies with device' ].index
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)

In [9]:
# Get rid of the non numeric values and symbols from the rows with numbers

df['Price'] = df['Price'].apply(lambda x: x.replace('$', '')
                                if isinstance(x, str) else x).astype(float)
df['Reviews'] = df['Reviews'].apply(lambda x: x.replace('M', '').replace(',', '').replace('+', '')
                                if isinstance(x, str) else x).astype(float) 
df['Size'] = df['Size'].apply(lambda x: x.replace('M', '').replace(',', '').replace('+', '').replace('k', '')
                                if isinstance(x, str) else x).astype(float) 
df['Installs'] = df['Installs'].apply(lambda x: x.replace('M', '').replace(',', '').replace('+', '')
                                if isinstance(x, str) else x).astype(float) 

In [10]:
# Import scikit library to evaluate the models

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # 0.3 error
from sklearn.model_selection import train_test_split

# Label encoding
lb_make = LabelEncoder()
# Create column for "numeric" Content Rating 
df["Content Rating NUM"] = lb_make.fit_transform(df["Content Rating"])
# Form dicitonary for Content Rating and numeric values 
dict_content_rating = {"Adults only 18+": 0, "Everyone": 1, "Everyone 10+": 2, "Mature 17+": 3, "Teen": 4}
# Numeric value for Content Rating
'''
Adults only 18+ = 0
Everyone = 1
Everyone 10+ = 2
Mature 17+ = 3
Teen = 4
'''

'\nAdults only 18+ = 0\nEveryone = 1\nEveryone 10+ = 2\nMature 17+ = 3\nTeen = 4\n'

In [11]:
# Change the Column headers
df = df.rename(columns = {'Content Rating':'ContentRating','Content Rating NUM':'ContentRatingNUM','Last Updated':'LastUpdated', 'Current Ver':'CurrentVer','Android Ver':'AndroidVer'})

In [12]:
# Check the data type after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7723 entries, 0 to 10840
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   App               7723 non-null   object 
 1   Category          7723 non-null   object 
 2   Rating            7723 non-null   float64
 3   Reviews           7723 non-null   float64
 4   Size              7723 non-null   float64
 5   Installs          7723 non-null   float64
 6   Type              7723 non-null   object 
 7   Price             7723 non-null   float64
 8   ContentRating     7723 non-null   object 
 9   Genres            7723 non-null   object 
 10  LastUpdated       7723 non-null   object 
 11  CurrentVer        7723 non-null   object 
 12  AndroidVer        7723 non-null   object 
 13  ContentRatingNUM  7723 non-null   int32  
dtypes: float64(5), int32(1), object(8)
memory usage: 874.9+ KB


In [13]:
#Check the final data frame 
df.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,ContentRating,Genres,LastUpdated,CurrentVer,AndroidVer,ContentRatingNUM
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000.0,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,1
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,1


## Create a Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size = 0.2)
print(train.shape)
print(test.shape)

(6178, 14)
(1545, 14)


In [15]:
# Train and test based on Type

train_X = train[['Rating','Reviews','Size','Installs','Price','ContentRatingNUM']]# taking the training data features
train_y = train.Type # output of our training data
test_X = test[['Rating','Reviews','Size','Installs','Price', 'ContentRatingNUM']] # taking test data features
test_y = test.Type   #output value of test data

In [16]:
train_X.head(2)

Unnamed: 0,Rating,Reviews,Size,Installs,Price,ContentRatingNUM
1669,4.3,10306.0,50.0,1000000.0,0.0,3
10529,5.0,1.0,3.5,10.0,0.0,1


In [17]:
test_X.head(2)

Unnamed: 0,Rating,Reviews,Size,Installs,Price,ContentRatingNUM
9922,3.7,98.0,9.7,10000.0,0.0,1
6761,4.4,48451.0,52.0,1000000.0,0.0,4


In [18]:
test.Type

9922    Free
6761    Free
4413    Paid
1436    Free
7402    Free
        ... 
6220    Free
3363    Free
4766    Free
6063    Free
3643    Free
Name: Type, Length: 1545, dtype: object

## Pre-processing

In [19]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler
X_scaler =MinMaxScaler().fit(train_X)
X_train_scaled =X_scaler.transform(train_X)
X_test_scaled =X_scaler.transform(test_X)

In [20]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
model1.fit(X_train_scaled, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
print (model1.score(X_train_scaled, train_y))
print (model1.score(X_test_scaled, test_y))


0.927484622855293
0.9262135922330097


## Hyperparameter Tuning
#Use GridSearchCV to tune the model's parameters

In [22]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid_1 = {'C': [0.01, 10.0, 50.0, 100.0], 'penalty':['l1','l2']}
grid1 = GridSearchCV(model1, param_grid_1)

In [1]:
#grid1.fit(X_train_scaled, train_y)

In [24]:
print(grid1.best_params_)
print(grid1.best_score_)

{'C': 100.0, 'penalty': 'l2'}
0.9407561285588878


## Save the model

In [27]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib


In [29]:
filename = 'google_model2.sav'
joblib.dump(model1, filename)

['google_model2.sav']