## Day 30 Lecture 2 Assignment

In this assignment, we will learn about random forests. We will use the google play store dataset loaded below.

In [46]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [47]:
reviews = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/googleplaystore.csv')

reviews

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In this assignment, you will work more independently. Perform the following steps:
    
1. Select which columns are best suited to predict whether the rating is above 4.5
2. Process the data (including transforming to the correct column type, removing missing values, creating dummy variables, and removing irrelevant variables)
3. Create a random forest model and evaluate
4. Using grid search cross validation, tweak the parameters to produce a better performing model
5. Show and discuss your results

Good luck!

In [48]:
rev = reviews.drop(columns=['App', 'Category', 'Size', 'Installs', 'Last Updated', 'Current Ver', 'Genres', 'Android Ver'], axis=1)
rev = rev.dropna()
type_dummies = pd.get_dummies(rev['Type'], drop_first=True)
content_dummies = pd.get_dummies(rev['Content Rating'], drop_first=True)
data = rev.drop(columns=['Type', 'Content Rating'], axis=1)
clean_data = pd.concat([data, type_dummies, content_dummies], axis=1)
clean_data['Reviews'] = clean_data['Reviews'].apply(lambda x: float(x))
def money_to_float(money):
  if money[0] == '$':
    money = money[1::]
  return float(money)
clean_data['Price'] = clean_data['Price'].apply(lambda x: money_to_float(x))
clean_data = clean_data[clean_data['Rating'] <=5.0]
clean_data['Over45'] = pd.cut(clean_data['Rating'], [0, 4.5, 100000], labels=[0, 1])

In [49]:
clean_data = clean_data.drop('Rating', axis=1)

In [50]:
X = clean_data.drop('Over45', axis=1)
y = clean_data['Over45']


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [51]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [52]:
rf.score(X_train, y_train)

0.9327282434596903

In [53]:
rf.score(X_test, y_test)

0.7753468516542156

In [54]:
y_test_pred = rf.predict(X_test)

In [55]:
confusion_matrix(y_test, y_test_pred)

array([[1343,  186],
       [ 235,  110]])

In [24]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1529
           1       0.38      0.34      0.36       345

    accuracy                           0.78      1874
   macro avg       0.62      0.61      0.61      1874
weighted avg       0.77      0.78      0.77      1874



In [56]:
X

Unnamed: 0,Reviews,Price,Paid,Everyone,Everyone 10+,Mature 17+,Teen,Unrated
0,159.0,0.0,0,1,0,0,0,0
1,967.0,0.0,0,1,0,0,0,0
2,87510.0,0.0,0,1,0,0,0,0
3,215644.0,0.0,0,0,0,0,1,0
4,967.0,0.0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
10834,7.0,0.0,0,1,0,0,0,0
10836,38.0,0.0,0,1,0,0,0,0
10837,4.0,0.0,0,1,0,0,0,0
10839,114.0,0.0,0,0,0,1,0,0


In [41]:
grid = {'n_estimators': [10, 20, 40, 80, 160], 'max_depth': [None, 5, 10, 15], 'min_samples_split': [20, 50, 100, 200], 'max_features':['sqrt', 0.5, None]}
rf_cv = GridSearchCV(RandomForestClassifier(), grid, verbose=1, cv=5, scoring='f1')
rf_cv.fit(X_train, y_train)


Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1200 out of 1200 | elapsed:  7.6min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [57]:
print('best number of estimators:', rf_cv.best_estimator_.n_estimators)
print('best max depth:', rf_cv.best_estimator_.max_depth)
print('best min samples split:', rf_cv.best_estimator_.min_samples_split)
print('best max features:', rf_cv.best_estimator_.max_features)

best number of estimators: 20
best max depth: None
best min samples split: 20
best max features: None


In [58]:
rf_best = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=20, max_features=None)
rf_best.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [59]:
pd.DataFrame({'feature': X_train.columns, 'importance': rf_best.feature_importances_}).sort_values(by='importance', ascending=False)


Unnamed: 0,feature,importance
0,Reviews,0.885138
1,Price,0.060322
3,Everyone,0.016304
6,Teen,0.012327
4,Everyone 10+,0.010478
5,Mature 17+,0.010194
2,Paid,0.005237
7,Unrated,0.0
