In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

## Day 30 Lecture 2 Assignment

In this assignment, we will learn about random forests. We will use the google play store dataset loaded below.

In [85]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report

import ssl

ssl._create_default_https_context = ssl._create_unverified_context

<IPython.core.display.Javascript object>

In [13]:
def one_hot_encode(X, encode_cols, index=None):
    other_cols = [c for c in X.columns if c not in encode_cols]

    ct = ColumnTransformer(
        #   Format
        #   [("name of step", what_to_do(), [what columns to do it to])]
        [("one hot encode", OneHotEncoder(drop="first", sparse=False), encode_cols)],
        remainder="passthrough",
    )

    ct.fit(X)

    # This is not the flexible and definitely not the most
    # readable way to get column names, a function would be better
    encoded_names = ct.transformers_[0][1].get_feature_names()
    encoded_names = list(encoded_names)

    X_encoded = ct.transform(X)
    X_encoded = pd.DataFrame(X_encoded, columns=encoded_names + other_cols, index=index)

    return X_encoded

<IPython.core.display.Javascript object>

In [59]:
reviews = pd.read_csv(
    "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/googleplaystore.csv"
)

reviews.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


<IPython.core.display.Javascript object>

In this assignment, you will work more independently. Perform the following steps:
    
1. Select which columns are best suited to predict whether the rating is above 4.5
2. Process the data (including transforming to the correct column type, removing missing values, creating dummy variables, and removing irrelevant variables)
3. Create a random forest model and evaluate
4. Tweak the parameters to produce a better performing model
5. Show and discuss your results

Good luck!

In [60]:
print(reviews.shape)
reviews.info()

(10841, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


<IPython.core.display.Javascript object>

In [61]:
reviews = reviews.dropna()
reviews.shape

(9360, 13)

<IPython.core.display.Javascript object>

In [62]:
# Convert Category, Content Rating to one hot
# Convert Installs to ordinal
# Encode Type
# Drop Size, Price, Genres, Last Updated, Current Ver, Android Ver
# Use App as index
# Create rating threshold column

<IPython.core.display.Javascript object>

In [63]:
reviews = one_hot_encode(
    reviews, encode_cols=["Category", "Content Rating"], index=reviews.index
)
reviews.head()

Unnamed: 0,x0_AUTO_AND_VEHICLES,x0_BEAUTY,x0_BOOKS_AND_REFERENCE,x0_BUSINESS,x0_COMICS,x0_COMMUNICATION,x0_DATING,x0_EDUCATION,x0_ENTERTAINMENT,x0_EVENTS,...,Rating,Reviews,Size,Installs,Type,Price,Genres,Last Updated,Current Ver,Android Ver
0,0,0,0,0,0,0,0,0,0,0,...,4.1,159,19M,"10,000+",Free,0,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,0,0,0,0,0,0,0,0,0,0,...,3.9,967,14M,"500,000+",Free,0,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,0,0,0,0,0,0,0,0,0,0,...,4.7,87510,8.7M,"5,000,000+",Free,0,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,0,0,0,0,0,0,0,0,0,0,...,4.5,215644,25M,"50,000,000+",Free,0,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,0,0,0,0,0,0,0,0,0,0,...,4.3,967,2.8M,"100,000+",Free,0,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


<IPython.core.display.Javascript object>

In [64]:
installs_dict = reviews["Installs"].value_counts().to_dict()
installs_dict = dict(sorted(reviews_dict.items()))

installs_map = dict()
count = 0
for k, v in installs_dict.items():
    installs_map[k] = count
    count += 1

<IPython.core.display.Javascript object>

In [65]:
reviews["Installs"] = reviews["Installs"].replace(installs_map)

<IPython.core.display.Javascript object>

In [66]:
reviews["Type"] = (reviews["Type"] == "Free").astype(int)

<IPython.core.display.Javascript object>

In [67]:
reviews = reviews.drop(
    columns=["Size", "Price", "Genres", "Last Updated", "Current Ver", "Android Ver"]
)

<IPython.core.display.Javascript object>

In [68]:
reviews = reviews.set_index("App")

<IPython.core.display.Javascript object>

In [69]:
reviews

Unnamed: 0_level_0,x0_AUTO_AND_VEHICLES,x0_BEAUTY,x0_BOOKS_AND_REFERENCE,x0_BUSINESS,x0_COMICS,x0_COMMUNICATION,x0_DATING,x0_EDUCATION,x0_ENTERTAINMENT,x0_EVENTS,...,x0_WEATHER,x1_Everyone,x1_Everyone 10+,x1_Mature 17+,x1_Teen,x1_Unrated,Rating,Reviews,Installs,Type
App,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Photo Editor & Candy Camera & Grid & ScrapBook,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4.1,159,5,1
Coloring book moana,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,3.9,967,17,1
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4.7,87510,12,1
Sketch - Draw & Paint,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,4.5,215644,15,1
Pixel Draw - Number Art Coloring Book,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4.3,967,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FR Calculator,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4,7,16,1
Sya9a Maroc - FR,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4.5,38,11,1
Fr. Mike Schmitz Audio Teachings,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,5,4,7,1
The SCP Foundation DB fr nn5n,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,4.5,114,1,1


<IPython.core.display.Javascript object>

In [70]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9360 entries, Photo Editor & Candy Camera & Grid & ScrapBook to iHoroscope - 2018 Daily Horoscope & Astrology
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   x0_AUTO_AND_VEHICLES    9360 non-null   object
 1   x0_BEAUTY               9360 non-null   object
 2   x0_BOOKS_AND_REFERENCE  9360 non-null   object
 3   x0_BUSINESS             9360 non-null   object
 4   x0_COMICS               9360 non-null   object
 5   x0_COMMUNICATION        9360 non-null   object
 6   x0_DATING               9360 non-null   object
 7   x0_EDUCATION            9360 non-null   object
 8   x0_ENTERTAINMENT        9360 non-null   object
 9   x0_EVENTS               9360 non-null   object
 10  x0_FAMILY               9360 non-null   object
 11  x0_FINANCE              9360 non-null   object
 12  x0_FOOD_AND_DRINK       9360 non-null   object
 13  x0_GAME                 936

<IPython.core.display.Javascript object>

In [71]:
reviews["Rating"] = pd.to_numeric(reviews["Rating"])
reviews["Reviews"] = pd.to_numeric(reviews["Reviews"])

<IPython.core.display.Javascript object>

In [79]:
reviews["Rating Above 4.5"] = (reviews["Rating"] > 4.5).astype(int)
reviews = reviews.drop(columns=["Rating"])

<IPython.core.display.Javascript object>

In [80]:
X = reviews.drop(columns=["Rating Above 4.5"])
y = reviews["Rating Above 4.5"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

<IPython.core.display.Javascript object>

In [81]:
selector = SelectFromModel(RandomForestClassifier())
selector.fit(X_train, y_train)

feat_names = X.columns[selector.get_support()]

X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

X_train = pd.DataFrame(X_train, index=y_train.index, columns=feat_names)
X_test = pd.DataFrame(X_test, index=y_test.index, columns=feat_names)

<IPython.core.display.Javascript object>

In [82]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

<IPython.core.display.Javascript object>

In [83]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"train_score: {train_score}")
print(f"test_score: {test_score}")

train_score: 0.9421741452991453
test_score: 0.7938034188034188


<IPython.core.display.Javascript object>

In [86]:
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_prob > 0.2).astype(int)

confusion_matrix(y_test, y_pred)

array([[1106,  411],
       [ 122,  233]])

<IPython.core.display.Javascript object>

In [87]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.73      0.81      1517
           1       0.36      0.66      0.47       355

    accuracy                           0.72      1872
   macro avg       0.63      0.69      0.64      1872
weighted avg       0.80      0.72      0.74      1872



<IPython.core.display.Javascript object>

In [88]:
importance_df = pd.DataFrame(
    {"feat": X_train.columns, "importance": model.feature_importances_}
)

<IPython.core.display.Javascript object>

In [89]:
importance_df = importance_df.sort_values("importance", ascending=False)
importance_df

Unnamed: 0,feat,importance
0,Reviews,0.920817
1,Installs,0.079183


<IPython.core.display.Javascript object>

In [90]:
X_train.columns

Index(['Reviews', 'Installs'], dtype='object')

<IPython.core.display.Javascript object>

In [91]:
X.columns

Index(['x0_AUTO_AND_VEHICLES', 'x0_BEAUTY', 'x0_BOOKS_AND_REFERENCE',
       'x0_BUSINESS', 'x0_COMICS', 'x0_COMMUNICATION', 'x0_DATING',
       'x0_EDUCATION', 'x0_ENTERTAINMENT', 'x0_EVENTS', 'x0_FAMILY',
       'x0_FINANCE', 'x0_FOOD_AND_DRINK', 'x0_GAME', 'x0_HEALTH_AND_FITNESS',
       'x0_HOUSE_AND_HOME', 'x0_LIBRARIES_AND_DEMO', 'x0_LIFESTYLE',
       'x0_MAPS_AND_NAVIGATION', 'x0_MEDICAL', 'x0_NEWS_AND_MAGAZINES',
       'x0_PARENTING', 'x0_PERSONALIZATION', 'x0_PHOTOGRAPHY',
       'x0_PRODUCTIVITY', 'x0_SHOPPING', 'x0_SOCIAL', 'x0_SPORTS', 'x0_TOOLS',
       'x0_TRAVEL_AND_LOCAL', 'x0_VIDEO_PLAYERS', 'x0_WEATHER', 'x1_Everyone',
       'x1_Everyone 10+', 'x1_Mature 17+', 'x1_Teen', 'x1_Unrated', 'Reviews',
       'Installs', 'Type'],
      dtype='object')

<IPython.core.display.Javascript object>