In [1]:
import pickle, json, collections, itertools
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# NOTE(review): `sklearn.preprocessing.label` is a private module that newer
# scikit-learn releases no longer ship; the public import below is the supported one.
from sklearn.preprocessing import label
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
import seaborn
%matplotlib inline

# Reading engineered data

Unlike in the previous notebooks, the data loaded here has already been preprocessed (feature-engineered).

In [2]:
# Load the engineered bids table; the first CSV column is used as the row index.
engineered_csv = 'android_bids_us_eng.csv'
df = pd.read_csv(engineered_csv, index_col=0)
df

  mask |= (ar1 == a)


Unnamed: 0,device_osv,marketplace,click,app_cat,user_tz,device_landscape,user_hour,user_day_of_week,device_maker_common,user_isp_simple,user_month,user_day
0,7.0,chartboost,0,UNKNOWN,CST,True,18,5,False,AT&T,9,8
1,7.0,chartboost,0,UNKNOWN,CST,False,18,5,False,T-Mobile,9,8
2,7.1.1,chartboost,0,GAME_ARCADE,CST,True,18,5,False,AT&T,9,8
3,6.0.1,chartboost,0,UNKNOWN,CST,True,18,5,False,Comcast,9,8
4,7.1.1,chartboost,0,GAME_EDUCATIONAL,EST,False,18,5,False,AT&T,9,8
5,7.0,chartboost,0,GAME_SIMULATION,EST,True,18,5,False,T-Mobile,9,8
6,7.1.1,chartboost,0,GAME_CASINO,EST,True,18,5,False,Spectrum,9,8
7,8.0.0,chartboost,0,GAME_ARCADE,EST,False,18,5,False,Sprint,9,8
8,8.0.0,chartboost,0,GAME_ACTION,EST,True,18,5,False,Spectrum,9,8
9,7.0,chartboost,0,GAME_RACING,MST,True,18,5,False,Comcast,9,8


# Transforming strings to labels

In [3]:
# Integer-encode the string-valued columns of `df` in place.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> original-label mapping (`classes_`, `inverse_transform`) of every
# column stays available. The original re-fitted one shared encoder, which
# kept only the mapping of the last column.
# Run once per freshly loaded `df`: re-running would fit the encoders on the
# already-encoded integers and lose the original labels.
categorical_columns = ["app_cat", "device_osv", "marketplace", "user_tz", "user_isp_simple"]
label_encoders = {}
for column in categorical_columns:
    # `labeler` stays bound after the loop to the encoder of the last column,
    # exactly as it was before this change.
    labeler = LabelEncoder()
    df[column] = labeler.fit_transform(df[column])
    label_encoders[column] = labeler

# Splitting to train set and test set
Our data is time-dependent, so we cannot split it randomly.

Instead, we split it at a fixed point in time: rows from before month 11 are used for training, and rows from month 11 are held out for testing.

In [4]:
# Chronological split: month 11 is held out for testing, earlier months train.
# The calendar columns only drive the split, so both subsets drop them.
split_only_columns = ["user_month", "user_day"]
df_test = df.loc[df.user_month == 11].drop(split_only_columns, axis=1)
df_train = df.loc[df.user_month < 11].drop(split_only_columns, axis=1)
summary = "Train Size: {train} , Test Size: {test}".format(train=len(df_train), test=len(df_test))
print(summary)

Train Size: 2690292 , Test Size: 458536


# Model v1

In [5]:
# Baseline: logistic regression with default settings, fitted on every
# training column except the `click` target.
train_features = df_train.drop("click", axis=1)
train_target = df_train["click"]
model = LogisticRegression()
model.fit(train_features, train_target)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

# Metrics

In [6]:
def print_metrics(model):
    """Print accuracy and a classification report for the train and test splits.

    Reads the notebook-level ``df_train`` and ``df_test`` frames; in each one
    the ``click`` column is the target and all remaining columns are passed
    to ``model.predict``. Both predictions are computed before anything is
    printed.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.

    Returns
    -------
    None; the results are written to stdout.
    """
    splits = (
        ("============ Train Accuracy {a:0.2f}% ===========", df_train),
        ("============ Test Accuracy {a:0.2f}% ===========", df_test),
    )
    targets = [frame.click for _, frame in splits]
    predictions = [model.predict(frame.drop("click", axis=1)) for _, frame in splits]
    for (header, _), y_true, y_pred in zip(splits, targets, predictions):
        print(header.format(a=100 * accuracy_score(y_true, y_pred)))
        print(classification_report(y_true, y_pred))

In [7]:
# Print train/test accuracy and the per-class reports for the current `model`.
print_metrics(model)



  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.93      1.00      0.97   2510270
           1       0.00      0.00      0.00    180022

   micro avg       0.93      0.93      0.93   2690292
   macro avg       0.47      0.50      0.48   2690292
weighted avg       0.87      0.93      0.90   2690292

              precision    recall  f1-score   support

           0       0.93      1.00      0.96    425359
           1       0.00      0.00      0.00     33177

   micro avg       0.93      0.93      0.93    458536
   macro avg       0.46      0.50      0.48    458536
weighted avg       0.86      0.93      0.89    458536



# Model v2 - balanced

In [8]:
# Refit the logistic regression with class_weight='balanced', then report
# the same train/test metrics as for the previous model.
train_features = df_train.drop("click", axis=1)
model = LogisticRegression(class_weight='balanced').fit(train_features, df_train["click"])
print_metrics(model)



              precision    recall  f1-score   support

           0       0.95      0.57      0.71   2510270
           1       0.09      0.56      0.15    180022

   micro avg       0.57      0.57      0.57   2690292
   macro avg       0.52      0.57      0.43   2690292
weighted avg       0.89      0.57      0.68   2690292

              precision    recall  f1-score   support

           0       0.94      0.61      0.74    425359
           1       0.10      0.54      0.17     33177

   micro avg       0.61      0.61      0.61    458536
   macro avg       0.52      0.58      0.45    458536
weighted avg       0.88      0.61      0.70    458536



## Understanding the model


In [9]:
def rank_features(model):
    """Rank features by the magnitude of their fitted coefficient.

    Feature names are the columns of the notebook-level ``df_test`` other than
    ``click``; they are paired positionally with ``model.coef_`` flattened to
    one dimension.

    Returns
    -------
    list of ``(feature_name, coefficient)`` tuples, sorted by absolute
    coefficient in descending order.
    """
    feature_names = [name for name in df_test.columns if name != 'click']
    weights = model.coef_.reshape(-1)
    ranked = list(zip(feature_names, weights))
    ranked.sort(key=lambda pair: np.abs(pair[1]), reverse=True)
    return ranked

# Show (feature, coefficient) pairs, largest absolute coefficient first.
rank_features(model)

[('device_landscape', -0.5649738155755559),
 ('device_maker_common', 0.27679649803698103),
 ('device_osv', -0.029923238953753456),
 ('user_tz', 0.015399447106073357),
 ('user_day_of_week', 0.009583738209991107),
 ('user_hour', 0.0013556088754316296),
 ('app_cat', -0.0004252171730277998),
 ('user_isp_simple', -0.00017707912736387876),
 ('marketplace', 0.0)]

# Model v3 - dummy variables

In [10]:
# Reload the CSV (the label-encoding cell overwrote `df` in place), mark the
# hour and weekday columns as categorical, and one-hot encode with get_dummies.
df = (
    pd.read_csv('android_bids_us_eng.csv', index_col=0)
    .astype({"user_day_of_week": "category", "user_hour": "category"})
    .pipe(pd.get_dummies)
)
# Chronological split: month 11 is the test set, earlier months the train set;
# the calendar columns are dropped from both.
calendar_columns = ["user_month", "user_day"]
df_test = df.loc[df.user_month == 11].drop(calendar_columns, axis=1)
df_train = df.loc[df.user_month < 11].drop(calendar_columns, axis=1)
df

  mask |= (ar1 == a)


Unnamed: 0,click,device_landscape,device_maker_common,user_month,user_day,device_osv_4.4.4 KitKat Os For 9inch QuadCore Tablet Pc,device_osv_5.0 by mrkindaiji,device_osv_6.0,device_osv_6.0.1,device_osv_6.1,...,user_day_of_week_5,user_day_of_week_6,user_isp_simple_AT&T,user_isp_simple_AT&T:,user_isp_simple_Comcast,user_isp_simple_Other,user_isp_simple_Spectrum,user_isp_simple_Sprint,user_isp_simple_T-Mobile,user_isp_simple_Verizon
0,0,True,False,9,8,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,0,False,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,True,False,9,8,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,0,True,False,9,8,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
4,0,False,False,9,8,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
5,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
6,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
7,0,False,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
8,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
9,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [17]:
# Balanced logistic regression on the one-hot encoded features, with C=0.01
# (stronger regularization than the default C=1.0), followed by the metrics.
train_features = df_train.drop("click", axis=1)
model = LogisticRegression(class_weight='balanced', C=0.01).fit(train_features, df_train["click"])
print_metrics(model)



              precision    recall  f1-score   support

           0       0.95      0.60      0.73   2510270
           1       0.09      0.57      0.16    180022

   micro avg       0.59      0.59      0.59   2690292
   macro avg       0.52      0.58      0.45   2690292
weighted avg       0.89      0.59      0.69   2690292

              precision    recall  f1-score   support

           0       0.95      0.60      0.74    425359
           1       0.10      0.57      0.17     33177

   micro avg       0.60      0.60      0.60    458536
   macro avg       0.52      0.59      0.45    458536
weighted avg       0.89      0.60      0.70    458536



In [12]:
# Show (feature, coefficient) pairs, largest absolute coefficient first.
rank_features(model)

[('app_cat_GAME_EDUCATIONAL', 0.689606329387082),
 ('app_cat_TOOLS', -0.6800891232152044),
 ('app_cat_GAME_CASUAL', 0.6733188934950181),
 ('app_cat_SOCIAL', -0.6488102549171029),
 ('app_cat_BEAUTY', -0.6204353608737592),
 ('device_landscape', -0.5775376086890068),
 ('app_cat_SHOPPING', -0.5684771211815373),
 ('app_cat_PRODUCTIVITY', 0.5258730564878898),
 ('app_cat_PHOTOGRAPHY', -0.4959234128347795),
 ('app_cat_GAME_ROLE_PLAYING', 0.49468541858826265),
 ('app_cat_SPORTS', 0.48821179442527396),
 ('app_cat_GAME_BOARD', -0.459639692908016),
 ('app_cat_FINANCE', -0.4548820661413046),
 ('app_cat_PERSONALIZATION', 0.45436610333624794),
 ('app_cat_LIBRARIES_AND_DEMO', 0.42216232755560035),
 ('app_cat_MUSIC_AND_AUDIO', -0.40605886196311836),
 ('app_cat_GAME_STRATEGY', -0.3921470898505859),
 ('app_cat_ENTERTAINMENT', 0.3658711263322068),
 ('device_osv_8.0.0', -0.3556747556993395),
 ('app_cat_GAME_MUSIC', 0.34607177128494937),
 ('app_cat_COMICS', -0.3315855041901488),
 ('app_cat_GAME_SPORTS', 0.3

# Model v4 - Removing features
We remove the `device_osv` and `marketplace` columns 
and increase regularization (`C` goes from 0.01 to 0.001).

In [13]:
# Reload the CSV, drop the two columns excluded from this model version, mark
# the hour and weekday columns as categorical, and one-hot encode.
df = (
    pd.read_csv('android_bids_us_eng.csv', index_col=0)
    .drop(["device_osv", "marketplace"], axis=1)
    .astype({"user_day_of_week": "category", "user_hour": "category"})
    .pipe(pd.get_dummies)
)
# Chronological split: month 11 is the test set, earlier months the train set;
# the calendar columns are dropped from both.
calendar_columns = ["user_month", "user_day"]
df_test = df.loc[df.user_month == 11].drop(calendar_columns, axis=1)
df_train = df.loc[df.user_month < 11].drop(calendar_columns, axis=1)
df

  mask |= (ar1 == a)


Unnamed: 0,click,device_landscape,device_maker_common,user_month,user_day,app_cat_ART_AND_DESIGN,app_cat_AUTO_AND_VEHICLES,app_cat_BEAUTY,app_cat_BOOKS_AND_REFERENCE,app_cat_BUSINESS,...,user_day_of_week_5,user_day_of_week_6,user_isp_simple_AT&T,user_isp_simple_AT&T:,user_isp_simple_Comcast,user_isp_simple_Other,user_isp_simple_Spectrum,user_isp_simple_Sprint,user_isp_simple_T-Mobile,user_isp_simple_Verizon
0,0,True,False,9,8,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,0,False,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,True,False,9,8,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,0,False,False,9,8,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
5,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
6,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
7,0,False,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
8,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
9,0,True,False,9,8,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [14]:
# Balanced logistic regression on the reduced feature set, with C=0.001
# (the smallest C, i.e. strongest regularization, used in this notebook).
train_features = df_train.drop("click", axis=1)
model = LogisticRegression(class_weight='balanced', C=0.001).fit(train_features, df_train["click"])
print_metrics(model)



              precision    recall  f1-score   support

           0       0.95      0.59      0.73   2510270
           1       0.09      0.57      0.16    180022

   micro avg       0.59      0.59      0.59   2690292
   macro avg       0.52      0.58      0.44   2690292
weighted avg       0.89      0.59      0.69   2690292

              precision    recall  f1-score   support

           0       0.95      0.60      0.73    425359
           1       0.10      0.57      0.17     33177

   micro avg       0.60      0.60      0.60    458536
   macro avg       0.52      0.59      0.45    458536
weighted avg       0.89      0.60      0.69    458536



In [15]:
# Show (feature, coefficient) pairs, largest absolute coefficient first.
rank_features(model)

[('device_landscape', -0.5672591608089603),
 ('app_cat_GAME_CASUAL', 0.5671894597382877),
 ('app_cat_GAME_EDUCATIONAL', 0.5501618759428467),
 ('app_cat_GAME_BOARD', -0.5403013209228194),
 ('app_cat_GAME_ROLE_PLAYING', 0.3812559503483676),
 ('app_cat_GAME_STRATEGY', -0.3750969730544569),
 ('app_cat_TOOLS', -0.36590387342570374),
 ('app_cat_SOCIAL', -0.3250720437001381),
 ('app_cat_GAME_CARD', -0.310438923035424),
 ('app_cat_MUSIC_AND_AUDIO', -0.30141549351680724),
 ('app_cat_PRODUCTIVITY', 0.2632368627889057),
 ('app_cat_ENTERTAINMENT', 0.2610528770412266),
 ('app_cat_PERSONALIZATION', 0.2461618203829356),
 ('device_maker_common', 0.2192632608384496),
 ('app_cat_LIBRARIES_AND_DEMO', 0.21621399245736916),
 ('app_cat_GAME_TRIVIA', -0.21518080690181207),
 ('app_cat_GAME_SPORTS', 0.20729781221515994),
 ('app_cat_GAME_RACING', 0.19165909134722192),
 ('app_cat_GAME_WORD', -0.18912768236888983),
 ('app_cat_SPORTS', 0.15485744560707365),
 ('app_cat_GAME_MUSIC', 0.14848448993483743),
 ('app_cat_

# Exercise:
1. Train a `DecisionTreeClassifier` model and plot it with `export_graphviz` to see which features are important.
1. Train a `RandomForestClassifier` model and print its `feature_importances_`.
1. What is the difference between this feature importance measure and `mutual_info`?