The marketing department took a look at the statistics. They realized that Android users are performing much better in the app (more activity, more revenue, etc.) and they started to work harder on getting more Android users registered, as well as on dedicated in-app campaigns for retaining these users.

But they have also realized that nearly 40% of users didn't send phone_type information. This must be a logging error. 
The task is: 
1. to take the phone_type column of your registrations table and try to figure out the real values of the error values in it.
2. to estimate the accuracy of the model and try to map out a few segments where it's surely underperforming.

In [65]:
#importing the necessary libraries
import pandas as pd
import numpy as np

In [2]:
#reading the necessary datasets
registration = pd.read_csv('/home/dataguy/registration/registration.csv', delimiter = ' ', names=['day','date','time','user_id','event','birth_year','phone','country','source'])
free_tree = pd.read_csv('/home/dataguy/free_tree/free_tree.csv', header = None, delimiter = ' ', names = ['day','date','user_id','event'])
super_tree = pd.read_csv('/home/dataguy/super_tree/super_tree.csv', header = None, delimiter = ' ', names = ['day','date','user_id','event'])

In [3]:
registration.tail()

Unnamed: 0,day,date,time,user_id,event,birth_year,phone,country,source
253048,day_266,2020-06-04,23:57:50,1253049,registration,1983,android,brazil,invite_a_friend
253049,day_266,2020-06-04,23:57:55,1253050,registration,1991,error,united_states,invite_a_friend
253050,day_266,2020-06-04,23:58:53,1253051,registration,1983,error,philippines,invite_a_friend
253051,day_266,2020-06-04,23:59:17,1253052,registration,1992,error,brazil,article
253052,day_266,2020-06-04,23:59:46,1253053,registration,1985,ios,philippines,invite_a_friend


In [7]:
#in my solution I will be trying to predict the phone type based on the number of free and super trees sent by a user,
#the acquisition channel, the location, and the birth year 
free_count = free_tree.groupby('user_id').count()[['event']].reset_index()
super_count = super_tree.groupby('user_id').count()[['event']].reset_index()

In [129]:
reg_free = registration.merge(free_count, on = 'user_id')[['user_id','phone','source','country','birth_year','event_y']].rename(columns={'event_y':'free_tree'})

In [130]:
reg_free.head()

Unnamed: 0,user_id,phone,source,country,birth_year,free_tree
0,1000001,android,invite_a_friend,sweden,1983,5
1,1000002,ios,invite_a_friend,united_states,1986,4
2,1000003,error,invite_a_friend,sweden,1989,37
3,1000005,ios,invite_a_friend,united_states,1986,6
4,1000006,android,invite_a_friend,sweden,1983,4


In [134]:
big_table = reg_free.merge(super_count, how ='outer', on = 'user_id')[['user_id','phone','source','country','birth_year','free_tree','event']].rename(columns={'event':'super_tree'})
big_table = big_table.fillna(value=0)

In [135]:
big_table.head()

Unnamed: 0,user_id,phone,source,country,birth_year,free_tree,super_tree
0,1000001,android,invite_a_friend,sweden,1983.0,5.0,0.0
1,1000002,ios,invite_a_friend,united_states,1986.0,4.0,0.0
2,1000003,error,invite_a_friend,sweden,1989.0,37.0,0.0
3,1000005,ios,invite_a_friend,united_states,1986.0,6.0,0.0
4,1000006,android,invite_a_friend,sweden,1983.0,4.0,0.0


In [136]:
#mapping of the categorical values
phone_dict = {'android':0,'ios':1,'error':2}
source_dict = {'invite_a_friend':0,'google':1,'article':2,'paid':3}
country_dict = {'sweden':0,'united_states':1,'brazil':2,'germany':3,'philippines':4}
big_table['phone'] = big_table['phone'].map(phone_dict)
big_table['source'] = big_table['source'].map(source_dict)
big_table['country'] = big_table['country'].map(country_dict)

In [137]:
big_table.sample(10)

Unnamed: 0,user_id,phone,source,country,birth_year,free_tree,super_tree
181995,1235116,1.0,0.0,1.0,1989.0,7.0,0.0
90861,1135975,0.0,2.0,1.0,1988.0,9.0,0.0
21504,1047250,0.0,0.0,3.0,1991.0,3.0,0.0
37645,1067839,0.0,1.0,0.0,1986.0,3.0,0.0
116665,1150983,2.0,0.0,0.0,1998.0,1.0,0.0
34386,1063622,0.0,1.0,1.0,1990.0,7.0,1.0
111264,1144023,2.0,1.0,1.0,1987.0,7.0,0.0
76798,1118035,2.0,0.0,1.0,1985.0,23.0,0.0
173772,1224563,1.0,0.0,2.0,1989.0,1.0,0.0
139339,1180158,0.0,0.0,0.0,1984.0,7.0,0.0


In [177]:
#separating the data into datasets with android and ios users and the error (unknown) values
known_values = big_table[(big_table.phone == 0) | (big_table.phone == 1)]
unknown_values = big_table[big_table.phone == 2]

In [139]:
unknown_values.head()

Unnamed: 0,user_id,phone,source,country,birth_year,free_tree,super_tree
2,1000003,2.0,0.0,0.0,1989.0,37.0,0.0
7,1000010,2.0,0.0,2.0,1993.0,1.0,6.0
8,1000011,2.0,0.0,1.0,1995.0,11.0,2.0
9,1000012,2.0,0.0,1.0,1986.0,35.0,0.0
10,1000013,2.0,2.0,1.0,1993.0,9.0,1.0


In [47]:
from sklearn.ensemble import RandomForestClassifier

In [140]:
#calling the model and establishing features and the output variable
model = RandomForestClassifier(n_estimators=100)
X = known_values[['source','country','birth_year','free_tree','super_tree']]
y = known_values['phone']

In [141]:
model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [148]:
#pulling predictions for the unknown (error) values
predictions = model.predict(unknown_values[['source','country','birth_year','free_tree','super_tree']])

In [None]:
unknown_values.loc[:,'predictions'] = predictions

In [156]:
unknown_values.tail(20)

Unnamed: 0,user_id,phone,source,country,birth_year,free_tree,super_tree,predictions
195635,1252987,2.0,1.0,2.0,1989.0,1.0,0.0,0.0
195641,1252995,2.0,0.0,4.0,1986.0,1.0,0.0,1.0
195645,1252999,2.0,0.0,1.0,1986.0,12.0,0.0,0.0
195646,1253002,2.0,0.0,1.0,1983.0,2.0,0.0,1.0
195647,1253004,2.0,2.0,1.0,1996.0,1.0,0.0,0.0
195649,1253006,2.0,1.0,2.0,1987.0,1.0,0.0,0.0
195650,1253009,2.0,0.0,2.0,1988.0,1.0,2.0,0.0
195651,1253010,2.0,0.0,1.0,1990.0,16.0,1.0,0.0
195654,1253014,2.0,0.0,2.0,1990.0,1.0,1.0,1.0
195656,1253016,2.0,1.0,0.0,1988.0,1.0,0.0,1.0


In [195]:
phone_reverse = {0:'android',1:'ios',2:'error'}
source_reverse = {0:'invite_a_friend',1:'google',2:'article',3:'paid'}
country_reverse = {0:'sweden',1:'united_states',2:'brazil',3:'germany',4:'philippines'}
unknown_values.loc[:,'phone'] = unknown_values['phone'].map(phone_reverse)
unknown_values.loc[:,'predictions'] = unknown_values['predictions'].map(phone_reverse)
unknown_values.loc[:,'source'] = unknown_values['source'].map(source_reverse)
unknown_values.loc[:,'country'] = unknown_values['country'].map(country_reverse)

In [196]:
#the result of our model
unknown_values.head()

Unnamed: 0,user_id,phone,source,country,birth_year,free_tree,super_tree,predictions
2,1000003,error,invite_a_friend,sweden,1989.0,37.0,0.0,android
7,1000010,error,invite_a_friend,brazil,1993.0,1.0,6.0,android
8,1000011,error,invite_a_friend,united_states,1995.0,11.0,2.0,android
9,1000012,error,invite_a_friend,united_states,1986.0,35.0,0.0,android
10,1000013,error,article,united_states,1993.0,9.0,1.0,android


In [171]:
#now we can check how well the model works for the known values
pred2 = model.predict(known_values[['source','country','birth_year','free_tree','super_tree']])

In [None]:
known_values.loc[:,'predictions'] = pred2

In [180]:
known_values.loc[:,'phone'] = known_values['phone'].map(phone_reverse)
known_values.loc[:,'predictions'] = known_values['predictions'].map(phone_reverse)
known_values.loc[:,'source'] = known_values['source'].map(source_reverse)
known_values.loc[:,'country'] = known_values['country'].map(country_reverse)
known_values.sample(5)

Unnamed: 0,user_id,phone,source,country,birth_year,free_tree,super_tree,predictions
36911,1066884,ios,invite_a_friend,united_states,1988.0,5.0,0.0,ios
82530,1125407,android,invite_a_friend,philippines,1990.0,18.0,0.0,android
130906,1169372,ios,invite_a_friend,sweden,1989.0,2.0,0.0,ios
7016,1028796,android,invite_a_friend,brazil,1985.0,4.0,0.0,android
25926,1052861,android,google,sweden,1995.0,1.0,0.0,android


In [118]:
#by the means of sklearn.metrics package we can calcualte the accuracy of our model
from sklearn import metrics

In [185]:
metrics.confusion_matrix(known_values.phone,known_values.predictions)

array([[88838,  1863],
       [ 1099, 23027]])

In [186]:
metrics.accuracy_score(known_values.phone,known_values.predictions)

0.9742046731169498

In [None]:
#Segments, where the model is underperforming

In [188]:
#first, I check where we have a mismatch between the actual values and the predicted ones
mismatch = known_values[known_values.phone != known_values.predictions]

In [192]:
#then I group the result by microsegments (source and country)
#the result follows the logic of the free and super tree sends as those segments are the most popular
mismatch.groupby(['source','country']).count()[['predictions']].sort_values(by = 'predictions', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,predictions
source,country,Unnamed: 2_level_1
invite_a_friend,united_states,718
invite_a_friend,brazil,505
google,united_states,353
invite_a_friend,sweden,234
invite_a_friend,philippines,189
google,sweden,172
article,united_states,165
paid,united_states,124
google,philippines,121
invite_a_friend,germany,91
