## Import Libraries

In [1]:
import pandas as pd                                                #import dataset
from sklearn.model_selection import train_test_split               #split dataset into train and test
from sklearn.metrics import mean_squared_error                     #metrics
import numpy as np                                                 #numpy operations
import seaborn as sns                                              #plotting
import matplotlib.pyplot as plt                                    #plotting
from scipy.sparse import hstack, vstack                            #concatenation of features
import itertools                                                   #loop through hyperparameters
import xgboost as xgb                                              #XGBoost Regressor
from sklearn.model_selection import RandomizedSearchCV             #hyperparamter tuning
from tqdm import tqdm                                              #to track progress
from sklearn.preprocessing import Normalizer, StandardScaler       #standardizing
from sklearn.feature_extraction.text import TfidfVectorizer        #TF IDF Vectorizer
from prettytable import PrettyTable                                #summarize results
from sklearn.linear_model import SGDRegressor                      #Linear regressor
from sklearn.svm import SVR                                        #SVM regressor

## Data Import

In [2]:
df_noevents = pd.read_pickle("dataset/gender_predictions/df_train_noevents.pkl")
df_events = pd.read_pickle("dataset/gender_predictions/df_train_events.pkl")

## Data Cleaning

In [3]:
df_noevents.loc[df_noevents['specs_available']==0,'screen_size']=0
df_noevents.loc[df_noevents['specs_available']==0,'ram_gb']=0
df_noevents.loc[df_noevents['specs_available']==0,'camera']=0

In [4]:
df_events.loc[df_events['specs_available']==0,'screen_size']=0
df_events.loc[df_events['specs_available']==0,'ram_gb']=0
df_events.loc[df_events['specs_available']==0,'camera']=0
df_events.loc[df_events['app_usage'].isna(),'app_usage']=0
df_events.loc[df_events['app_usage_session'].isna(),'app_usage_session']=0

### Splitting into Train, Cross-Validate and Test Sets

In [5]:
df_noevents_data, df_noevents_test = train_test_split(df_noevents, test_size = 0.2)
df_noevents_train, df_noevents_cv = train_test_split(df_noevents_data, test_size = 0.2)

In [6]:
df_events_data, df_events_test = train_test_split(df_events, test_size = 0.2)
df_events_train, df_events_cv = train_test_split(df_events_data, test_size = 0.2)

In [7]:
print(df_noevents_train.shape, df_noevents_cv.shape, df_noevents_test.shape)
print(df_events_train.shape, df_events_cv.shape, df_events_test.shape)

(32867, 13) (8217, 13) (10271, 13)
(14905, 26) (3727, 26) (4658, 26)


In [8]:
print(df_noevents_train.columns)
print(df_events_train.columns)

Index(['device_id', 'brand', 'model', 'gender', 'age', 'group', 'screen_size',
       'ram_gb', 'camera', 'specs_available', 'female_pred', 'male_pred',
       'release_bin'],
      dtype='object')
Index(['device_id', 'brand', 'model', 'gender', 'age', 'group', 'app_usage',
       'app_usage_session', 'active_app_usage_counts', 'activity_hour',
       'activity_day', 'num_travels', 'mean_latitude', 'mean_longitude',
       'screen_size', 'ram_gb', 'camera', 'installed_app_labels',
       'active_app_labels', 'installed_app_counts', 'active_app_usage',
       'location_available', 'specs_available', 'female_pred', 'male_pred',
       'release_bin'],
      dtype='object')


## No Events

## Utility Functions

In [9]:
def hyperparameter_tuning(regressor, hp_list, hp, X_train, y_train, X_cv, y_cv, X_test, y_test):
    """Function to perform Hyperparameter tuning of the models. Takes the model name, hyperparameter name and list 
    along with the train, cross validation and test datasets as input"""
    
    #stores the losses
    cv_log_error_array = []
    
    for i in hp_list:
        
        print("for {} = {}".format(hp,i))
        
        #initialize the classifier with the hyperparameter
        if regressor == "lr-l2":
            reg = SGDRegressor(penalty='l2', alpha=i)
        elif regressor == "lr-l1":
            reg = SGDRegressor(penalty='l1', alpha=i)
        elif regressor == "svr-rbf":
            reg = SVR(kernel='rbf', C=i)
        elif regressor == "svr-l":
            reg = SVR(kernel='linear', C=i)
        
        
        #fit and train the model
        reg.fit(X_train, y_train)
        reg_pred = reg.predict(X_cv)
        cv_log_error_array.append(mean_squared_error(y_cv, reg_pred))
        
        # to avoid rounding error while multiplying probabilites we use log-probability estimates
        print("Squared error :",mean_squared_error(y_cv, reg_pred)) 

    #select the hyperparameter with the best performance metric
    best_hp = np.argmin(cv_log_error_array)
    
    #retraining with the best hyperparameter
    if regressor == "lr-l2":
        reg = SGDRegressor(penalty='l2', alpha=hp_list[best_hp])
    elif regressor == "lr-l1":
        reg = SGDRegressor(penalty='l1', alpha=hp_list[best_hp])
    elif regressor == "svr-rbf":
        reg = SVR(kernel='rbf', C=hp_list[best_hp])
    elif regressor == "svr-l":
        reg = SVR(kernel='linear', C=hp_list[best_hp])
    
    reg.fit(X_train, y_train)
       
    #getting train, cross validation and test log loss
    predict_y = reg.predict(X_train)
    train_loss = mean_squared_error(y_train, predict_y)
    print('For values of best ',hp,' = ', hp_list[best_hp], "The train squared error is:", train_loss)
    predict_y = reg.predict(X_cv)
    cv_loss = mean_squared_error(y_cv, predict_y)
    print('For values of best ',hp,' = ', hp_list[best_hp], "The cross validation squared error is:", cv_loss)
    predict_y = reg.predict(X_test)
    test_loss = mean_squared_error(y_test, predict_y)
    print('For values of best ',hp,' = ', hp_list[best_hp], "The test squared error is:", test_loss)
    
    return test_loss

### Feature Variables

In [10]:
noevents_brand_encoder = TfidfVectorizer()

noevents_brand_train = noevents_brand_encoder.fit_transform(df_noevents_train['brand'])
noevents_brand_cv = noevents_brand_encoder.transform(df_noevents_cv['brand'])
noevents_brand_test = noevents_brand_encoder.transform(df_noevents_test['brand'])

In [11]:
noevents_model_encoder = TfidfVectorizer()

noevents_model_train = noevents_model_encoder.fit_transform(df_noevents_train['model'])
noevents_model_cv = noevents_model_encoder.transform(df_noevents_cv['model'])
noevents_model_test = noevents_model_encoder.transform(df_noevents_test['model'])

In [12]:
noevents_x_train = hstack((noevents_brand_train, noevents_model_train))
noevents_x_cv = hstack((noevents_brand_cv, noevents_model_cv))
noevents_x_test = hstack((noevents_brand_test, noevents_model_test))

### Target Variables

In [13]:
noevents_y_train = df_noevents_train['age']
noevents_y_cv = df_noevents_cv['age']
noevents_y_test = df_noevents_test['age']

In [14]:
noevents_model_lrl2 = "No Events - Ridge Regression"

alpha = [10 ** x for x in range(-6, 3)]

noevents_mse_lrl2 = hyperparameter_tuning("lr-l2", alpha, "alpha", noevents_x_train, noevents_y_train, noevents_x_cv, noevents_y_cv, noevents_x_test, noevents_y_test)

for alpha = 1e-06
Squared error : 95.52341878415832
for alpha = 1e-05
Squared error : 95.50810322309036
for alpha = 0.0001
Squared error : 95.32938806965176
for alpha = 0.001
Squared error : 95.16945076640057
for alpha = 0.01
Squared error : 94.88997206694378
for alpha = 0.1
Squared error : 95.58721524962007
for alpha = 1
Squared error : 96.82189480643736
for alpha = 10
Squared error : 97.30932627133294
for alpha = 100
Squared error : 97.32446422966807
For values of best  alpha  =  0.01 The train squared error is: 94.34106077957861
For values of best  alpha  =  0.01 The cross validation squared error is: 94.89081637561932
For values of best  alpha  =  0.01 The test squared error is: 94.59755890108413


In [15]:
noevents_model_lrl1 = "No Events - Lasso Regression"

alpha = [10 ** x for x in range(-6, 3)]

noevents_mse_lrl1 = hyperparameter_tuning("lr-l1", alpha, "alpha", noevents_x_train, noevents_y_train, noevents_x_cv, noevents_y_cv, noevents_x_test, noevents_y_test)

for alpha = 1e-06
Squared error : 95.52666786890819
for alpha = 1e-05
Squared error : 95.51671118308965
for alpha = 0.0001
Squared error : 95.45144436024934
for alpha = 0.001
Squared error : 95.38433272908036
for alpha = 0.01
Squared error : 95.17970006723331
for alpha = 0.1
Squared error : 96.35832921316091
for alpha = 1
Squared error : 97.3441000975766
for alpha = 10
Squared error : 97.34355760971404
for alpha = 100
Squared error : 97.34347832879641
For values of best  alpha  =  0.01 The train squared error is: 94.9684364042655
For values of best  alpha  =  0.01 The cross validation squared error is: 95.18445276310342
For values of best  alpha  =  0.01 The test squared error is: 95.15704461367886


In [16]:
noevents_model_xgb = "No Events - RBF XGBRegressor"

params = {
'eta':[0.02, 0.01, 0.1, 0.5],
'max_depth':[4,6,8,10],
'n_estimators':[5,10,20,50,100]
}
reg_age_noevents = xgb.XGBRegressor()
x = vstack((noevents_x_train, noevents_x_cv))
y = np.concatenate([df_noevents_train['age'], df_noevents_cv['age']])
#trying out 30 possible combinations of hyperparameters
rs_reg = RandomizedSearchCV(reg_age_noevents, params, verbose = 2, cv = 2, n_iter = 30, n_jobs=8)
rs_reg.fit(x,y)
best_reg_age_noevents = rs_reg.best_estimator_
predict_y = best_reg_age_noevents.predict(noevents_x_test)
noevents_mse_xgb = mean_squared_error(df_noevents_test['age'], predict_y)
print("The test mean squared error is:",noevents_mse_xgb)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done  60 out of  60 | elapsed:    7.5s finished


The test mean squared error is: 94.32567334444948


## Results for records without events

In [17]:
#summarizing results
from prettytable import PrettyTable

table = PrettyTable()

table.field_names = ["Model", "MSE"]

table.add_row([noevents_model_lrl2, noevents_mse_lrl2])
table.add_row([noevents_model_lrl1, noevents_mse_lrl1])
table.add_row([noevents_model_xgb, noevents_mse_xgb])

print(table)

+------------------------------+-------------------+
|            Model             |        MSE        |
+------------------------------+-------------------+
| No Events - Ridge Regression | 94.59755890108413 |
| No Events - Lasso Regression | 95.15704461367886 |
| No Events - RBF XGBRegressor | 94.32567334444948 |
+------------------------------+-------------------+


## Including device specifications

In [18]:
scaler = StandardScaler()

In [19]:
noevents_screen_train = scaler.fit_transform(df_noevents_train['screen_size'].values.reshape(-1,1))
noevents_screen_cv = scaler.transform(df_noevents_cv['screen_size'].values.reshape(-1,1))
noevents_screen_test = scaler.transform(df_noevents_test['screen_size'].values.reshape(-1,1))

In [20]:
noevents_ram_train = scaler.fit_transform(df_noevents_train['ram_gb'].values.reshape(-1,1))
noevents_ram_cv = scaler.transform(df_noevents_cv['ram_gb'].values.reshape(-1,1))
noevents_ram_test = scaler.transform(df_noevents_test['ram_gb'].values.reshape(-1,1))

In [21]:
noevents_camera_train = scaler.fit_transform(df_noevents_train['camera'].values.reshape(-1,1))
noevents_camera_cv = scaler.transform(df_noevents_cv['camera'].values.reshape(-1,1))
noevents_camera_test = scaler.transform(df_noevents_test['camera'].values.reshape(-1,1))

In [22]:
noevents_release_train = scaler.fit_transform(df_noevents_train['release_bin'].values.reshape(-1,1))
noevents_release_cv = scaler.transform(df_noevents_cv['release_bin'].values.reshape(-1,1))
noevents_release_test = scaler.transform(df_noevents_test['release_bin'].values.reshape(-1,1))

In [23]:
noevents_x_train = hstack((noevents_brand_train, noevents_model_train, noevents_screen_train, noevents_ram_train, 
                           noevents_camera_train, noevents_release_train, df_noevents_train['specs_available'].values.reshape(-1,1)))
noevents_x_cv = hstack((noevents_brand_cv, noevents_model_cv, noevents_screen_cv, noevents_ram_cv,
                        noevents_camera_cv, noevents_release_cv, df_noevents_cv['specs_available'].values.reshape(-1,1)))
noevents_x_test = hstack((noevents_brand_test, noevents_model_test, noevents_screen_test, noevents_ram_test,
                          noevents_camera_test, noevents_release_test, df_noevents_test['specs_available'].values.reshape(-1,1)))

In [24]:
noevents_model_lrl2 = "No Events - Ridge Regression"

alpha = [10 ** x for x in range(-6, 3)]

noevents_mse_lrl2 = hyperparameter_tuning("lr-l2", alpha, "alpha", noevents_x_train, noevents_y_train, noevents_x_cv, noevents_y_cv, noevents_x_test, noevents_y_test)

for alpha = 1e-06
Squared error : 95.9081269485412
for alpha = 1e-05
Squared error : 95.89260543077627
for alpha = 0.0001
Squared error : 95.83876979811068
for alpha = 0.001
Squared error : 96.33217765732775
for alpha = 0.01
Squared error : 97.2888999447876
for alpha = 0.1
Squared error : 95.46657129687436
for alpha = 1
Squared error : 96.46977323562638
for alpha = 10
Squared error : 97.20566582660639
for alpha = 100
Squared error : 97.30888558721028
For values of best  alpha  =  0.1 The train squared error is: 95.54667338706129
For values of best  alpha  =  0.1 The cross validation squared error is: 95.45510525067715
For values of best  alpha  =  0.1 The test squared error is: 95.27041430554122


In [25]:
noevents_model_lrl1 = "No Events - Lasso Regression"

alpha = [10 ** x for x in range(-6, 3)]

noevents_mse_lrl1 = hyperparameter_tuning("lr-l1", alpha, "alpha", noevents_x_train, noevents_y_train, noevents_x_cv, noevents_y_cv, noevents_x_test, noevents_y_test)

for alpha = 1e-06
Squared error : 95.92020183839617
for alpha = 1e-05
Squared error : 95.90242361792455
for alpha = 0.0001
Squared error : 95.89707843956855
for alpha = 0.001
Squared error : 96.29568419260394
for alpha = 0.01
Squared error : 98.34012079932197
for alpha = 0.1
Squared error : 99.40409663061499
for alpha = 1
Squared error : 99.74208489713766
for alpha = 10
Squared error : 97.34381374761597
for alpha = 100
Squared error : 97.34348158444246
For values of best  alpha  =  0.0001 The train squared error is: 92.98299255698304
For values of best  alpha  =  0.0001 The cross validation squared error is: 95.99105299033506
For values of best  alpha  =  0.0001 The test squared error is: 95.4478632277314


In [26]:
noevents_model_xgb = "No Events - RBF XGBRegressor"

params = {
'eta':[0.02, 0.01, 0.1, 0.5],
'max_depth':[4,6,8,10],
'n_estimators':[5,10,20,50,100]
}
reg_age_noevents = xgb.XGBRegressor()
x = vstack((noevents_x_train, noevents_x_cv))
y = np.concatenate([df_noevents_train['age'], df_noevents_cv['age']])
#trying out 30 possible combinations of hyperparameters
rs_reg = RandomizedSearchCV(reg_age_noevents, params, verbose = 2, cv = 2, n_iter = 30, n_jobs=8)
rs_reg.fit(x,y)
best_reg_age_noevents = rs_reg.best_estimator_
predict_y = best_reg_age_noevents.predict(noevents_x_test)
noevents_mse_xgb = mean_squared_error(df_noevents_test['age'], predict_y)
print("The test mean squared error is:",noevents_mse_xgb)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done  60 out of  60 | elapsed:    6.6s finished


The test mean squared error is: 94.2153166085976


## Results for records without events

In [27]:
#summarizing results
from prettytable import PrettyTable

table = PrettyTable()

table.field_names = ["Model", "MSE"]

table.add_row([noevents_model_lrl2, noevents_mse_lrl2])
table.add_row([noevents_model_lrl1, noevents_mse_lrl1])
table.add_row([noevents_model_xgb, noevents_mse_xgb])

print(table)

+------------------------------+-------------------+
|            Model             |        MSE        |
+------------------------------+-------------------+
| No Events - Ridge Regression | 95.27041430554122 |
| No Events - Lasso Regression |  95.4478632277314 |
| No Events - RBF XGBRegressor |  94.2153166085976 |
+------------------------------+-------------------+


## With Events

In [28]:
events_brand_encoder = TfidfVectorizer()

events_brand_train = events_brand_encoder.fit_transform(df_events_train['brand'])
events_brand_cv = events_brand_encoder.transform(df_events_cv['brand'])
events_brand_test = events_brand_encoder.transform(df_events_test['brand'])

In [29]:
events_model_encoder = TfidfVectorizer()

events_model_train = events_model_encoder.fit_transform(df_events_train['model'])
events_model_cv = events_model_encoder.transform(df_events_cv['model'])
events_model_test = events_model_encoder.transform(df_events_test['model'])

In [30]:
events_active_labels_encoder = TfidfVectorizer()

events_active_labels_train = events_active_labels_encoder.fit_transform(df_events_train['active_app_labels'])
events_active_labels_cv = events_active_labels_encoder.transform(df_events_cv['active_app_labels'])
events_active_labels_test = events_active_labels_encoder.transform(df_events_test['active_app_labels'])

In [31]:
events_installed_labels_encoder = TfidfVectorizer()

events_installed_labels_train = events_installed_labels_encoder.fit_transform(df_events_train['installed_app_labels'])
events_installed_labels_cv = events_installed_labels_encoder.transform(df_events_cv['installed_app_labels'])
events_installed_labels_test = events_installed_labels_encoder.transform(df_events_test['installed_app_labels'])

In [32]:
scaler = StandardScaler()

In [33]:
events_lat_train = scaler.fit_transform(df_events_train['mean_latitude'].values.reshape(-1,1))
events_lat_cv = scaler.transform(df_events_cv['mean_latitude'].values.reshape(-1,1))
events_lat_test = scaler.transform(df_events_test['mean_latitude'].values.reshape(-1,1))

In [34]:
events_long_train = scaler.fit_transform(df_events_train['mean_longitude'].values.reshape(-1,1))
events_long_cv = scaler.transform(df_events_cv['mean_longitude'].values.reshape(-1,1))
events_long_test = scaler.transform(df_events_test['mean_longitude'].values.reshape(-1,1))

In [35]:
events_travels_train = scaler.fit_transform(df_events_train['num_travels'].values.reshape(-1,1))
events_travels_cv = scaler.transform(df_events_cv['num_travels'].values.reshape(-1,1))
events_travels_test = scaler.transform(df_events_test['num_travels'].values.reshape(-1,1))

In [36]:
events_x_train = hstack((events_brand_train, events_model_train, events_installed_labels_train, events_active_labels_train,
                         events_lat_train, events_long_train, events_travels_train, df_events_train['location_available'].values.reshape(-1,1), 
                         np.array(df_events_train['activity_hour'].to_list()), np.array(df_events_train['activity_day'].to_list()), 
                         df_events_train['app_usage'].values.reshape(-1,1), df_events_train['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_train['installed_app_counts'].to_list()), np.array(df_events_train['active_app_usage_counts'].to_list()), 
                         np.array(df_events_train['active_app_usage'].to_list())))

In [37]:
events_x_cv = hstack((events_brand_cv, events_model_cv, events_installed_labels_cv, events_active_labels_cv,
                         events_lat_cv, events_long_cv, events_travels_cv, df_events_cv['location_available'].values.reshape(-1,1), 
                         np.array(df_events_cv['activity_hour'].to_list()), np.array(df_events_cv['activity_day'].to_list()), 
                         df_events_cv['app_usage'].values.reshape(-1,1), df_events_cv['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_cv['installed_app_counts'].to_list()), np.array(df_events_cv['active_app_usage_counts'].to_list()), 
                         np.array(df_events_cv['active_app_usage'].to_list())))

In [38]:
events_x_test = hstack((events_brand_test, events_model_test, events_installed_labels_test, events_active_labels_test,
                         events_lat_test, events_long_test, events_travels_test, df_events_test['location_available'].values.reshape(-1,1), 
                         np.array(df_events_test['activity_hour'].to_list()), np.array(df_events_test['activity_day'].to_list()), 
                         df_events_test['app_usage'].values.reshape(-1,1), df_events_test['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_test['installed_app_counts'].to_list()), np.array(df_events_test['active_app_usage_counts'].to_list()), 
                         np.array(df_events_test['active_app_usage'].to_list())))

### Target Variables

In [39]:
events_y_train = df_events_train['age']
events_y_cv = df_events_cv['age']
events_y_test = df_events_test['age']

In [40]:
events_model_lrl2 = "Events - Ridge Regression"

alpha = [10 ** x for x in range(-6, 3)]

events_mse_lrl2 = hyperparameter_tuning("lr-l2", alpha, "alpha", events_x_train, events_y_train, events_x_cv, events_y_cv, events_x_test, events_y_test)

for alpha = 1e-06
Squared error : 81.97201095961623
for alpha = 1e-05
Squared error : 81.68781027898974
for alpha = 0.0001
Squared error : 81.13223551783504
for alpha = 0.001
Squared error : 80.3038834034991
for alpha = 0.01
Squared error : 82.65088112082486
for alpha = 0.1
Squared error : 88.73556225646409
for alpha = 1
Squared error : 91.97050104339642
for alpha = 10
Squared error : 96.11970593589973
for alpha = 100
Squared error : 96.87325591020655
For values of best  alpha  =  0.001 The train squared error is: 69.48994999180032
For values of best  alpha  =  0.001 The cross validation squared error is: 80.33694141827215
For values of best  alpha  =  0.001 The test squared error is: 77.44646388606094


In [41]:
events_model_lrl1 = "Events - Lasso Regression"

alpha = [10 ** x for x in range(-6, 3)]

events_mse_lrl1 = hyperparameter_tuning("lr-l1", alpha, "alpha", events_x_train, events_y_train, events_x_cv, events_y_cv, events_x_test, events_y_test)

for alpha = 1e-06
Squared error : 81.49304104650524
for alpha = 1e-05
Squared error : 81.56764432200532
for alpha = 0.0001
Squared error : 81.14212712449111
for alpha = 0.001
Squared error : 80.11998843787917
for alpha = 0.01
Squared error : 80.90649734195249
for alpha = 0.1
Squared error : 91.7375661812818
for alpha = 1
Squared error : 97.01301798594282
for alpha = 10
Squared error : 97.0238551575374
for alpha = 100
Squared error : 97.0226677938974
For values of best  alpha  =  0.001 The train squared error is: 69.00880045438521
For values of best  alpha  =  0.001 The cross validation squared error is: 80.14887723398519
For values of best  alpha  =  0.001 The test squared error is: 77.29103567323254


In [42]:
events_model_xgb = "Events - RBF XGBRegressor"

params = {
'eta':[0.02, 0.01, 0.1, 0.5],
'max_depth':[4,6,8,10],
'n_estimators':[5,10,20,50,100]
}
reg_age_events = xgb.XGBRegressor()
x = vstack((events_x_train, events_x_cv))
y = np.concatenate([df_events_train['age'], df_events_cv['age']])
#trying out 30 possible combinations of hyperparameters
rs_reg = RandomizedSearchCV(reg_age_events, params, verbose = 2, cv = 2, n_iter = 30, n_jobs=8)
rs_reg.fit(x,y)
best_reg_age_events = rs_reg.best_estimator_
predict_y = best_reg_age_events.predict(events_x_test)
events_mse_xgb = mean_squared_error(df_events_test['age'], predict_y)
print("The test mean squared error is:",events_mse_xgb)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  3.2min
[Parallel(n_jobs=8)]: Done  60 out of  60 | elapsed:  4.9min finished


The test mean squared error is: 72.41928931227304


## Results for records with events

In [43]:
#summarizing results
from prettytable import PrettyTable

table = PrettyTable()

table.field_names = ["Model", "MSE"]

table.add_row([events_model_lrl2, events_mse_lrl2])
table.add_row([events_model_lrl1, events_mse_lrl1])
table.add_row([events_model_xgb, events_mse_xgb])

print(table)

+---------------------------+-------------------+
|           Model           |        MSE        |
+---------------------------+-------------------+
| Events - Ridge Regression | 77.44646388606094 |
| Events - Lasso Regression | 77.29103567323254 |
| Events - RBF XGBRegressor | 72.41928931227304 |
+---------------------------+-------------------+


## Including device specifications

In [44]:
scaler = StandardScaler()

In [45]:
events_screen_train = scaler.fit_transform(df_events_train['screen_size'].values.reshape(-1,1))
events_screen_cv = scaler.transform(df_events_cv['screen_size'].values.reshape(-1,1))
events_screen_test = scaler.transform(df_events_test['screen_size'].values.reshape(-1,1))

In [46]:
events_ram_train = scaler.fit_transform(df_events_train['ram_gb'].values.reshape(-1,1))
events_ram_cv = scaler.transform(df_events_cv['ram_gb'].values.reshape(-1,1))
events_ram_test = scaler.transform(df_events_test['ram_gb'].values.reshape(-1,1))

In [47]:
events_camera_train = scaler.fit_transform(df_events_train['camera'].values.reshape(-1,1))
events_camera_cv = scaler.transform(df_events_cv['camera'].values.reshape(-1,1))
events_camera_test = scaler.transform(df_events_test['camera'].values.reshape(-1,1))

In [48]:
events_release_train = scaler.fit_transform(df_events_train['release_bin'].values.reshape(-1,1))
events_release_cv = scaler.transform(df_events_cv['release_bin'].values.reshape(-1,1))
events_release_test = scaler.transform(df_events_test['release_bin'].values.reshape(-1,1))

In [49]:
events_x_train = hstack((events_brand_train, events_model_train, events_installed_labels_train, events_active_labels_train,
                         events_lat_train, events_long_train, events_travels_train, df_events_train['location_available'].values.reshape(-1,1), 
                         np.array(df_events_train['activity_hour'].to_list()), np.array(df_events_train['activity_day'].to_list()), 
                         df_events_train['app_usage'].values.reshape(-1,1), df_events_train['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_train['installed_app_counts'].to_list()), np.array(df_events_train['active_app_usage_counts'].to_list()), 
                         np.array(df_events_train['active_app_usage'].to_list()), events_screen_train, events_ram_train,
                         events_camera_train, events_release_train, df_events_train['specs_available'].values.reshape(-1,1)))

In [50]:
events_x_cv = hstack((events_brand_cv, events_model_cv, events_installed_labels_cv, events_active_labels_cv,
                         events_lat_cv, events_long_cv, events_travels_cv, df_events_cv['location_available'].values.reshape(-1,1), 
                         np.array(df_events_cv['activity_hour'].to_list()), np.array(df_events_cv['activity_day'].to_list()), 
                         df_events_cv['app_usage'].values.reshape(-1,1), df_events_cv['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_cv['installed_app_counts'].to_list()), np.array(df_events_cv['active_app_usage_counts'].to_list()), 
                         np.array(df_events_cv['active_app_usage'].to_list()), events_screen_cv, events_ram_cv,
                         events_camera_cv, events_release_cv, df_events_cv['specs_available'].values.reshape(-1,1)))

In [51]:
events_x_test = hstack((events_brand_test, events_model_test, events_installed_labels_test, events_active_labels_test,
                         events_lat_test, events_long_test, events_travels_test, df_events_test['location_available'].values.reshape(-1,1), 
                         np.array(df_events_test['activity_hour'].to_list()), np.array(df_events_test['activity_day'].to_list()), 
                         df_events_test['app_usage'].values.reshape(-1,1), df_events_test['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_test['installed_app_counts'].to_list()), np.array(df_events_test['active_app_usage_counts'].to_list()), 
                         np.array(df_events_test['active_app_usage'].to_list()), events_screen_test, events_ram_test,
                         events_camera_test, events_release_test, df_events_test['specs_available'].values.reshape(-1,1)))

In [52]:
events_model_lrl2 = "Events - Ridge Regression"

alpha = [10 ** x for x in range(-6, 3)]

events_mse_lrl2 = hyperparameter_tuning("lr-l2", alpha, "alpha", events_x_train, events_y_train, events_x_cv, events_y_cv, events_x_test, events_y_test)

for alpha = 1e-06
Squared error : 80.91990292767196
for alpha = 1e-05
Squared error : 80.78643286011881
for alpha = 0.0001
Squared error : 80.8562210444454
for alpha = 0.001
Squared error : 79.91564603481207
for alpha = 0.01
Squared error : 81.23092823365289
for alpha = 0.1
Squared error : 89.36693261048036
for alpha = 1
Squared error : 91.0143459893718
for alpha = 10
Squared error : 95.55084852391376
for alpha = 100
Squared error : 96.81708829741255
For values of best  alpha  =  0.001 The train squared error is: 68.89550593293215
For values of best  alpha  =  0.001 The cross validation squared error is: 79.84256852391462
For values of best  alpha  =  0.001 The test squared error is: 76.87472817283391


In [53]:
events_model_lrl1 = "Events - Lasso Regression"

alpha = [10 ** x for x in range(-6, 3)]

events_mse_lrl1 = hyperparameter_tuning("lr-l1", alpha, "alpha", events_x_train, events_y_train, events_x_cv, events_y_cv, events_x_test, events_y_test)

for alpha = 1e-06
Squared error : 80.92866315020876
for alpha = 1e-05
Squared error : 81.07936663908835
for alpha = 0.0001
Squared error : 80.6223811086326
for alpha = 0.001
Squared error : 79.76426568547355
for alpha = 0.01
Squared error : 80.96990997598805
for alpha = 0.1
Squared error : 93.25368990686066
for alpha = 1
Squared error : 103.06126515849023
for alpha = 10
Squared error : 97.02115060232052
for alpha = 100
Squared error : 97.02251024550876
For values of best  alpha  =  0.001 The train squared error is: 68.68584252086916
For values of best  alpha  =  0.001 The cross validation squared error is: 79.84549816886901
For values of best  alpha  =  0.001 The test squared error is: 76.67542165571683


In [54]:
events_model_xgb = "Events - RBF XGBRegressor"

params = {
'eta':[0.02, 0.01, 0.1, 0.5],
'max_depth':[4,6,8,10],
'n_estimators':[5,10,20,50,100]
}
reg_age_events = xgb.XGBRegressor()
x = vstack((events_x_train, events_x_cv))
y = np.concatenate([df_events_train['age'], df_events_cv['age']])
#trying out 30 possible combinations of hyperparameters
rs_reg = RandomizedSearchCV(reg_age_events, params, verbose = 2, cv = 2, n_iter = 30, n_jobs=8)
rs_reg.fit(x,y)
best_reg_age_events = rs_reg.best_estimator_
predict_y = best_reg_age_events.predict(events_x_test)
events_mse_xgb = mean_squared_error(df_events_test['age'], predict_y)
print("The test mean squared error is:",events_mse_xgb)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done  60 out of  60 | elapsed:  5.4min finished


The test mean squared error is: 71.39284904074094


## Results for records with events

In [55]:
#summarizing results
from prettytable import PrettyTable

table = PrettyTable()

table.field_names = ["Model", "MSE"]

table.add_row([events_model_lrl2, events_mse_lrl2])
table.add_row([events_model_lrl1, events_mse_lrl1])
table.add_row([events_model_xgb, events_mse_xgb])

print(table)

+---------------------------+-------------------+
|           Model           |        MSE        |
+---------------------------+-------------------+
| Events - Ridge Regression | 76.87472817283391 |
| Events - Lasso Regression | 76.67542165571683 |
| Events - RBF XGBRegressor | 71.39284904074094 |
+---------------------------+-------------------+


## Adding Gender Predictions as a feature

In [56]:
noevents_x_train = hstack((noevents_brand_train, noevents_model_train))
noevents_x_cv = hstack((noevents_brand_cv, noevents_model_cv))
noevents_x_test = hstack((noevents_brand_test, noevents_model_test))

In [57]:
noevents_model_xgb = "No Events - RBF XGBRegressor"

params = {
'eta':[0.02, 0.01, 0.1, 0.5],
'max_depth':[4,6,8,10],
'n_estimators':[5,10,20,50,100]
}
reg_age_noevents = xgb.XGBRegressor()
x = vstack((noevents_x_train, noevents_x_cv))
y = np.concatenate([df_noevents_train['age'], df_noevents_cv['age']])
#trying out 30 possible combinations of hyperparameters
rs_reg = RandomizedSearchCV(reg_age_noevents, params, verbose = 2, cv = 2, n_iter = 30, n_jobs=8)
rs_reg.fit(x,y)
best_reg_age_noevents = rs_reg.best_estimator_
predict_y = best_reg_age_noevents.predict(noevents_x_test)
noevents_mse_xgb = mean_squared_error(df_noevents_test['age'], predict_y)
print("The test mean squared error is:",noevents_mse_xgb)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=8)]: Done  45 out of  60 | elapsed:    4.4s remaining:    1.4s


The test mean squared error is: 94.49816966472521


[Parallel(n_jobs=8)]: Done  60 out of  60 | elapsed:    5.0s finished


In [58]:
events_x_train = hstack((events_brand_train, events_model_train, events_installed_labels_train, events_active_labels_train,
                         events_lat_train, events_long_train, events_travels_train, df_events_train['location_available'].values.reshape(-1,1), 
                         np.array(df_events_train['activity_hour'].to_list()), np.array(df_events_train['activity_day'].to_list()), 
                         df_events_train['app_usage'].values.reshape(-1,1), df_events_train['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_train['installed_app_counts'].to_list()), np.array(df_events_train['active_app_usage_counts'].to_list()), 
                         np.array(df_events_train['active_app_usage'].to_list())))

In [59]:
events_x_cv = hstack((events_brand_cv, events_model_cv, events_installed_labels_cv, events_active_labels_cv,
                         events_lat_cv, events_long_cv, events_travels_cv, df_events_cv['location_available'].values.reshape(-1,1), 
                         np.array(df_events_cv['activity_hour'].to_list()), np.array(df_events_cv['activity_day'].to_list()), 
                         df_events_cv['app_usage'].values.reshape(-1,1), df_events_cv['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_cv['installed_app_counts'].to_list()), np.array(df_events_cv['active_app_usage_counts'].to_list()), 
                         np.array(df_events_cv['active_app_usage'].to_list())))

In [60]:
events_x_test = hstack((events_brand_test, events_model_test, events_installed_labels_test, events_active_labels_test,
                         events_lat_test, events_long_test, events_travels_test, df_events_test['location_available'].values.reshape(-1,1), 
                         np.array(df_events_test['activity_hour'].to_list()), np.array(df_events_test['activity_day'].to_list()), 
                         df_events_test['app_usage'].values.reshape(-1,1), df_events_test['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_events_test['installed_app_counts'].to_list()), np.array(df_events_test['active_app_usage_counts'].to_list()), 
                         np.array(df_events_test['active_app_usage'].to_list())))

In [61]:
events_model_xgb = "Events - RBF XGBRegressor"

params = {
'eta':[0.02, 0.01, 0.1, 0.5],
'max_depth':[4,6,8,10],
'n_estimators':[5,10,20,50,100]
}
reg_age_events = xgb.XGBRegressor()
x = vstack((events_x_train, events_x_cv))
y = np.concatenate([df_events_train['age'], df_events_cv['age']])
#trying out 30 possible combinations of hyperparameters
rs_reg = RandomizedSearchCV(reg_age_events, params, verbose = 2, cv = 2, n_iter = 30, n_jobs=8)
rs_reg.fit(x,y)
best_reg_age_events = rs_reg.best_estimator_
predict_y = best_reg_age_events.predict(events_x_test)
events_mse_xgb = mean_squared_error(df_events_test['age'], predict_y)
print("The test mean squared error is:",events_mse_xgb)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done  60 out of  60 | elapsed:  5.9min finished


The test mean squared error is: 73.72729043354077


In [83]:
df_train_noevents = pd.read_pickle("dataset/gender_predictions/df_train_noevents.pkl")
df_train_events = pd.read_pickle("dataset/gender_predictions/df_train_events.pkl")
df_test_noevents = pd.read_pickle("dataset/gender_predictions/df_test_noevents.pkl")
df_test_events = pd.read_pickle("dataset/gender_predictions/df_test_events.pkl")

In [84]:
df_train_events.loc[df_train_events['app_usage'].isna(),'app_usage']=0
df_train_events.loc[df_train_events['app_usage_session'].isna(),'app_usage_session']=0

In [85]:
df_test_events.loc[df_test_events['app_usage'].isna(),'app_usage']=0
df_test_events.loc[df_test_events['app_usage_session'].isna(),'app_usage_session']=0

### No Events

In [86]:
noevents_brand_train = noevents_brand_encoder.transform(df_train_noevents['brand'])
noevents_brand_test = noevents_brand_encoder.transform(df_test_noevents['brand'])

In [87]:
noevents_model_train = noevents_model_encoder.transform(df_train_noevents['model'])
noevents_model_test = noevents_model_encoder.transform(df_test_noevents['model'])

In [88]:
noevents_x_train = hstack((noevents_brand_train, noevents_model_train))
noevents_x_test = hstack((noevents_brand_test, noevents_model_test))

In [89]:
age_pred_train = best_reg_age_noevents.predict(noevents_x_train)
age_pred_test = best_reg_age_noevents.predict(noevents_x_test)

In [90]:
df_train_noevents['pred_age'] = age_pred_train
df_test_noevents['pred_age'] = age_pred_test

### Events

In [91]:
events_brand_train = events_brand_encoder.transform(df_train_events['brand'])
events_brand_test = events_brand_encoder.transform(df_test_events['brand'])

In [92]:
events_model_train = events_model_encoder.transform(df_train_events['model'])
events_model_test = events_model_encoder.transform(df_test_events['model'])

In [93]:
events_active_labels_train = events_active_labels_encoder.transform(df_train_events['active_app_labels'])
events_active_labels_test = events_active_labels_encoder.transform(df_test_events['active_app_labels'])

In [94]:
events_installed_labels_train = events_installed_labels_encoder.transform(df_train_events['installed_app_labels'])
events_installed_labels_test = events_installed_labels_encoder.transform(df_test_events['installed_app_labels'])

In [95]:
scaler = StandardScaler()

In [96]:
events_lat_train = scaler.fit_transform(df_train_events['mean_latitude'].values.reshape(-1,1))
events_lat_test = scaler.transform(df_test_events['mean_latitude'].values.reshape(-1,1))

In [97]:
events_long_train = scaler.fit_transform(df_train_events['mean_longitude'].values.reshape(-1,1))
events_long_test = scaler.transform(df_test_events['mean_longitude'].values.reshape(-1,1))

In [98]:
events_travels_train = scaler.fit_transform(df_train_events['num_travels'].values.reshape(-1,1))
events_travels_test = scaler.transform(df_test_events['num_travels'].values.reshape(-1,1))

In [99]:
events_x_train = hstack((events_brand_train, events_model_train, events_installed_labels_train, events_active_labels_train,
                         events_lat_train, events_long_train, events_travels_train, df_train_events['location_available'].values.reshape(-1,1), 
                         np.array(df_train_events['activity_hour'].to_list()), np.array(df_train_events['activity_day'].to_list()), 
                         df_train_events['app_usage'].values.reshape(-1,1), df_train_events['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_train_events['installed_app_counts'].to_list()), np.array(df_train_events['active_app_usage_counts'].to_list()), 
                         np.array(df_train_events['active_app_usage'].to_list())))

In [100]:
events_x_test = hstack((events_brand_test, events_model_test, events_installed_labels_test, events_active_labels_test,
                         events_lat_test, events_long_test, events_travels_test, df_test_events['location_available'].values.reshape(-1,1), 
                         np.array(df_test_events['activity_hour'].to_list()), np.array(df_test_events['activity_day'].to_list()), 
                         df_test_events['app_usage'].values.reshape(-1,1), df_test_events['app_usage_session'].values.reshape(-1,1), 
                         np.array(df_test_events['installed_app_counts'].to_list()), np.array(df_test_events['active_app_usage_counts'].to_list()), 
                         np.array(df_test_events['active_app_usage'].to_list())))

In [101]:
age_pred_train = best_reg_age_events.predict(events_x_train)
age_pred_test = best_reg_age_events.predict(events_x_test)

In [102]:
df_train_events['pred_age'] = age_pred_train
df_test_events['pred_age'] = age_pred_test

In [103]:
print(df_train_events.head())

              device_id    brand          model gender  age   group  \
4  -5827952925479472594   xiaomi    mi one plus      M   30  M29-31   
8    773248989809697210     vivo          x5pro      M   20    M22-   
14 -5913071468598874323  samsung          s7566      M   40    M39+   
30   375031242916141301     sony       xperia z      M   32  M32-38   
36 -1025284795963440768  samsung  galaxy note 2      M   56    M39+   

    app_usage  app_usage_session  \
4    0.390244           0.167480   
8    0.607143           0.084127   
14   1.000000           0.660131   
30   0.945946           0.485220   
36   0.521739           0.521739   

                              active_app_usage_counts  \
4   [0.049575070821529746, 0.0, 0.0750708215297450...   
8   [0.11339962121212122, 0.010179924242424242, 0....   
14  [0.09965635738831616, 0.0, 0.08132875143184422...   
30  [0.07363304981773998, 0.012393681652490888, 0....   
36  [0.08, 0.013333333333333334, 0.146666666666666...   

             

In [107]:
df_train_noevents.to_pickle("dataset/age_predictions/df_train_noevents.pkl")
df_train_events.to_pickle("dataset/age_predictions/df_train_events.pkl")
df_test_noevents.to_pickle("dataset/age_predictions/df_test_noevents.pkl")
df_test_events.to_pickle("dataset/age_predictions/df_test_events.pkl")