In [1]:
import pandas as pd 
import numpy as np 
import  seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
filename = 'data/RF_lag_1_original'

In [3]:
bigdf = pd.read_csv(filename, sep = "\t")
print(bigdf.shape)
bigdf.head(n=3)

(1268, 18)


Unnamed: 0,id,time,mood_next_day,screen,activity,total_app_time,circumplex.arousal,circumplex.valence,call_sms,mood,is_weekday,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,AS14.01,2014-02-25,6.25,12008.029778,0.091284,11261.123467,-0.242963,0.702222,3.0,7.057778,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,AS14.01,2014-02-26,6.333333,12008.029778,0.091284,11261.123467,-0.25,0.75,3.0,6.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,AS14.01,2014-03-20,6.2,2275.944,0.081548,3608.214,-0.242963,0.702222,1.0,7.057778,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
#Globalmodel 

In [5]:
# list of columns you want to drop from x matrix ( this is just example) always call df as df 
dropcols = ['id', 'time', 'mood_next_day'] 
target = 'mood_next_day' # target you want to predict 
test_size = 0.2 # 30 % data as test set 

In [6]:
# Define your hyperparameters
n_estimaters = [int(x) for x in np.linspace(start = 10 , stop = 100 , num = 10)]
max_depth = [2,3,4]
max_features = ['auto', 'sqrt']

#set parameter grid for hyper parameter tuning 
param_grid = {"n_estimators" : n_estimaters,
             "max_depth" : max_depth,
             "max_features" : max_features}
print(param_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_depth': [2, 3, 4], 'max_features': ['auto', 'sqrt']}


In [7]:

# these function is access within rfmodel but you can use it separatetly
#train test split 
def get_train_test(df, dropcols, target, test_size):
    """this function will take raw dataframe 
    and gives you train and test for model building """
    X = df.drop(dropcols, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=10)
    return X_train, X_test, y_train, y_test 

#parameter tuning 

def param_tuning(X_train, X_test, y_train, y_test):
    """ this function will return best parameters"""
    rfr = RandomForestRegressor() # initialize model with no parameters 
    rf_tune = RandomizedSearchCV(rfr, param_distributions=param_grid,
                              cv = 5, verbose =2 , n_jobs=4)
    rf_tune.fit(X_train, y_train)
    print(f'Train Accuracy - : {rf_tune.score(X_train, y_train):.3f}')
    print(f'Test Accuracy - : {rf_tune.score(X_test, y_test):.3f}')
    return rf_tune.best_params_
#how to run 
#best_param = param_tuning(X_train, X_test, y_train, y_test)

# final model you should run this command only
def rfmodel():
    '''run rf model and hyperparameter'''
    X_train, X_test, y_train, y_test = get_train_test(df=df,
                                                      dropcols =dropcols,
                                                      target = target,
                                                      test_size = test_size)
    best_params = param_tuning(X_train, X_test, y_train, y_test)
    
    #fit best model 
    n_estimator = best_params['n_estimators']
    max_feature = best_params['max_features']
    max_dept = best_params['max_depth']
    bestmodel = RandomForestRegressor(n_estimators=n_estimator,
                                      max_features=max_feature,
                                      max_depth = max_dept,
                                      random_state = 10)
    bestmodel.fit(X_train, y_train)
    y_pred = bestmodel.predict(X_test)
    #get performance stats 
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    #get feature importance 
    feat_importance = pd.Series(bestmodel.feature_importances_, 
                                 index=X_train.columns)
    #feat_importance.nlargest(X_train.columns).plot(kind='barh', color = 'darkblue')
    
    return [bestmodel, mse, r2, feat_importance] # this return list of factors

 # how to run results = rfmodel()   # access model results[0], access mse = results[1]

# DO quick check

#### Run global model 

In [8]:
#global model quick check 
df = bigdf.copy()
results = rfmodel()

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    7.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    8.2s finished


Train Accuracy - : 0.243
Test Accuracy - : 0.187


In [9]:
#global results 
results # access mse = results[1]

[RandomForestRegressor(max_depth=2, n_estimators=60, random_state=10),
 0.49993319798291747,
 0.1901588713682728,
 screen                0.003103
 activity              0.002695
 total_app_time        0.005317
 circumplex.arousal    0.072416
 circumplex.valence    0.000000
 call_sms              0.008760
 mood                  0.904231
 is_weekday            0.000000
 Friday                0.003478
 Monday                0.000000
 Saturday              0.000000
 Sunday                0.000000
 Thursday              0.000000
 Tuesday               0.000000
 Wednesday             0.000000
 dtype: float64]

##### Run Individual model 

In [10]:
#Individual level model 
# quick check ( per person model )
mse = {}
for ids in bigdf.id.value_counts().index:
    print(ids)
    df = bigdf[bigdf.id==ids]
    results = rfmodel()
    mse[ids] = results[1]
    

pd.DataFrame(list(mse.items()),columns = ['id','mse'])['mse'].mean()

AS14.26
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.9s finished


Train Accuracy - : 0.441
Test Accuracy - : 0.439
AS14.08
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  43 out of  50 | elapsed:    2.5s remaining:    0.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.7s finished


Train Accuracy - : 0.542
Test Accuracy - : -0.287
AS14.17
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.353
Test Accuracy - : 0.348
AS14.15
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.576
Test Accuracy - : 0.169
AS14.24
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  43 out of  50 | elapsed:    2.0s remaining:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.1s finished


Train Accuracy - : 0.366
Test Accuracy - : -0.319
AS14.05
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.1s finished


Train Accuracy - : 0.860
Test Accuracy - : -0.034
AS14.16
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    4.3s finished


Train Accuracy - : 0.788
Test Accuracy - : 0.246
AS14.13
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    3.8s finished


Train Accuracy - : 0.740
Test Accuracy - : -0.469
AS14.07
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.336
Test Accuracy - : 0.045
AS14.03
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.676
Test Accuracy - : 0.098
AS14.20
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.509
Test Accuracy - : -0.484
AS14.30
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.502
Test Accuracy - : -0.290
AS14.01
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished


Train Accuracy - : 0.488
Test Accuracy - : 0.591
AS14.09
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  43 out of  50 | elapsed:    1.8s remaining:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.726
Test Accuracy - : -0.032
AS14.19
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.779
Test Accuracy - : -0.527
AS14.06
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.2s finished


Train Accuracy - : 0.521
Test Accuracy - : -0.349
AS14.14
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.5s finished


Train Accuracy - : 0.575
Test Accuracy - : -2.470
AS14.33
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished


Train Accuracy - : 0.672
Test Accuracy - : 0.060
AS14.31
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.482
Test Accuracy - : -0.503
AS14.29
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.3s finished


Train Accuracy - : 0.653
Test Accuracy - : -1.077
AS14.27
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.6s finished


Train Accuracy - : 0.633
Test Accuracy - : -0.240
AS14.02
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.6s finished


Train Accuracy - : 0.735
Test Accuracy - : 0.201
AS14.12
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.363
Test Accuracy - : -0.185
AS14.23
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Train Accuracy - : 0.730
Test Accuracy - : 0.056
AS14.28
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    4.3s finished


Train Accuracy - : 0.605
Test Accuracy - : -1.626
AS14.32
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  43 out of  50 | elapsed:    2.3s remaining:    0.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished


Train Accuracy - : 0.359
Test Accuracy - : 0.030
AS14.25
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished


Train Accuracy - : 0.692
Test Accuracy - : -1.883


0.3734146741310379