In [1]:
import pandas as pd 
import numpy as np 
import  seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
filename = 'data/RF_lag_3_original'

In [3]:
bigdf = pd.read_csv(filename, sep = "\t")
print(bigdf.shape)
bigdf.head(n=3)

(1267, 11)


Unnamed: 0,id,time,mood_next_day,screen,activity,total_app_time,circumplex.arousal,circumplex.valence,call_sms,mood,is_weekday
0,AS14.01,2014-02-25,6.25,11883.724018,0.091654,11060.902622,-0.232593,0.685556,2.333333,7.021481,0.666667
1,AS14.01,2014-02-26,6.333333,11883.724018,0.091654,11060.902622,-0.25,0.75,3.0,6.25,0.666667
2,AS14.01,2014-03-20,6.2,2275.944,0.081548,1202.738,-0.232593,0.685556,1.666667,7.021481,1.0


In [4]:
#Globalmodel 

In [5]:
# list of columns you want to drop from x matrix ( this is just example) always call df as df 
dropcols = ['id', 'time', 'mood_next_day'] 
target = 'mood_next_day' # target you want to predict 
test_size = 0.3 # 30 % data as test set 

In [6]:
# Define your hyperparameters
n_estimaters = [int(x) for x in np.linspace(start = 10 , stop = 100 , num = 10)]
max_depth = [2,3,4]
max_features = ['auto', 'sqrt']

#set parameter grid for hyper parameter tuning 
param_grid = {"n_estimators" : n_estimaters,
             "max_depth" : max_depth,
             "max_features" : max_features}
print(param_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_depth': [2, 3, 4], 'max_features': ['auto', 'sqrt']}


In [7]:

# these function is access within rfmodel but you can use it separatetly
#train test split 
def get_train_test(df, dropcols, target, test_size):
    """this function will take raw dataframe 
    and gives you train and test for model building """
    X = df.drop(dropcols, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=10)
    return X_train, X_test, y_train, y_test 

#parameter tuning 

def param_tuning(X_train, X_test, y_train, y_test):
    """ this function will return best parameters"""
    rfr = RandomForestRegressor() # initialize model with no parameters 
    rf_tune = RandomizedSearchCV(rfr, param_distributions=param_grid,
                              cv = 5, verbose =2 , n_jobs=4)
    rf_tune.fit(X_train, y_train)
    print(f'Train Accuracy - : {rf_tune.score(X_train, y_train):.3f}')
    print(f'Test Accuracy - : {rf_tune.score(X_test, y_test):.3f}')
    return rf_tune.best_params_
#how to run 
#best_param = param_tuning(X_train, X_test, y_train, y_test)

# final model you should run this command only
def rfmodel():
    '''run rf model and hyperparameter'''
    X_train, X_test, y_train, y_test = get_train_test(df=df,
                                                      dropcols =dropcols,
                                                      target = target,
                                                      test_size = test_size)
    best_params = param_tuning(X_train, X_test, y_train, y_test)
    
    #fit best model 
    n_estimator = best_params['n_estimators']
    max_feature = best_params['max_features']
    max_dept = best_params['max_depth']
    bestmodel = RandomForestRegressor(n_estimators=n_estimator,
                                      max_features=max_feature,
                                      max_depth = max_dept,
                                      random_state = 10)
    bestmodel.fit(X_train, y_train)
    y_pred = bestmodel.predict(X_test)
    #get performance stats 
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    #get feature importance 
    feat_importance = pd.Series(bestmodel.feature_importances_, 
                                 index=X_train.columns)
    #feat_importance.nlargest(X_train.columns).plot(kind='barh', color = 'darkblue')
    
    return [bestmodel, mse, r2, feat_importance] # this return list of factors

 # how to run results = rfmodel()   # access model results[0], access mse = results[1]

# DO quick check

#### Run global model 

In [None]:
#global model quick check 
df = bigdf.copy()
results = rfmodel()

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
#global results 
results # access mse = results[1]

##### Run Individual model 

In [None]:
#Individual level model 
# quick check ( per person model )
mse = {}
for ids in bigdf.id.value_counts().index:
    print(ids)
    df = bigdf[bigdf.id==ids]
    results = rfmodel()
    mse[ids] = results[1]
    

pd.DataFrame(list(mse.items()),columns = ['id','mse'])['mse'].mean()