## DT

In [46]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from pathlib import Path
import csv
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# import lightgbm as lgb

from sklearn.metrics import (precision_score, recall_score, roc_auc_score, accuracy_score, mean_squared_error,
                             confusion_matrix, precision_recall_curve, roc_curve, brier_score_loss)

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [51]:
data_dir = "/home/asim/takshshila/IOT/Datasets/setup-stuff/gateway_and_dataset"
result_dir = "/home/asim/takshshila/IOT/Datasets/setup-stuff/gateway_and_dataset/Results/DT"
datasets = ["Instant_Liking"]

#x_column_drop = {"Instant_Liking": ['q8.2', 'q8.8', 'q8.12', 'q8.20']}

x_column_drop = {"Instant_Liking": []}

target_column = {"Instant_Liking": ['Instant.Liking']}

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

In [52]:
def DT_pred_tuned(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    # Build models with hyperparameters sets
    RSC = RandomizedSearchCV(
        estimator=DecisionTreeRegressor(),
        param_distributions={
            'criterion': ['squared_error'],
            'max_depth': range(1, 100, 10),
            'max_features': ['auto', 'sqrt', 'log2']}, 
        cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose = True)
    
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y.values.ravel())
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build models with optimized hyperparameters
    model_DT = DecisionTreeRegressor(
        criterion=search_result.best_params_["criterion"],
        max_depth=search_result.best_params_["max_depth"],
        max_features=search_result.best_params_["max_features"])

    model_DT.fit(X, y.values.ravel())
    train_pred = model_DT.predict(X)
    y_pred = model_DT.predict(X_validation)
    
    train_rmse = mean_squared_error(y, train_pred, squared=False)
    
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)

    return model_DT

In [53]:
def DT_pred_default(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
   
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]

    # Build models with optimized hyperparameters
    model_DT = DecisionTreeRegressor()

    model_DT.fit(X, y.values.ravel())
    train_pred = model_DT.predict(X)
    y_pred = model_DT.predict(X_validation)
    
    train_rmse = mean_squared_error(y, train_pred, squared=False)
    
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)

    return model_DT

In [55]:
for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
        
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
#         print(seed)
#         print("test_", X_test.columns[X_test.isna().any()].tolist(), X_test.columns)
        
#         df_train = pd.read_csv(csv_path_train)
#         df_val = pd.read_csv(csv_path_validation)
        
#         #print(seed)
#         print("train_", df_train.columns[df_train.isna().any()].tolist(), df_train.columns)
#         print("val", df_val.columns[df_val.isna().any()].tolist(), df_val.columns)
        
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        # For DT
        model_DT = DT_pred_tuned(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
        importances = model_DT.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
        y_pred = model_DT.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        #filepath = Path(dataset + '_DT_tuned.csv')
        filepath = Path(result_dir, dataset, "{}_DT_tuned_.csv".format(dataset))
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

  0%|                                                    | 0/10 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 20%|████████▊                                   | 2/10 [00:01<00:04,  1.70it/s]

Best using:  {'max_features': 'auto', 'max_depth': 41, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best using:  {'max_features': 'auto', 'max_depth': 41, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits


 40%|█████████████████▌                          | 4/10 [00:01<00:01,  3.61it/s]

Best using:  {'max_features': 'auto', 'max_depth': 11, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best using:  {'max_features': 'auto', 'max_depth': 11, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits


 60%|██████████████████████████▍                 | 6/10 [00:01<00:00,  5.23it/s]

Best using:  {'max_features': 'auto', 'max_depth': 91, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best using:  {'max_features': 'auto', 'max_depth': 31, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits


 80%|███████████████████████████████████▏        | 8/10 [00:02<00:00,  6.29it/s]

Best using:  {'max_features': 'auto', 'max_depth': 41, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best using:  {'max_features': 'auto', 'max_depth': 31, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits


100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  4.17it/s]

Best using:  {'max_features': 'auto', 'max_depth': 81, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best using:  {'max_features': 'auto', 'max_depth': 11, 'criterion': 'squared_error'} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0





In [56]:

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
            
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        # For DT
        model_DT = DT_pred_default(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
        importances = model_DT.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
        y_pred = model_DT.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        #filepath = Path(dataset + '_DT_default.csv')
        filepath = Path(result_dir, dataset,"{}_DT_default.csv".format(dataset))
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

Instant_Liking


 40%|█████████████████▌                          | 4/10 [00:00<00:00, 33.04it/s]

Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0


100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 32.47it/s]

Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0





## GB

In [57]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from pathlib import Path
import csv
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# import lightgbm as lgb

from sklearn.metrics import (precision_score, recall_score, roc_auc_score, accuracy_score, mean_squared_error,
                             confusion_matrix, precision_recall_curve, roc_curve, brier_score_loss)

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [58]:
data_dir = "/home/asim/takshshila/IOT/Datasets/setup-stuff/gateway_and_dataset"
result_dir = "/home/asim/takshshila/IOT/Datasets/setup-stuff/gateway_and_dataset/Results/GB"
datasets = ["Instant_Liking"]

#x_column_drop = {"Instant_Liking": ['q8.2', 'q8.8', 'q8.12', 'q8.20']}
x_column_drop = {"Instant_Liking": []}

target_column = {"Instant_Liking": ['Instant.Liking']}

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

In [59]:
def GB_pred_tuned(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    # Build models with hyperparameters sets
    RSC = RandomizedSearchCV(
        estimator=GradientBoostingRegressor(loss='squared_error'),
        param_distributions={
            'n_estimators': range(1, 200, 10),
            'max_depth': range(1, 100, 10),
            'max_features': ['auto', 'sqrt', 'log2']
        },
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )
    
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y.values.ravel())
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build models with optimized hyperparameters
    model_GB = GradientBoostingRegressor(
        n_estimators=search_result.best_params_["n_estimators"],
        max_depth=search_result.best_params_["max_depth"],
        max_features=search_result.best_params_["max_features"],
        loss='squared_error'
    )
    
    # for train, test in kf.split(X):
    model_GB.fit(X, y.values.ravel())
    train_pred = model_GB.predict(X)
    y_pred = model_GB.predict(X_validation)
    
    train_rmse = mean_squared_error(y, train_pred, squared=False)
    
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)

    return model_GB

In [60]:
def GB_pred_default(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    model_GB = GradientBoostingRegressor(loss='squared_error')
    
    # for train, test in kf.split(X):
    model_GB.fit(X, y.values.ravel())
    train_pred = model_GB.predict(X)
    y_pred = model_GB.predict(X_validation)
    
    train_rmse = mean_squared_error(y, train_pred, squared=False)
    
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)

    return model_GB

In [61]:
# For GB tuned

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
        
        # print(csv_path_train, csv_path_test)
        
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        # For GB
        model_GB = GB_pred_tuned(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
        importances = model_GB.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
        y_pred = model_GB.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        filepath = Path(dataset + '_GB_tuned.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

Instant_Liking


 10%|████▍                                       | 1/10 [00:01<00:11,  1.22s/it]

Best using:  {'n_estimators': 71, 'max_features': 'auto', 'max_depth': 71} Score:  -0.00024405844094009497
Training RMSE:  0.00024396746158121584 Testing RMSE:  0.00024001456975285008


 20%|████████▊                                   | 2/10 [00:02<00:08,  1.11s/it]

Best using:  {'n_estimators': 61, 'max_features': 'auto', 'max_depth': 31} Score:  -0.0007094611373746764
Training RMSE:  0.0007094072289094762 Testing RMSE:  0.000691482464993189
Best using:  {'n_estimators': 151, 'max_features': 'auto', 'max_depth': 61} Score:  -5.263115722996402e-08


 30%|█████████████▏                              | 3/10 [00:03<00:09,  1.39s/it]

Training RMSE:  5.2617637245807296e-08 Testing RMSE:  5.3999550384404055e-08


 40%|█████████████████▌                          | 4/10 [00:05<00:08,  1.34s/it]

Best using:  {'n_estimators': 71, 'max_features': 'auto', 'max_depth': 31} Score:  -0.000245501488233282
Training RMSE:  0.00024547340241273704 Testing RMSE:  0.00024097045625570904
Best using:  {'n_estimators': 181, 'max_features': 'auto', 'max_depth': 31} Score:  -1.3541376934642696e-08


 50%|██████████████████████                      | 5/10 [00:06<00:07,  1.47s/it]

Training RMSE:  1.3536196845100662e-08 Testing RMSE:  1.347536201008617e-08
Best using:  {'n_estimators': 181, 'max_features': 'auto', 'max_depth': 51} Score:  -1.46020981190102e-08


 60%|██████████████████████████▍                 | 6/10 [00:08<00:06,  1.59s/it]

Training RMSE:  1.4874872162667696e-08 Testing RMSE:  1.4917029983470483e-08
Best using:  {'n_estimators': 121, 'max_features': 'auto', 'max_depth': 71} Score:  -1.256476471088893e-06
Training RMSE:  1.2562288327218208e-06 Testing RMSE:  1.2184833171908293e-06


 70%|██████████████████████████████▊             | 7/10 [00:10<00:04,  1.57s/it]

Best using:  {'n_estimators': 171, 'max_features': 'auto', 'max_depth': 41} Score:  -1.4109985774210231e-08


 80%|███████████████████████████████████▏        | 8/10 [00:12<00:03,  1.76s/it]

Training RMSE:  1.3487292515707262e-08 Testing RMSE:  1.3326263378697638e-08


 90%|███████████████████████████████████████▌    | 9/10 [00:13<00:01,  1.62s/it]

Best using:  {'n_estimators': 121, 'max_features': 'auto', 'max_depth': 81} Score:  -1.2585454373943123e-06
Training RMSE:  1.2584740889837721e-06 Testing RMSE:  1.2861361646827227e-06
Best using:  {'n_estimators': 191, 'max_features': 'auto', 'max_depth': 81} Score:  -1.3693030400662802e-08


100%|███████████████████████████████████████████| 10/10 [00:15<00:00,  1.56s/it]

Training RMSE:  1.3690412573680062e-08 Testing RMSE:  1.3667214956351285e-08





In [62]:
# For GB default

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
        
        # print(csv_path_train, csv_path_test)
        
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        # For GB
        model_GB = GB_pred_default(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
        importances = model_GB.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
        y_pred = model_GB.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        filepath = Path(dataset + '_GB_default.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

Instant_Liking


 10%|████▍                                       | 1/10 [00:00<00:01,  5.06it/s]

Training RMSE:  1.1491181419897724e-05 Testing RMSE:  1.1304995127520057e-05


 30%|█████████████▏                              | 3/10 [00:00<00:01,  4.99it/s]

Training RMSE:  1.1650739120389105e-05 Testing RMSE:  1.1356357079049993e-05
Training RMSE:  1.134392537624323e-05 Testing RMSE:  1.1641854365313064e-05


 40%|█████████████████▌                          | 4/10 [00:00<00:01,  4.32it/s]

Training RMSE:  1.1562113171141504e-05 Testing RMSE:  1.1350018611966868e-05


 50%|██████████████████████                      | 5/10 [00:01<00:01,  4.11it/s]

Training RMSE:  1.1480903293928917e-05 Testing RMSE:  1.1429305435800621e-05


 60%|██████████████████████████▍                 | 6/10 [00:01<00:01,  3.73it/s]

Training RMSE:  1.1354686497578853e-05 Testing RMSE:  1.138686753814791e-05


 70%|██████████████████████████████▊             | 7/10 [00:01<00:00,  3.85it/s]

Training RMSE:  1.148090329392493e-05 Testing RMSE:  1.1135940177065398e-05


 80%|███████████████████████████████████▏        | 8/10 [00:01<00:00,  3.90it/s]

Training RMSE:  1.1439424344666385e-05 Testing RMSE:  1.1302845357478701e-05


 90%|███████████████████████████████████████▌    | 9/10 [00:02<00:00,  3.79it/s]

Training RMSE:  1.1501423098399319e-05 Testing RMSE:  1.1754231828543837e-05


100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  4.05it/s]

Training RMSE:  1.1611703362546328e-05 Testing RMSE:  1.1592027992183146e-05





## Random Forest

In [69]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from pathlib import Path
import csv
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# import lightgbm as lgb

from sklearn.metrics import (precision_score, recall_score, roc_auc_score, accuracy_score, mean_squared_error,
                             confusion_matrix, precision_recall_curve, roc_curve, brier_score_loss)

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [70]:
data_dir = "/home/asim/takshshila/IOT/Datasets/setup-stuff/gateway_and_dataset"
result_dir = "/home/asim/takshshila/IOT/Datasets/setup-stuff/gateway_and_dataset/Results/GB"
datasets = ["Instant_Liking"]

x_column_drop = {"Instant_Liking": []} #'q8.2', 'q8.8', 'q8.12', 'q8.20'

target_column = {"Instant_Liking": ['Instant.Liking']}

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

In [71]:
# Random Forest Module -> hyperparameter tuning
def RF_pred_tuned(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    # Build models with hyperparameters sets
    RSC = RandomizedSearchCV(
        estimator=RandomForestRegressor(),
        param_distributions={
            'n_estimators': range(1, 200, 10),
            'max_depth': range(1, 100, 10),
            'max_features': ['auto', 'sqrt', 'log2']}, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
    
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y.values.ravel())
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build models with optimized hyperparameters
    model_RF = RandomForestRegressor(
        n_estimators=search_result.best_params_["n_estimators"],
        max_depth=search_result.best_params_["max_depth"],
        max_features=search_result.best_params_["max_features"]
    )
    
    model_RF.fit(X, y.values.ravel())
    train_pred = model_RF.predict(X)
    y_pred = model_RF.predict(X_validation)
    
    train_rmse = mean_squared_error(y, train_pred, squared=False)
    
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)
    
    return model_RF

In [72]:
# Random Forest Module -> default parameters
def RF_pred_default(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    model_RF = RandomForestRegressor()
    
    model_RF.fit(X, y.values.ravel())
    train_pred = model_RF.predict(X)
    y_pred = model_RF.predict(X_validation)
    
    train_rmse = mean_squared_error(y, train_pred, squared=False)
    
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)
    
    return model_RF

In [73]:
# Driver for tuned RF

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
        
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        # For RF
        model_RF = RF_pred_tuned(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
        importances = model_RF.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
        y_pred = model_RF.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        filepath = Path(dataset + '_RF_tuned.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

Instant_Liking


  0%|                                                    | 0/10 [00:00<?, ?it/s]

Best using:  {'n_estimators': 171, 'max_features': 'auto', 'max_depth': 21} Score:  0.0


 10%|████▍                                       | 1/10 [00:02<00:26,  3.00s/it]

Training RMSE:  0.0 Testing RMSE:  0.0
Best using:  {'n_estimators': 101, 'max_features': 'auto', 'max_depth': 91} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0


 20%|████████▊                                   | 2/10 [00:04<00:16,  2.10s/it]

Best using:  {'n_estimators': 131, 'max_features': 'auto', 'max_depth': 91} Score:  0.0


 30%|█████████████▏                              | 3/10 [00:06<00:14,  2.08s/it]

Training RMSE:  0.0 Testing RMSE:  0.0


 40%|█████████████████▌                          | 4/10 [00:08<00:11,  1.98s/it]

Best using:  {'n_estimators': 61, 'max_features': 'auto', 'max_depth': 41} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Best using:  {'n_estimators': 141, 'max_features': 'auto', 'max_depth': 41} Score:  0.0


 50%|██████████████████████                      | 5/10 [00:10<00:10,  2.02s/it]

Training RMSE:  0.0 Testing RMSE:  0.0
Best using:  {'n_estimators': 141, 'max_features': 'auto', 'max_depth': 11} Score:  0.0


 60%|██████████████████████████▍                 | 6/10 [00:12<00:08,  2.04s/it]

Training RMSE:  0.0 Testing RMSE:  0.0
Best using:  {'n_estimators': 191, 'max_features': 'auto', 'max_depth': 71} Score:  0.0


 70%|██████████████████████████████▊             | 7/10 [00:15<00:07,  2.35s/it]

Training RMSE:  0.0 Testing RMSE:  0.0


 80%|███████████████████████████████████▏        | 8/10 [00:17<00:04,  2.19s/it]

Best using:  {'n_estimators': 51, 'max_features': 'auto', 'max_depth': 81} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0
Best using:  {'n_estimators': 101, 'max_features': 'auto', 'max_depth': 91} Score:  0.0
Training RMSE:  0.0 Testing RMSE:  0.0


 90%|███████████████████████████████████████▌    | 9/10 [00:19<00:02,  2.06s/it]

Best using:  {'n_estimators': 131, 'max_features': 'auto', 'max_depth': 31} Score:  0.0


100%|███████████████████████████████████████████| 10/10 [00:21<00:00,  2.15s/it]

Training RMSE:  0.0 Testing RMSE:  0.0





In [74]:
# Driver for default RF

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
        
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        # For RF
        model_RF = RF_pred_default(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
        importances = model_RF.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
        y_pred = model_RF.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        filepath = Path(dataset + '_RF_default.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

Instant_Liking


  0%|                                                    | 0/10 [00:00<?, ?it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 20%|████████▊                                   | 2/10 [00:00<00:02,  3.61it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 30%|█████████████▏                              | 3/10 [00:00<00:01,  3.66it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 40%|█████████████████▌                          | 4/10 [00:01<00:01,  3.64it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 50%|██████████████████████                      | 5/10 [00:01<00:01,  3.64it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 60%|██████████████████████████▍                 | 6/10 [00:01<00:01,  3.84it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 70%|██████████████████████████████▊             | 7/10 [00:01<00:00,  4.02it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 80%|███████████████████████████████████▏        | 8/10 [00:02<00:00,  4.17it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


 90%|███████████████████████████████████████▌    | 9/10 [00:02<00:00,  4.29it/s]

Training RMSE:  0.0 Testing RMSE:  0.0


100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  4.01it/s]

Training RMSE:  0.0 Testing RMSE:  0.0



