In [1]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from pathlib import Path
import csv
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# import lightgbm as lgb

from sklearn.metrics import (precision_score, recall_score, roc_auc_score, accuracy_score, mean_squared_error,
                             confusion_matrix, precision_recall_curve, roc_curve, brier_score_loss)

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [None]:
columns_drop = {
    "Facebook_data": None,
    "Features_TestSet": None,
    "House_Price_Adv_Regression": ['Id'],
    "Instant_Liking": None,
    "Insurance": None,
    "Isolet": None,
    "new_data_trans": None,
    "OnlineNewsPopularity": None,
    "ParkinsonData": ['subject.'],
    "Sberbank_Russian_Housing_Market": ['id'],
    "slice_localization_data": None,
    "Telecom_data": ['Phone.Number'],
    "yearMSD_new": None,
    "arrhythmia": None,
    "Big_mart_sales": ['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'],
    "blogData": None,
    "communities": ['x4', 'x1'],
    "dengue_features": ['year', 'weekofyear', 'week_start_date'],
    "ECG0_p02": None,
    "ENERGY_DATA_COMPLETE": None
}

fill_na = {
    "Facebook_data": False,
    "Features_TestSet": False,
    "House_Price_Adv_Regression": False,
    "Instant_Liking": False,
    "Insurance": False,
    "Isolet": False,
    "new_data_trans": False,
    "OnlineNewsPopularity": False,
    "ParkinsonData": False,
    "Sberbank_Russian_Housing_Market": False,
    "slice_localization_data": False,
    "Telecom_data": False,
    "yearMSD_new": False,
    "arrhythmia": False,
    "Big_mart_sales": True,
    "blogData": False,
    "communities": False,
    "dengue_features": False,
    "ECG0_p02": False,
    "ENERGY_DATA_COMPLETE": False
}

fill_na_column = {
    "Facebook_data": [],
    "Features_TestSet": [],
    "House_Price_Adv_Regression": [],
    "Instant_Liking": [],
    "Insurance": [],
    "Isolet": [],
    "new_data_trans": [],
    "OnlineNewsPopularity": [],
    "ParkinsonData": [],
    "Sberbank_Russian_Housing_Market": [],
    "slice_localization_data": [],
    "Telecom_data": [],
    "yearMSD_new": [],
    "arrhythmia": [],
    "Big_mart_sales": ['Outlet_Size'],
    "blogData": [],
    "communities": [],
    "dengue_features": [],
    "ECG0_p02": [],
    "ENERGY_DATA_COMPLETE": [],
}

def process_dataset(df_train, df_test, columns_drop, fill_na, fill_na_columns):    
    df_total = pd.concat([df_train, df_test], axis=0)
    tar = df_total.iloc[:, -1]
    df_total = df_total.iloc[:, :-1]
    
    if fill_na:
        for column in fill_na_columns:
            df_total[column].fillna("NOT_PRESENT", inplace=True)
    
    if columns_drop:
        df_total = df_total.drop(columns_drop, axis='columns')
    
    cat = df_total.select_dtypes(include=['object']).columns.to_list()
    df_total = pd.get_dummies(df_total, cat)
    
    df_total = pd.concat([df_total, tar], axis=1)
    df_train = df_total.iloc[:df_train.shape[0],:]
    df_test = df_total.iloc[df_train.shape[0]:,:]
    
    return df_train, df_test

data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
datasets = [
    # "Facebook_data",
    # "Features_TestSet",
    # "House_Price_Adv_Regression",
    # "Instant_Liking",
    # "Insurance",
    "Isolet",
    # "new_data_trans",
    # "OnlineNewsPopularity",
    # "ParkinsonData",
    # "Sberbank_Russian_Housing_Market",
    # "slice_localization_data",
    # "Telecom_data",
    # "yearMSD_new",
    # "arrhythmia",
    # "Big_mart_sales",
    # "blogData",
    # "communities",
    # "dengue_features",
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE"
]

# Fill missing for Instant_Liking

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

for index, dataset in enumerate(datasets):
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(seed) + '.csv')
        df_train = pd.read_csv(csv_path_train)
        
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '.csv')
        df_test = pd.read_csv(csv_path_test)
        
        df_train, df_test = process_dataset(df_train, df_test, columns_drop[dataset], fill_na[dataset], fill_na_column[dataset])
        df_train.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(seed) + '_modified.csv'), index=False)
        df_test.to_csv(os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv'), index=False)

In [None]:
# Specifically for big mart dataset
def process_big_mart(df_train):
    df_train['Outlet_Size'].fillna("NOT_PRESENT", inplace=True)
    
    df_train = df_train.drop(['Item_Identifier', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales'], axis='columns')
    
    dummies = pd.get_dummies(df_train.Item_Fat_Content)
    df_train = pd.concat([df_train, dummies], axis='columns')
    dummies = pd.get_dummies(df_train.Item_Type)
    df_train = pd.concat([df_train, dummies], axis='columns')
    dummies = pd.get_dummies(df_train.Outlet_Size)
    df_train = pd.concat([df_train, dummies], axis='columns')
    dummies = pd.get_dummies(df_train.Outlet_Location_Type)
    df_train = pd.concat([df_train, dummies], axis='columns')
    dummies = pd.get_dummies(df_train.Outlet_Type)
    df_train = pd.concat([df_train, dummies], axis='columns')
    df_train['Item_Outlet_Sales_Log'] = np.log(df_train['Item_Outlet_Sales'])
    return df_train

def get_missing():
    missingValueColumns = datafrm.columns[datafrm.isnull().any()].tolist()
    percent_missing = datafrm[missingValueColumns].isnull().sum()
    print("Missing value count columnwise:")
    print(percent_missing)



data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
dataset = "Big_mart_sales"

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]
for s in tqdm(seeds):
    csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(s) + '.csv')
    df_train = pd.read_csv(csv_path_train)
    csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(s) + '.csv')
    df_test = pd.read_csv(csv_path_test)
    
    df_train = process_big_mart(df_train)
    df_test = process_big_mart(df_test)
    df_train.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(s) + '_modified.csv'), index=False)
    df_test.to_csv(os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(s) + '_modified.csv'), index=False)
    

In [None]:
def RF_pred_kfold(X, y, details=None):
    
    # Build models with hyperparameters sets
    RSC = RandomizedSearchCV(
        estimator=RandomForestRegressor(),
        param_distributions={
            'n_estimators': range(1, 200, 10),
            'max_depth': range(1, 100, 10),
            'max_features': ['auto', 'sqrt', 'log2']}, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
    
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y.values.ravel())
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build models with optimized hyperparameters
    model_RF = RandomForestRegressor(
        n_estimators=search_result.best_params_["n_estimators"],
        max_depth=search_result.best_params_["max_depth"],
        max_features=search_result.best_params_["max_features"]
    )
    

    # Split dataset into 5 consecutive folds
    kf = KFold(n_splits=5, shuffle=True, random_state=None)
    
    i = 1
    for train, test in kf.split(X):
        X_train = X.iloc[train,:]
        y_train = y.iloc[train,:]
        X_test = X.iloc[test]
        y_test = y.iloc[test]
        model_RF.fit(X_train, y_train.values.ravel())
        train_pred = model_RF.predict(X_train)
        y_pred = model_RF.predict(X_test)
        
#         train_accuracy = accuracy_score(y_train, train_pred)
#         train_precision = precision_score(y_train, train_pred)
#         train_recall = recall_score(y_train, train_pred)
#         train_auc = roc_auc_score(y_train, train_pred)
        train_rmse = mean_squared_error(y_train, train_pred, squared=False)
        
#         test_accuracy = accuracy_score(y_test, y_pred)
#         test_precision = precision_score(y_test, y_pred)
#         test_recall = recall_score(y_test, y_pred)
#         test_auc = roc_auc_score(y_test, y_pred)
        test_rmse = mean_squared_error(y_test, y_pred, squared=False)
        
        details['fold'] = str(i)
        details['training_rmse'] = train_rmse
        details['testing_rmse'] = test_rmse
        
        print('Fold '+ str(i), ':  Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)
#         print('Fold '+ str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
#         print('Fold '+ str(i), ':  Training recall: ', train_recall, 'Testing accuracy: ', test_recall)
#         print('Fold '+ str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)
        
        i += 1

    return model_RF

In [None]:
# Decision Tree module
def DT_pred(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    # Build models with hyperparameters sets
    RSC = RandomizedSearchCV(
        estimator=DecisionTreeRegressor(),
        param_distributions={
            'criterion': ['squared_error'],
            'max_depth': range(1, 100, 10),
            'max_features': ['auto', 'sqrt', 'log2']}, 
        cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose = True)
    
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y.values.ravel())
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build models with optimized hyperparameters
    model_DT = DecisionTreeRegressor(
        criterion=search_result.best_params_["criterion"],
        max_depth=search_result.best_params_["max_depth"],
        max_features=search_result.best_params_["max_features"])

    model_DT.fit(X, y.values.ravel())
    train_pred = model_DT.predict(X)
    y_pred = model_DT.predict(X_validation)

#         train_accuracy = accuracy_score(y_train, train_pred)
#         train_precision = precision_score(y_train, train_pred)
#         train_recall = recall_score(y_train, train_pred)
#         train_auc = roc_auc_score(y_train, train_pred)
    train_rmse = mean_squared_error(y, train_pred, squared=False)

#         test_accuracy = accuracy_score(y_test, y_pred)
#         test_precision = precision_score(y_test, y_pred)
#         test_recall = recall_score(y_test, y_pred)
#         test_auc = roc_auc_score(y_test, y_pred)
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)
#         print('Fold '+ str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
#         print('Fold '+ str(i), ':  Training recall: ', train_recall, 'Testing accuracy: ', test_recall)
#         print('Fold '+ str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)

    return model_DT

In [None]:
# Random Forest Module
def RF_pred(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    # Build models with hyperparameters sets
    RSC = RandomizedSearchCV(
        estimator=RandomForestRegressor(),
        param_distributions={
            'n_estimators': range(1, 200, 10),
            'max_depth': range(1, 100, 10),
            'max_features': ['auto', 'sqrt', 'log2']}, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
    
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y.values.ravel())
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build models with optimized hyperparameters
    model_RF = RandomForestRegressor(
        n_estimators=search_result.best_params_["n_estimators"],
        max_depth=search_result.best_params_["max_depth"],
        max_features=search_result.best_params_["max_features"]
    )
    
    # for train, test in kf.split(X):
    model_RF.fit(X, y.values.ravel())
    train_pred = model_RF.predict(X)
    y_pred = model_RF.predict(X_validation)

#         train_accuracy = accuracy_score(y_train, train_pred)
#         train_precision = precision_score(y_train, train_pred)
#         train_recall = recall_score(y_train, train_pred)
#         train_auc = roc_auc_score(y_train, train_pred)
    train_rmse = mean_squared_error(y, train_pred, squared=False)

#         test_accuracy = accuracy_score(y_test, y_pred)
#         test_precision = precision_score(y_test, y_pred)
#         test_recall = recall_score(y_test, y_pred)
#         test_auc = roc_auc_score(y_test, y_pred)
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)
#         print('Fold '+ str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
#         print('Fold '+ str(i), ':  Training recall: ', train_recall, 'Testing accuracy: ', test_recall)
#         print('Fold '+ str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)

    return model_RF

In [2]:
# Gradient Boosting Module
def GB_pred(train_path, validation_path, train_column_drop, validation_column, details=None):
    
    df_train = pd.read_csv(train_path)
    df_validation = pd.read_csv(validation_path)
    
    X = df_train.drop(train_column_drop, axis = 1)
    y = df_train[validation_column]
    
    X_validation = df_validation.drop(train_column_drop, axis = 1)
    y_validation = df_validation[validation_column]
    
    # Build models with hyperparameters sets
    RSC = RandomizedSearchCV(
        estimator=GradientBoostingRegressor(loss='squared_error'),
        param_distributions={
            'n_estimators': range(1, 200, 10),
            'max_depth': range(1, 100, 10),
            'max_features': ['auto', 'sqrt', 'log2']
        },
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )
    
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y.values.ravel())
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build models with optimized hyperparameters
    model_GB = GradientBoostingRegressor(
        n_estimators=search_result.best_params_["n_estimators"],
        max_depth=search_result.best_params_["max_depth"],
        max_features=search_result.best_params_["max_features"],
        loss='squared_error'
    )
    
    # for train, test in kf.split(X):
    model_GB.fit(X, y.values.ravel())
    train_pred = model_GB.predict(X)
    y_pred = model_GB.predict(X_validation)

#         train_accuracy = accuracy_score(y_train, train_pred)
#         train_precision = precision_score(y_train, train_pred)
#         train_recall = recall_score(y_train, train_pred)
#         train_auc = roc_auc_score(y_train, train_pred)
    train_rmse = mean_squared_error(y, train_pred, squared=False)

#         test_accuracy = accuracy_score(y_test, y_pred)
#         test_precision = precision_score(y_test, y_pred)
#         test_recall = recall_score(y_test, y_pred)
#         test_auc = roc_auc_score(y_test, y_pred)
    test_rmse = mean_squared_error(y_validation, y_pred, squared=False)

    details['training_rmse'] = train_rmse
    details['testing_rmse'] = test_rmse

    print('Training RMSE: ', train_rmse, 'Testing RMSE: ', test_rmse)
#         print('Fold '+ str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
#         print('Fold '+ str(i), ':  Training recall: ', train_recall, 'Testing accuracy: ', test_recall)
#         print('Fold '+ str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)

    return model_GB

In [3]:
# normal Random Forest and Decision Tree with train set and validation set

data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
datasets = [
    # "Facebook_data", # -> done,
    # "Features_TestSet", # -> done,
    # "House_Price_Adv_Regression", # -> done,
    # "Instant_Liking",
    # "Insurance", # -> done,
    # "Isolet",
    # "new_data_trans", # -> done,
    # "OnlineNewsPopularity", # -> done,
    # "ParkinsonData", # -> done,
    # "Sberbank_Russian_Housing_Market", # -> done, Gradiant Boosting remains
    # "slice_localization_data", # -> done,
    # "Telecom_data", # -> done,
    # "yearMSD_new", # -> done,
    # "arrhythmia", # -> done,
    # "Big_mart_sales", # -> done,
    # "blogData", # -> done
    # "communities", # -> done,
    # "dengue_features", # -> done,
    # "ECG0_p02", # -> done,
    # "ENERGY_DATA_COMPLETE", # -> done
]

x_column_drop = {
    "Facebook_data": ['Unnamed: 0', 'Total.Interactions'],
    "Features_TestSet": ['Unnamed: 0', 'Target'],
    "House_Price_Adv_Regression": ['Unnamed: 0', 'SalePrice'],
    # "Instant_Liking": ['Instant.Liking'] -> NaN needs to be fixed,
    "Insurance": ['Unnamed: 0', 'charges'],
    "Isolet": ['Unnamed: 0', 'Target'], # -> some random error, need to see,
    "new_data_trans": ['Unnamed: 0', 'X23.Humedad_Exterior_Sensor'],
    "OnlineNewsPopularity": ['Unnamed: 0', 'shares'],
    "ParkinsonData": ['Unnamed: 0', 'total_UPDRS'],
    "Sberbank_Russian_Housing_Market": ['Unnamed: 0', 'price_doc'],
    "slice_localization_data": ['Unnamed: 0', 'reference'],
    "Telecom_data": ['Unnamed: 0', 'Churned.Label'],
    "yearMSD_new": ['Unnamed: 0', 'Year'],
    "arrhythmia": ['Unnamed: 0', 'Defection'],
    "Big_mart_sales": ['Unnamed: 0', 'Item_Outlet_Sales'],
    "blogData": ['Unnamed: 0', 'Comments'],
    "communities": ['Unnamed: 0','ViolentCrimesPerPop'],
    "dengue_features": ['Unnamed: 0','total_cases'],
    "ECG0_p02": ['Unnamed: 0', 'CurrentValue'],
    "ENERGY_DATA_COMPLETE": ['Unnamed: 0', 'Appliances']
}

target_column = {
    "Facebook_data": ['Total.Interactions'],
    "Features_TestSet": ['Target'],
    "House_Price_Adv_Regression": ['SalePrice'],
    # "Instant_Liking": ['Instant.Liking'] -> NaN needs to be fixed,
    "Insurance": ['charges'],
    "Isolet": ['Target'], # -> some random error, need to see,
    "new_data_trans": ['X23.Humedad_Exterior_Sensor'],
    "OnlineNewsPopularity": ['shares'],
    "ParkinsonData": ['total_UPDRS'],
    "Sberbank_Russian_Housing_Market": ['price_doc'],
    "slice_localization_data": ['reference'],
    "Telecom_data": ['Churned.Label'],
    "yearMSD_new": ['Year'],
    "arrhythmia": ['Defection'],
    "Big_mart_sales": ['Item_Outlet_Sales'],
    "blogData": ['Comments'],
    "communities": ['ViolentCrimesPerPop'],
    "dengue_features": ['total_cases'],
    "ECG0_p02": ['CurrentValue'],
    "ENERGY_DATA_COMPLETE": ['Appliances']
}

# dataset = "Big_mart_sales"
seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

# For RF
# for index, dataset in enumerate(datasets):
#     df = pd.DataFrame()
#     print(dataset)
#     for seed in tqdm(seeds):
#         csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
#         csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
#         csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
#         df_test = pd.read_csv(csv_path_test)
        
#         # print(csv_path_train, csv_path_test)
        
#         X_test = df_test.drop(x_column_drop[dataset], axis = 1)
#         y_test = df_test[target_column[dataset]]
        
#         details = {
#             'dataset': dataset,
#             'seed': str(seed)
#         }
        
#         # For RF
#         model_RF = RF_pred(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
#         importances = model_RF.feature_importances_
#         indices = np.argsort(importances)[::-1]
#         top_k = 10
#         top_indices = indices[:top_k]
#         details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
#         y_pred = model_RF.predict(X_test)
#         validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
#         details['validation_rmse'] = validation_rmse
        
#         df = df.append(details, ignore_index=True)
#         filepath = Path(dataset + '_RF.csv')
#         filepath.parent.mkdir(parents=True, exist_ok=True)
#         df.to_csv(filepath, index=False)

# For DT
# for index, dataset in enumerate(datasets):
#     df = pd.DataFrame()
#     print(dataset)
#     for seed in tqdm(seeds):
#         csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
#         csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
#         csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
#         df_test = pd.read_csv(csv_path_test)
        
#         # print(csv_path_train, csv_path_test)
        
#         X_test = df_test.drop(x_column_drop[dataset], axis = 1)
#         y_test = df_test[target_column[dataset]]
        
#         details = {
#             'dataset': dataset,
#             'seed': str(seed)
#         }
        
#         # For DT
#         model_DT = DT_pred(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
#         importances = model_DT.feature_importances_
#         indices = np.argsort(importances)[::-1]
#         top_k = 10
#         top_indices = indices[:top_k]
#         details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
#         y_pred = model_DT.predict(X_test)
#         validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
#         details['validation_rmse'] = validation_rmse
        
#         df = df.append(details, ignore_index=True)
#         filepath = Path(dataset + '_DT.csv')
#         filepath.parent.mkdir(parents=True, exist_ok=True)
#         df.to_csv(filepath, index=False)

# For GB
for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
        
        # print(csv_path_train, csv_path_test)
        
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        # For GB
        model_GB = GB_pred(csv_path_train, csv_path_validation, x_column_drop[dataset], target_column[dataset], details)
        
        importances = model_GB.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X_test.columns)[indices][0:top_k]
        
        y_pred = model_GB.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        filepath = Path(dataset + '_GB.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

slice_localization_data


  0%|                                                                                                                                                                                                   | 0/10 [00:00<?, ?it/s]

Best using:  {'n_estimators': 81, 'max_features': 'sqrt', 'max_depth': 41} Score:  -1.4605271859390634
Training RMSE:  0.004411825181726962 Testing RMSE:  1.3941839523525894




Best using:  {'n_estimators': 171, 'max_features': 'sqrt', 'max_depth': 61} Score:  -1.3618377913222068
Training RMSE:  3.346092648416899e-07 Testing RMSE:  1.2655361691798528




Best using:  {'n_estimators': 61, 'max_features': 'sqrt', 'max_depth': 71} Score:  -1.4303982302789708
Training RMSE:  0.036090103141837554 Testing RMSE:  1.2831078673625183




Best using:  {'n_estimators': 191, 'max_features': 'sqrt', 'max_depth': 61} Score:  -1.427686067500399
Training RMSE:  4.279014980036257e-08 Testing RMSE:  1.282096551402688




Best using:  {'n_estimators': 181, 'max_features': 'sqrt', 'max_depth': 31} Score:  -1.3767274125035684
Training RMSE:  5.130822939161954e-07 Testing RMSE:  1.2377313155822383




Best using:  {'n_estimators': 61, 'max_features': 'sqrt', 'max_depth': 31} Score:  -1.4426862532357605
Training RMSE:  0.037653821595790185 Testing RMSE:  1.1700328655695489


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 6/10 [1:26:43<1:07:09, 1007.44s/it]

Best using:  {'n_estimators': 101, 'max_features': 'log2', 'max_depth': 21} Score:  -1.585558753128107
Training RMSE:  0.021546864704635803 Testing RMSE:  1.4620329685133389




Best using:  {'n_estimators': 111, 'max_features': 'sqrt', 'max_depth': 61} Score:  -1.4306112191471438
Training RMSE:  0.00018576059241402873 Testing RMSE:  1.3624509117749792


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 8/10 [2:17:45<44:35, 1337.83s/it]

Best using:  {'n_estimators': 71, 'max_features': 'sqrt', 'max_depth': 71} Score:  -1.4028321300828765
Training RMSE:  0.012587089575594908 Testing RMSE:  1.3561418414053734


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 9/10 [2:24:07<17:18, 1038.73s/it]

Best using:  {'n_estimators': 71, 'max_features': 'sqrt', 'max_depth': 51} Score:  -1.416531621001654
Training RMSE:  0.012604405621719475 Testing RMSE:  1.2572592738208974


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [2:46:51<00:00, 1001.13s/it]


Telecom_data


 10%|██████████████████▋                                                                                                                                                                        | 1/10 [00:10<01:36, 10.69s/it]

Best using:  {'n_estimators': 51, 'max_features': 'sqrt', 'max_depth': 11} Score:  -0.23884801124104849
Training RMSE:  0.030279504840548168 Testing RMSE:  0.22933703982509188
Best using:  {'n_estimators': 101, 'max_features': 'sqrt', 'max_depth': 91} Score:  -0.23335972968475827


 20%|█████████████████████████████████████▍                                                                                                                                                     | 2/10 [00:20<01:19,  9.98s/it]

Training RMSE:  8.208851107148051e-06 Testing RMSE:  0.2470618415846807
Best using:  {'n_estimators': 161, 'max_features': 'sqrt', 'max_depth': 21} Score:  -0.2480077939809934


 30%|████████████████████████████████████████████████████████                                                                                                                                   | 3/10 [00:29<01:09,  9.90s/it]

Training RMSE:  6.96627292795556e-07 Testing RMSE:  0.21362907798464992


 40%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                | 4/10 [00:35<00:48,  8.07s/it]

Best using:  {'n_estimators': 51, 'max_features': 'sqrt', 'max_depth': 11} Score:  -0.23933950897665848
Training RMSE:  0.032264678748646455 Testing RMSE:  0.24354415388412892
Best using:  {'n_estimators': 71, 'max_features': 'sqrt', 'max_depth': 11} Score:  -0.2334614307838776
Training RMSE:  0.015006553902578405 Testing RMSE:  0.2275244180337624


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 6/10 [00:46<00:27,  6.87s/it]

Best using:  {'n_estimators': 31, 'max_features': 'sqrt', 'max_depth': 71} Score:  -0.24673241695255405
Training RMSE:  0.0131813974915215 Testing RMSE:  0.24851345478661147
Best using:  {'n_estimators': 191, 'max_features': 'sqrt', 'max_depth': 11} Score:  -0.24109008401562093


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 7/10 [00:58<00:25,  8.58s/it]

Training RMSE:  0.001218298256418393 Testing RMSE:  0.21919800857447197
Best using:  {'n_estimators': 121, 'max_features': 'sqrt', 'max_depth': 81} Score:  -0.24443266352489462


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 8/10 [01:05<00:16,  8.12s/it]

Training RMSE:  1.0365515439876745e-06 Testing RMSE:  0.24608169398006272


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 9/10 [01:10<00:06,  6.98s/it]

Best using:  {'n_estimators': 41, 'max_features': 'sqrt', 'max_depth': 81} Score:  -0.2429667487873771
Training RMSE:  0.004718195109327796 Testing RMSE:  0.243540617418753
Best using:  {'n_estimators': 111, 'max_features': 'sqrt', 'max_depth': 21} Score:  -0.252079698788268


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:14<00:00,  7.49s/it]


Training RMSE:  1.0546871988478413e-05 Testing RMSE:  0.2315458097860594
yearMSD_new


  0%|                                                                                                                                                                                                   | 0/10 [00:00<?, ?it/s]

Best using:  {'n_estimators': 101, 'max_features': 'sqrt', 'max_depth': 11} Score:  -9.489271835984217


 10%|██████████████████▍                                                                                                                                                                     | 1/10 [15:23<2:18:30, 923.36s/it]

Training RMSE:  1.8837985005280684 Testing RMSE:  9.397154835143434
Best using:  {'n_estimators': 101, 'max_features': 'log2', 'max_depth': 11} Score:  -9.4945916543165


 20%|████████████████████████████████████▊                                                                                                                                                   | 2/10 [30:58<2:04:00, 930.02s/it]

Training RMSE:  2.108292806099173 Testing RMSE:  9.415416461231723
Best using:  {'n_estimators': 111, 'max_features': 'auto', 'max_depth': 11} Score:  -9.540559457090032


 30%|███████████████████████████████████████████████████████▏                                                                                                                                | 3/10 [46:13<1:47:42, 923.27s/it]

Training RMSE:  1.4143544844661966 Testing RMSE:  9.366686397000986
Best using:  {'n_estimators': 171, 'max_features': 'auto', 'max_depth': 11} Score:  -9.406229765714247


 40%|████████████████████████████████████████████████████████████████████████▍                                                                                                            | 4/10 [1:21:32<2:19:33, 1395.52s/it]

Training RMSE:  0.8131558041327895 Testing RMSE:  9.623429473650566
Best using:  {'n_estimators': 71, 'max_features': 'sqrt', 'max_depth': 61} Score:  -9.875095584792295
Training RMSE:  0.006191605495394396 Testing RMSE:  9.650264300101265




Best using:  {'n_estimators': 41, 'max_features': 'log2', 'max_depth': 11} Score:  -9.646417213723701


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 6/10 [1:47:33<1:09:16, 1039.17s/it]

Training RMSE:  4.678058350330016 Testing RMSE:  9.615108495247233
Best using:  {'n_estimators': 51, 'max_features': 'log2', 'max_depth': 91} Score:  -9.884906381582883


 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 7/10 [2:04:21<51:26, 1028.83s/it]

Training RMSE:  0.05092312776776777 Testing RMSE:  9.869728455779734




Best using:  {'n_estimators': 111, 'max_features': 'sqrt', 'max_depth': 91} Score:  -9.830906108026266
Training RMSE:  9.104723485019406e-05 Testing RMSE:  9.787568928237256


 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 8/10 [2:19:57<33:18, 999.28s/it]

Best using:  {'n_estimators': 11, 'max_features': 'auto', 'max_depth': 11} Score:  -9.778634105421622


 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 9/10 [2:34:43<16:03, 963.91s/it]

Training RMSE:  6.979356703735139 Testing RMSE:  9.662446751486792
Best using:  {'n_estimators': 91, 'max_features': 'log2', 'max_depth': 11} Score:  -9.462252277236525


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [2:44:11<00:00, 985.13s/it]


Training RMSE:  2.4248223851541404 Testing RMSE:  9.466123213194962
arrhythmia


 10%|██████████████████▋                                                                                                                                                                        | 1/10 [00:05<00:52,  5.79s/it]

Best using:  {'n_estimators': 101, 'max_features': 'sqrt', 'max_depth': 81} Score:  -6.45096181136104
Training RMSE:  0.00026194611867542267 Testing RMSE:  5.626649338981481
Best using:  {'n_estimators': 141, 'max_features': 'auto', 'max_depth': 1} Score:  -6.796127200070982


 20%|█████████████████████████████████████▍                                                                                                                                                     | 2/10 [00:09<00:37,  4.75s/it]

Training RMSE:  3.3616426816801748 Testing RMSE:  4.772183307410002
Best using:  {'n_estimators': 171, 'max_features': 'sqrt', 'max_depth': 11} Score:  -5.956857586052093
Training RMSE:  3.542172886902816e-07 Testing RMSE:  9.224667993594705


 40%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                | 4/10 [00:24<00:41,  6.97s/it]

Best using:  {'n_estimators': 61, 'max_features': 'sqrt', 'max_depth': 11} Score:  -5.8704885431736455
Training RMSE:  0.023531677574023593 Testing RMSE:  5.091486338659208
Best using:  {'n_estimators': 161, 'max_features': 'sqrt', 'max_depth': 41} Score:  -6.405318469785331


 50%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 5/10 [00:29<00:31,  6.26s/it]

Training RMSE:  4.3059316106014466e-07 Testing RMSE:  5.650396764611179
Best using:  {'n_estimators': 141, 'max_features': 'auto', 'max_depth': 1} Score:  -5.877684174948559


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 6/10 [00:32<00:20,  5.02s/it]

Training RMSE:  3.3185989911537037 Testing RMSE:  6.077983090811976


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 7/10 [00:35<00:13,  4.37s/it]

Best using:  {'n_estimators': 141, 'max_features': 'sqrt', 'max_depth': 1} Score:  -6.005655631442341
Training RMSE:  3.57135232935013 Testing RMSE:  5.691744573680253
Best using:  {'n_estimators': 161, 'max_features': 'sqrt', 'max_depth': 81} Score:  -6.523987546746751
Training RMSE:  4.215573475309774e-07 Testing RMSE:  5.939695194913749


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 8/10 [00:39<00:08,  4.25s/it]

Best using:  {'n_estimators': 171, 'max_features': 'auto', 'max_depth': 61} Score:  -5.072946550039399


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 9/10 [00:49<00:05,  5.89s/it]

Training RMSE:  1.2797037454393585e-07 Testing RMSE:  7.051736035833324


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:53<00:00,  5.37s/it]


Best using:  {'n_estimators': 121, 'max_features': 'sqrt', 'max_depth': 51} Score:  -6.268644836142751
Training RMSE:  2.8882505624646728e-05 Testing RMSE:  7.287000250930882
Big_mart_sales


  0%|                                                                                                                                                                                                   | 0/10 [00:00<?, ?it/s]

Best using:  {'n_estimators': 191, 'max_features': 'auto', 'max_depth': 1} Score:  -1140.0388841509186


 10%|██████████████████▋                                                                                                                                                                        | 1/10 [00:13<02:01, 13.53s/it]

Training RMSE:  1126.4642818596794 Testing RMSE:  1105.348576050163


 20%|█████████████████████████████████████▍                                                                                                                                                     | 2/10 [00:30<02:03, 15.38s/it]

Best using:  {'n_estimators': 181, 'max_features': 'log2', 'max_depth': 1} Score:  -1178.1959374907071
Training RMSE:  1151.6487322140129 Testing RMSE:  1157.9480104091194
Best using:  {'n_estimators': 31, 'max_features': 'sqrt', 'max_depth': 61} Score:  -1191.3324659089606


 30%|████████████████████████████████████████████████████████                                                                                                                                   | 3/10 [00:45<01:46, 15.28s/it]

Training RMSE:  64.9096208642825 Testing RMSE:  1252.71763111653
Best using:  {'n_estimators': 91, 'max_features': 'log2', 'max_depth': 61} Score:  -1219.2782299264218


 40%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                | 4/10 [01:01<01:34, 15.74s/it]

Training RMSE:  0.11959441081474417 Testing RMSE:  1172.1451562673815
Best using:  {'n_estimators': 141, 'max_features': 'sqrt', 'max_depth': 61} Score:  -1239.4451132777263


 50%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 5/10 [01:16<01:16, 15.32s/it]

Training RMSE:  0.0006095531315030952 Testing RMSE:  1135.593982767232


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 6/10 [01:25<00:52, 13.08s/it]

Best using:  {'n_estimators': 121, 'max_features': 'log2', 'max_depth': 1} Score:  -1195.4527528823053
Training RMSE:  1194.8804347775847 Testing RMSE:  1156.7523351501677
Best using:  {'n_estimators': 31, 'max_features': 'log2', 'max_depth': 91} Score:  -1170.5951060834793


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 7/10 [01:43<00:44, 14.86s/it]

Training RMSE:  65.26272632121633 Testing RMSE:  1211.0820646219993
Best using:  {'n_estimators': 31, 'max_features': 'sqrt', 'max_depth': 61} Score:  -1189.59896219538


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 8/10 [02:02<00:32, 16.03s/it]

Training RMSE:  64.79026556916672 Testing RMSE:  1212.0123237312757
Best using:  {'n_estimators': 41, 'max_features': 'auto', 'max_depth': 11} Score:  -1157.28172822926


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 9/10 [02:12<00:14, 14.36s/it]

Training RMSE:  563.4866560723996 Testing RMSE:  1231.8553007807634


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:27<00:00, 14.77s/it]


Best using:  {'n_estimators': 41, 'max_features': 'auto', 'max_depth': 1} Score:  -1175.2502661925955
Training RMSE:  1169.4528985814643 Testing RMSE:  1256.9306754232316
blogData




Best using:  {'n_estimators': 51, 'max_features': 'log2', 'max_depth': 11} Score:  -26.75095831880487


 10%|██████████████████▌                                                                                                                                                                       | 1/10 [04:35<41:19, 275.51s/it]

Training RMSE:  8.343270201721168 Testing RMSE:  21.382877518579324
Best using:  {'n_estimators': 171, 'max_features': 'sqrt', 'max_depth': 11} Score:  -25.361181602271007


 20%|█████████████████████████████████████▏                                                                                                                                                    | 2/10 [10:54<44:49, 336.14s/it]

Training RMSE:  2.6141971949238445 Testing RMSE:  27.252895850235248
Best using:  {'n_estimators': 151, 'max_features': 'sqrt', 'max_depth': 51} Score:  -28.191682424988453
Training RMSE:  5.640583348306177 Testing RMSE:  22.713793301414675


 30%|███████████████████████████████████████████████████████▊                                                                                                                                  | 3/10 [21:15<54:25, 466.49s/it]

Best using:  {'n_estimators': 171, 'max_features': 'sqrt', 'max_depth': 21} Score:  -27.163109737678166
Training RMSE:  5.638776472086949 Testing RMSE:  23.708170739617856


 40%|██████████████████████████████████████████████████████████████████████████▍                                                                                                               | 4/10 [27:56<44:02, 440.41s/it]

Best using:  {'n_estimators': 121, 'max_features': 'log2', 'max_depth': 11} Score:  -24.752823035925406


 50%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 5/10 [35:59<37:59, 455.83s/it]

Training RMSE:  5.262400077434655 Testing RMSE:  30.05482523438916
Best using:  {'n_estimators': 141, 'max_features': 'sqrt', 'max_depth': 51} Score:  -26.74440705512376
Training RMSE:  0.3203213086422591 Testing RMSE:  33.55476234047409




Best using:  {'n_estimators': 151, 'max_features': 'sqrt', 'max_depth': 11} Score:  -26.753339105946644


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 7/10 [50:14<21:59, 439.97s/it]

Training RMSE:  2.908274415943759 Testing RMSE:  29.951295034599305
Best using:  {'n_estimators': 101, 'max_features': 'sqrt', 'max_depth': 31} Score:  -26.3759307029567
Training RMSE:  1.1520701393004806 Testing RMSE:  22.758398842491054




Best using:  {'n_estimators': 171, 'max_features': 'sqrt', 'max_depth': 11} Score:  -23.048825011484602


 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 9/10 [1:02:11<06:40, 400.34s/it]

Training RMSE:  2.485851364399492 Testing RMSE:  26.903785132498673
Best using:  {'n_estimators': 161, 'max_features': 'sqrt', 'max_depth': 71} Score:  -27.60121820926791
Training RMSE:  5.749986089023684 Testing RMSE:  25.108467323610665


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [1:10:07<00:00, 420.80s/it]


communities


 10%|██████████████████▋                                                                                                                                                                        | 1/10 [00:26<03:57, 26.41s/it]

Best using:  {'n_estimators': 121, 'max_features': 'sqrt', 'max_depth': 1} Score:  -0.14312608905519636
Training RMSE:  0.1306474428214467 Testing RMSE:  0.12170583453731353
Best using:  {'n_estimators': 81, 'max_features': 'auto', 'max_depth': 1} Score:  -0.145547269910109


 20%|█████████████████████████████████████▍                                                                                                                                                     | 2/10 [00:44<02:51, 21.40s/it]

Training RMSE:  0.13262943944072883 Testing RMSE:  0.1430938969852826
Best using:  {'n_estimators': 161, 'max_features': 'log2', 'max_depth': 41} Score:  -0.1418513838626646


 30%|████████████████████████████████████████████████████████                                                                                                                                   | 3/10 [01:03<02:23, 20.45s/it]

Training RMSE:  1.688777551649027e-08 Testing RMSE:  0.1342105238728362
Best using:  {'n_estimators': 181, 'max_features': 'sqrt', 'max_depth': 21} Score:  -0.14161138665080936


 40%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                | 4/10 [01:09<01:28, 14.81s/it]

Training RMSE:  1.4886016913771992e-08 Testing RMSE:  0.13605511379847943
Best using:  {'n_estimators': 131, 'max_features': 'sqrt', 'max_depth': 81} Score:  -0.13994564458526754


 50%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 5/10 [01:25<01:15, 15.04s/it]

Training RMSE:  2.2662417414693346e-07 Testing RMSE:  0.16071669361072824
Best using:  {'n_estimators': 51, 'max_features': 'sqrt', 'max_depth': 51} Score:  -0.14156602461989473


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 6/10 [01:48<01:11, 17.78s/it]

Training RMSE:  0.0010652240503808577 Testing RMSE:  0.14432630087879608


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 7/10 [02:11<00:58, 19.41s/it]

Best using:  {'n_estimators': 161, 'max_features': 'log2', 'max_depth': 1} Score:  -0.14287723770310373
Training RMSE:  0.1259743322377415 Testing RMSE:  0.1489223719551686
Best using:  {'n_estimators': 61, 'max_features': 'sqrt', 'max_depth': 61} Score:  -0.1475987889645713


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 8/10 [02:36<00:42, 21.42s/it]

Training RMSE:  0.000388773307100021 Testing RMSE:  0.1348317855409102
Best using:  {'n_estimators': 91, 'max_features': 'log2', 'max_depth': 81} Score:  -0.13786641460119667


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 9/10 [02:46<00:17, 17.78s/it]

Training RMSE:  1.5780753133960816e-05 Testing RMSE:  0.14528202714295202
Best using:  {'n_estimators': 191, 'max_features': 'sqrt', 'max_depth': 21} Score:  -0.1447650917171762


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:04<00:00, 18.44s/it]


Training RMSE:  1.489307173373566e-08 Testing RMSE:  0.12758294138124657
dengue_features


  0%|                                                                                                                                                                                                   | 0/10 [00:00<?, ?it/s]

Best using:  {'n_estimators': 131, 'max_features': 'log2', 'max_depth': 71} Score:  -39.438720110187106


 10%|██████████████████▋                                                                                                                                                                        | 1/10 [00:03<00:35,  3.96s/it]

Training RMSE:  4.6531776694340955e-05 Testing RMSE:  27.434767803312475
Best using:  {'n_estimators': 171, 'max_features': 'log2', 'max_depth': 21} Score:  -36.240445661330796


 20%|█████████████████████████████████████▍                                                                                                                                                     | 2/10 [00:06<00:26,  3.34s/it]

Training RMSE:  1.1987488705000524 Testing RMSE:  39.98208102740179


 30%|████████████████████████████████████████████████████████                                                                                                                                   | 3/10 [00:10<00:24,  3.47s/it]

Best using:  {'n_estimators': 101, 'max_features': 'log2', 'max_depth': 1} Score:  -37.76719449060721
Training RMSE:  37.06368443852825 Testing RMSE:  46.7386031151435
Best using:  {'n_estimators': 111, 'max_features': 'log2', 'max_depth': 21} Score:  -35.338749007969604


 40%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                | 4/10 [00:15<00:23,  3.92s/it]

Training RMSE:  0.5248685370588169 Testing RMSE:  41.338789839915925


 50%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 5/10 [00:21<00:24,  4.85s/it]

Best using:  {'n_estimators': 31, 'max_features': 'sqrt', 'max_depth': 21} Score:  -37.408656049874274
Training RMSE:  2.117691241143674 Testing RMSE:  32.88213795471596


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 6/10 [00:25<00:18,  4.53s/it]

Best using:  {'n_estimators': 61, 'max_features': 'sqrt', 'max_depth': 71} Score:  -37.62027343447145
Training RMSE:  0.6979563176814227 Testing RMSE:  31.256894901272517


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 7/10 [00:26<00:10,  3.46s/it]

Best using:  {'n_estimators': 31, 'max_features': 'log2', 'max_depth': 51} Score:  -38.01693035663527
Training RMSE:  2.037663752397105 Testing RMSE:  35.00784358180498
Best using:  {'n_estimators': 171, 'max_features': 'log2', 'max_depth': 41} Score:  -36.86162550414535


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 8/10 [00:30<00:06,  3.49s/it]

Training RMSE:  0.7266802230839804 Testing RMSE:  32.11924480889326
Best using:  {'n_estimators': 191, 'max_features': 'log2', 'max_depth': 21} Score:  -37.71462899537844


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 9/10 [00:36<00:04,  4.26s/it]

Training RMSE:  0.5248683902019885 Testing RMSE:  30.120679040548996


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:42<00:00,  4.23s/it]


Best using:  {'n_estimators': 61, 'max_features': 'sqrt', 'max_depth': 91} Score:  -33.88349878522221
Training RMSE:  1.1806864068763019 Testing RMSE:  35.76337203192187
ECG0_p02


  0%|                                                                                                                                                                                                   | 0/10 [00:00<?, ?it/s]

Best using:  {'n_estimators': 91, 'max_features': 'auto', 'max_depth': 61} Score:  -0.0690964430678539
Training RMSE:  1.8510763647294875e-05 Testing RMSE:  0.06967697226795948


 10%|██████████████████                                                                                                                                                                  | 1/10 [1:13:18<10:59:50, 4398.91s/it]

Best using:  {'n_estimators': 21, 'max_features': 'auto', 'max_depth': 21} Score:  -0.06414224120531849


 20%|████████████████████████████████████                                                                                                                                                | 2/10 [2:29:58<10:02:14, 4516.85s/it]

Training RMSE:  0.0304691147403607 Testing RMSE:  0.06083413950249206




Best using:  {'n_estimators': 121, 'max_features': 'auto', 'max_depth': 11} Score:  -0.04217919638348818
Training RMSE:  0.014772922394319888 Testing RMSE:  0.039280194463257985




Best using:  {'n_estimators': 41, 'max_features': 'auto', 'max_depth': 51} Score:  -0.06739181459154973
Training RMSE:  0.003615806223568278 Testing RMSE:  0.06565385564838967




Best using:  {'n_estimators': 171, 'max_features': 'auto', 'max_depth': 11} Score:  -0.041528938269330525
Training RMSE:  0.012340629386145234 Testing RMSE:  0.038452463503713126




Best using:  {'n_estimators': 171, 'max_features': 'auto', 'max_depth': 21} Score:  -0.054976713681847846
Training RMSE:  4.251096772325372e-05 Testing RMSE:  0.0532144792818499




Best using:  {'n_estimators': 171, 'max_features': 'auto', 'max_depth': 11} Score:  -0.04228909379874089
Training RMSE:  0.0124227644492228 Testing RMSE:  0.040941157676998005




Best using:  {'n_estimators': 31, 'max_features': 'auto', 'max_depth': 41} Score:  -0.06957859321630888
Training RMSE:  0.01031574067234901 Testing RMSE:  0.06633180499075986


 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 8/10 [10:47:39<2:43:58, 4919.46s/it]

Best using:  {'n_estimators': 81, 'max_features': 'auto', 'max_depth': 91} Score:  -0.06941493296375374
Training RMSE:  5.3161946502119324e-05 Testing RMSE:  0.06489447276126996




Best using:  {'n_estimators': 131, 'max_features': 'auto', 'max_depth': 11} Score:  -0.041928161799301776
Training RMSE:  0.01412677749325202 Testing RMSE:  0.04075374283673487


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [12:23:17<00:00, 4459.78s/it]


ENERGY_DATA_COMPLETE


  0%|                                                                                                                                                                                                   | 0/10 [00:00<?, ?it/s]

Best using:  {'n_estimators': 181, 'max_features': 'log2', 'max_depth': 91} Score:  -73.35087499009006


 10%|██████████████████▌                                                                                                                                                                       | 1/10 [02:49<25:21, 169.01s/it]

Training RMSE:  5.27912616663241e-07 Testing RMSE:  71.53234689349146
Best using:  {'n_estimators': 171, 'max_features': 'log2', 'max_depth': 71} Score:  -73.43609914976022


 20%|█████████████████████████████████████▏                                                                                                                                                    | 2/10 [05:08<20:14, 151.85s/it]

Training RMSE:  1.559456582935691e-06 Testing RMSE:  65.70124757419546
Best using:  {'n_estimators': 81, 'max_features': 'log2', 'max_depth': 31} Score:  -73.35738757770949


 30%|███████████████████████████████████████████████████████▊                                                                                                                                  | 3/10 [08:53<21:37, 185.31s/it]

Training RMSE:  0.020189715555564123 Testing RMSE:  70.6270430880004
Best using:  {'n_estimators': 171, 'max_features': 'log2', 'max_depth': 91} Score:  -73.8452705256138


 40%|██████████████████████████████████████████████████████████████████████████▍                                                                                                               | 4/10 [11:38<17:42, 177.16s/it]

Training RMSE:  1.5460606300968864e-06 Testing RMSE:  72.01327038369082
Best using:  {'n_estimators': 111, 'max_features': 'log2', 'max_depth': 31} Score:  -71.98186727426842


 50%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 5/10 [14:11<14:02, 168.47s/it]

Training RMSE:  0.0008640797652635563 Testing RMSE:  66.91624363974825
Best using:  {'n_estimators': 91, 'max_features': 'log2', 'max_depth': 51} Score:  -72.82134333092456


 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 6/10 [16:03<09:56, 149.22s/it]

Training RMSE:  0.007030486207874158 Testing RMSE:  71.16159838643975
Best using:  {'n_estimators': 71, 'max_features': 'log2', 'max_depth': 71} Score:  -75.7090046717962


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 7/10 [17:48<06:44, 134.70s/it]

Training RMSE:  0.059294412106086956 Testing RMSE:  65.77571786972247
Best using:  {'n_estimators': 51, 'max_features': 'log2', 'max_depth': 91} Score:  -72.54082966798448


 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 8/10 [18:36<03:34, 107.30s/it]

Training RMSE:  0.46985066863732633 Testing RMSE:  72.07983695803388
Best using:  {'n_estimators': 191, 'max_features': 'sqrt', 'max_depth': 11} Score:  -73.58673744950609


 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 9/10 [21:34<02:09, 129.32s/it]

Training RMSE:  5.7506112577072654 Testing RMSE:  71.16385003943756
Best using:  {'n_estimators': 191, 'max_features': 'sqrt', 'max_depth': 31} Score:  -73.46909608921402


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [24:02<00:00, 144.24s/it]

Training RMSE:  2.132407021622937e-07 Testing RMSE:  76.62837236675169





In [None]:
# normal Random Forest with k fold -> do not need this

data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
datasets = [
    # "Facebook_data",
    # "Features_TestSet",
    # "House_Price_Adv_Regression",
    # "Instant_Liking",
    # "Insurance",
    # "Isolet",
    # "new_data_trans",
    # "OnlineNewsPopularity",
    "ParkinsonData",
    # "Sberbank_Russian_Housing_Market",
    # "slice_localization_data",
    # "Telecom_data",
    # "yearMSD_new",
    # "arrhythmia",
    # "Big_mart_sales",
    # "blogData",
    # "communities",
    # "dengue_features",
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE"
]

x_column_drop = {
    # "Facebook_data": ['Total.Interactions'],
    # "Features_TestSet": ['Unnamed: 0', 'Target'],
    # "House_Price_Adv_Regression": ['SalePrice'],
    # "Instant_Liking": ['Instant.Liking'] -> NaN needs to be fixed,
    # "Insurance": ['charges'],
    # "Isolet": ['Target'] -> some random error, need to see,
    # "new_data_trans": ['X23.Humedad_Exterior_Sensor'],
    # "OnlineNewsPopularity": ['shares'],
    "ParkinsonData": ['Unnamed: 0','total_UPDRS'],
    "Sberbank_Russian_Housing_Market": ['Unnamed: 0', 'price_doc'],
    "slice_localization_data": ['Unnamed: 0', 'reference'],
    # "Telecom_data": ['Churned.Label'],
    # "yearMSD_new": ['Year'],
    # "arrhythmia": ['Defection'],
    # "Big_mart_sales": ['Item_Outlet_Sales'],
    # "blogData": ['Comments'],
    # "communities": ['ViolentCrimesPerPop'],
    # "dengue_features": ['total_cases'],
    # "ECG0_p02": ['CurrentValue'],
    # "ENERGY_DATA_COMPLETE": ['Appliances']
}

target_column = {
    # "Facebook_data": ['Total.Interactions'],
    # "Features_TestSet": ['Target'],
    # "House_Price_Adv_Regression": ['SalePrice'],
    # "Instant_Liking": ['Instant.Liking'] -> NaN needs to be fixed,
    # "Insurance": ['charges'],
    # "Isolet": ['Target'] -> some random error, need to see,
    # "new_data_trans": ['X23.Humedad_Exterior_Sensor'],
    # "OnlineNewsPopularity": ['shares'],
    "ParkinsonData": ['total_UPDRS'],
    "Sberbank_Russian_Housing_Market": ['price_doc'],
    "slice_localization_data": ['reference'],
    # "Telecom_data": ['Churned.Label'],
    # "yearMSD_new": ['Year'],
    # "arrhythmia": ['Defection'],
    # "Big_mart_sales": ['Item_Outlet_Sales'],
    # "blogData": ['Comments'],
    # "communities": ['ViolentCrimesPerPop'],
    # "dengue_features": ['total_cases'],
    # "ECG0_p02": ['CurrentValue'],
    # "ENERGY_DATA_COMPLETE": ['Appliances']
}

# dataset = "Big_mart_sales"
seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(seed) + '_modified.csv')
        df_train = pd.read_csv(csv_path_train)
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        df_test = pd.read_csv(csv_path_test)
        
        X = df_train.drop(x_column_drop[dataset], axis = 1)
        y = df_train[target_column[dataset]]
        X_test = df_test.drop(x_column_drop[dataset], axis = 1)
        y_test = df_test[target_column[dataset]]
        
        details = {
            'dataset': dataset,
            'seed': str(seed)
        }
        
        model_DT = DT_pred(X, y, details)
        model_RF = RF_pred_kfold(X, y, details)
        
        importances = model_RF.feature_importances_
        indices = np.argsort(importances)[::-1]
        top_k = 10
        top_indices = indices[:top_k]
        details['best_feature_list'] = np.array(X.columns)[indices][0:top_k]
        
        y_pred = model_RF.predict(X_test)
        validation_rmse = mean_squared_error(y_test, y_pred, squared=False)
        details['validation_rmse'] = validation_rmse
        
        df = df.append(details, ignore_index=True)
        filepath = Path(dataset + '_RF_kfold.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)

In [None]:
# Split dataset into training and validation sets

data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
datasets = [
    # "Facebook_data",
    # "Features_TestSet",
    # "House_Price_Adv_Regression",
    # "Instant_Liking",
    # "Insurance",
    "Isolet",
    # "new_data_trans",
    # "OnlineNewsPopularity",
    # "ParkinsonData",
    # "Sberbank_Russian_Housing_Market",
    # "slice_localization_data",
    # "Telecom_data",
    # "yearMSD_new",
    # "arrhythmia",
    # "Big_mart_sales",
    # "blogData",
    # "communities",
    # "dengue_features",
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE"
]

train_split = 0.8

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        data = []
        # print(os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(seed) + '_modified.csv'))
        for each_line in open(os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(seed) + '_modified.csv')):
            each_line = each_line.strip().split(',')
            data.append(each_line)
            
        headers = data.pop(0)
        
        random.shuffle(data)
        
        train_datapoints = int(train_split * len(data))
        train_set = [headers] + data[:train_datapoints]
        validation_set = [headers] + data[train_datapoints:]
        
        train_set_df = pd.DataFrame(train_set)
        train_set_df.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv'), index=False, header=False)
        
        validation_set_df = pd.DataFrame(validation_set)
        validation_set_df.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv'), index=False, header=False)
        

In [None]:
# dataset normalization
from sklearn.preprocessing import MinMaxScaler

data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
datasets = [
    # "Facebook_data",
    # "Features_TestSet",
    # "House_Price_Adv_Regression",
    # "Instant_Liking",
    # "Insurance",
    # "Isolet",
    # "new_data_trans",
    # "OnlineNewsPopularity",
    "ParkinsonData",
    # "Sberbank_Russian_Housing_Market",
    # "slice_localization_data",
    # "Telecom_data",
    # "yearMSD_new",
    # "arrhythmia",
    # "Big_mart_sales",
    # "blogData",
    # "communities",
    # "dengue_features",
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE"
]

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

for index, dataset in enumerate(datasets):
    df = pd.DataFrame()
    print(dataset)
    for seed in tqdm(seeds):
        csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv')
        csv_path_validation = os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv')
        csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv')
        
        train_set = []
        for each_line in open(csv_path_train):
            each_line = each_line.strip().split(',')
            train_set.append(each_line)
        headers = train_set.pop(0)
        
        validation_set = []
        for each_line in open(csv_path_validation):
            each_line = each_line.strip().split(',')
            validation_set.append(each_line)
        validation_set.pop(0)
        
        test_set = []
        for each_line in open(csv_path_test):
            each_line = each_line.strip().split(',')
            test_set.append(each_line)
        test_set.pop(0)
        
        scaler = MinMaxScaler()
        train_set = [headers] + scaler.fit_transform(train_set).tolist()        
        validation_set = [headers] + scaler.transform(validation_set).tolist()
        test_set = [headers] + scaler.transform(test_set).tolist()
        
        train_set_df = pd.DataFrame(train_set)
        train_set_df.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train_normalized.csv'), index=False, header=False)
        
        validation_set_df = pd.DataFrame(validation_set)
        validation_set_df.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation_normalized.csv'), index=False, header=False)
        
        test_set_df = pd.DataFrame(test_set)
        test_set_df.to_csv(os.path.join(data_dir, dataset, "Test", dataset + '_seed_' + str(seed) + '_test_normalized.csv'), index=False, header=False)

In [None]:
data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
dataset = "arrhythmia"
seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

for s in tqdm(seeds):
    csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(s) + '_modified.csv')
    df_train = pd.read_csv(csv_path_train)
    csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(s) + '_modified.csv')
    df_test = pd.read_csv(csv_path_test)
    X = df_train.drop(['Defection'], axis = 1)
    y = df_train[['Defection']]
    model_DT = DT_pred(X, y)
    
    model_RF = RF_pred(X, y)
    importances = model_RF.feature_importances_
    indices = np.argsort(importances)[::-1]
    top_k = 10
    top_indices = indices[:top_k]
    
    print("Feature_importance: \n")
    print(np.array(X.columns)[indices][0:top_k])
    print(importances[top_indices])


In [None]:
# Just need to drop one column and replace with a one-hot encoding

def process_big_mart(df_train):
    dummies = pd.get_dummies(df_train.Type)
    df_train = pd.concat([df_train, dummies], axis='columns')
    df_train = df_train.drop(['Type'], axis='columns')
    return df_train

def get_missing():
    missingValueColumns = datafrm.columns[datafrm.isnull().any()].tolist()
    percent_missing = datafrm[missingValueColumns].isnull().sum()
    print("Missing value count columnwise:")
    print(percent_missing)



data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
dataset = "Facebook_data"

seeds = [50, 100, 150, 200, 250, 300, 350, 400, 450]
for s in tqdm(seeds):
    csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(s) + '.csv')
    df_train = pd.read_csv(csv_path_train)
    csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(s) + '.csv')
    df_test = pd.read_csv(csv_path_test)
    
    df_train = process_big_mart(df_train)
    df_test = process_big_mart(df_test)
    df_train.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(s) + '_modified.csv'), index=False)
    df_test.to_csv(os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(s) + '_modified.csv'), index=False)
    

In [None]:
# No preprocessing required here

def process_big_mart(df_train):
    return df_train

def get_missing():
    missingValueColumns = datafrm.columns[datafrm.isnull().any()].tolist()
    percent_missing = datafrm[missingValueColumns].isnull().sum()
    print("Missing value count columnwise:")
    print(percent_missing)



data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
dataset = "Features_TestSet"

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]
for s in tqdm(seeds):
    csv_path_train = os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(s) + '.csv')
    df_train = pd.read_csv(csv_path_train)
    csv_path_test = os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(s) + '.csv')
    df_test = pd.read_csv(csv_path_test)
    
    df_train = process_big_mart(df_train)
    df_test = process_big_mart(df_test)
    df_train.to_csv(os.path.join(data_dir, dataset, "Train", dataset + '_Train_seed' + str(s) + '_modified.csv'), index=False)
    df_test.to_csv(os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(s) + '_modified.csv'), index=False)