In [None]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from pathlib import Path
import csv
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# import lightgbm as lgb

from sklearn.metrics import (precision_score, recall_score, roc_auc_score, accuracy_score, mean_squared_error,
                             confusion_matrix, precision_recall_curve, roc_curve, brier_score_loss)

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [None]:
# Data preprocessing

# Per-dataset preprocessing settings. The three dicts below share the same keys
# (dataset names) in the same order; each is written as "default for everyone,
# then the exceptions" so the non-default datasets stand out.

# Feature columns removed before encoding. None means nothing is dropped
# (process_dataset skips the drop when the value is falsy).
columns_drop = dict.fromkeys([
    "Facebook_data",
    "Features_TestSet",
    "House_Price_Adv_Regression",
    "Instant_Liking",
    "Insurance",
    "Isolet",
    "new_data_trans",
    "OnlineNewsPopularity",
    "ParkinsonData",
    "Sberbank_Russian_Housing_Market",
    "slice_localization_data",
    "Telecom_data",
    "yearMSD_new",
    "arrhythmia",
    "Big_mart_sales",
    "blogData",
    "communities",
    "dengue_features",
    "ECG0_p02",
    "ENERGY_DATA_COMPLETE",
])
# Updating existing keys keeps their original position in the dict.
columns_drop.update({
    "House_Price_Adv_Regression": ['Id'],
    "ParkinsonData": ['subject.'],
    "Sberbank_Russian_Housing_Market": ['id'],
    "Telecom_data": ['Phone.Number'],
    "Big_mart_sales": ['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'],
    "communities": ['x4', 'x1'],
    "dengue_features": ['year', 'weekofyear', 'week_start_date'],
})

# Whether missing values are filled with a placeholder for this dataset.
fill_na = dict.fromkeys(columns_drop, False)
fill_na["Big_mart_sales"] = True

# Columns whose missing values get the placeholder (only read when fill_na is True).
# A comprehension is used so that every dataset owns its own list object.
fill_na_column = {dataset_name: [] for dataset_name in columns_drop}
fill_na_column["Big_mart_sales"] = ['Outlet_Size']

def process_dataset(df_train, df_test, columns_drop, fill_na, fill_na_columns):
    """One-hot encode a train/test pair so both end up with the same columns.

    The two frames are stacked, the last column is set aside as the target,
    the remaining feature columns are optionally NaN-filled and pruned, then
    one-hot encoded in a single pass. The target is re-attached as the last
    column and the stacked frame is cut back into its train and test parts.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames whose last column is the target.
    columns_drop : list of str or None
        Feature columns to remove before encoding; a falsy value skips the drop.
    fill_na : bool
        When true, missing values in ``fill_na_columns`` are replaced by the
        string ``"NOT_PRESENT"`` so that they get their own dummy column.
    fill_na_columns : list of str
        Columns to fill; only read when ``fill_na`` is true.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with ``df_train.shape[0]`` rows in the first frame
        and the remaining rows in the second. Original index labels are kept.

    Raises
    ------
    KeyError
        If a name in ``fill_na_columns`` or ``columns_drop`` is not a feature column.
    """
    # Remember the split point up front instead of rebinding the parameter.
    n_train = df_train.shape[0]

    df_total = pd.concat([df_train, df_test], axis=0)
    tar = df_total.iloc[:, -1]
    # Work on an independent copy so the column assignments below are unambiguous.
    features = df_total.iloc[:, :-1].copy()

    if fill_na:
        for column in fill_na_columns:
            # Assign the result back: `frame[col].fillna(..., inplace=True)` acts on a
            # temporary and leaves the frame untouched under pandas Copy-on-Write.
            features[column] = features[column].fillna("NOT_PRESENT")

    if columns_drop:
        features = features.drop(columns_drop, axis='columns')

    # With no `columns` argument get_dummies encodes every object/string/category
    # column and prefixes each dummy with its source column name. The earlier call
    # passed the column list positionally, where it was taken as `prefix`.
    features = pd.get_dummies(features)

    df_total = pd.concat([features, tar], axis=1)
    return df_total.iloc[:n_train, :], df_total.iloc[n_train:, :]

# Root folder with one sub-folder per dataset, each holding "Train" and "Test" folders.
data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"

# Datasets handled by this cell; uncomment an entry to include it in the run.
datasets = [
    # "Facebook_data",
    # "Features_TestSet",
    # "House_Price_Adv_Regression",
    # "Instant_Liking",
    # "Insurance",
    "Isolet",
    # "new_data_trans",
    # "OnlineNewsPopularity",
    # "ParkinsonData",
    # "Sberbank_Russian_Housing_Market",
    # "slice_localization_data",
    # "Telecom_data",
    # "yearMSD_new",
    # "arrhythmia",
    # "Big_mart_sales",
    # "blogData",
    # "communities",
    # "dengue_features",
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE"
]

# NOTE(review): the original note here read "Fill missing for Instant_Liking";
# presumably a pending TODO, since fill_na is False for that dataset - confirm.

# One pre-generated Train/Test CSV pair exists per seed value.
seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

for index, dataset in enumerate(datasets):
    print(dataset)
    train_dir = os.path.join(data_dir, dataset, "Train")
    test_dir = os.path.join(data_dir, dataset, "Test")

    for seed in tqdm(seeds):
        csv_path_train = os.path.join(train_dir, f"{dataset}_Train_seed{seed}.csv")
        csv_path_test = os.path.join(test_dir, f"{dataset}_Test_seed{seed}.csv")

        df_train = pd.read_csv(csv_path_train)
        df_test = pd.read_csv(csv_path_test)

        # Encode both splits together so they share the same dummy columns.
        df_train, df_test = process_dataset(
            df_train,
            df_test,
            columns_drop[dataset],
            fill_na[dataset],
            fill_na_column[dataset],
        )

        # The processed frames are written next to the raw files with a "_modified" suffix.
        df_train.to_csv(os.path.join(train_dir, f"{dataset}_Train_seed{seed}_modified.csv"), index=False)
        df_test.to_csv(os.path.join(test_dir, f"{dataset}_Test_seed{seed}_modified.csv"), index=False)

In [None]:
# Split dataset into training and validation sets

# Root folder with one sub-folder per dataset; the "_modified" training files
# written by the preprocessing cell are read from <dataset>/Train.
data_dir = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"

# Datasets handled by this cell; uncomment an entry to include it in the run.
datasets = [
    # "Facebook_data",
    # "Features_TestSet",
    # "House_Price_Adv_Regression",
    # "Instant_Liking",
    # "Insurance",
    "Isolet",
    # "new_data_trans",
    # "OnlineNewsPopularity",
    # "ParkinsonData",
    # "Sberbank_Russian_Housing_Market",
    # "slice_localization_data",
    # "Telecom_data",
    # "yearMSD_new",
    # "arrhythmia",
    # "Big_mart_sales",
    # "blogData",
    # "communities",
    # "dengue_features",
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE"
]

# Fraction of the rows of each file that goes to the training part;
# the remaining rows form the validation part.
train_split = 0.8

seeds = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450]

for dataset in datasets:
    print(dataset)
    train_dir = os.path.join(data_dir, dataset, "Train")

    for seed in tqdm(seeds):
        modified_path = os.path.join(train_dir, dataset + '_Train_seed' + str(seed) + '_modified.csv')

        # Parse with the csv module (not str.split) so quoted fields that contain
        # commas stay intact, and use a context manager so the handle is closed.
        # The file was written by DataFrame.to_csv, whose default encoding is UTF-8.
        with open(modified_path, newline='', encoding='utf-8') as csv_file:
            data = [row for row in csv.reader(csv_file) if row]  # skip blank lines

        if not data:
            raise ValueError("No header row found in " + modified_path)
        headers = data.pop(0)

        # Shuffle with a generator seeded by the file's own seed so that re-running
        # the cell reproduces the same train/validation split.
        random.Random(seed).shuffle(data)

        train_datapoints = int(train_split * len(data))
        train_set = [headers] + data[:train_datapoints]
        validation_set = [headers] + data[train_datapoints:]

        # The header is the first row of each list, hence header=False on write.
        train_set_df = pd.DataFrame(train_set)
        train_set_df.to_csv(os.path.join(train_dir, dataset + '_seed_' + str(seed) + '_train.csv'), index=False, header=False)

        validation_set_df = pd.DataFrame(validation_set)
        validation_set_df.to_csv(os.path.join(train_dir, dataset + '_seed_' + str(seed) + '_validation.csv'), index=False, header=False)