# 2018-11-19 (3.91929)

In [None]:
import numpy as np
import pandas as pd
np.random.seed(0)

import tensorflow as tf
tf.set_random_seed(0)
from tensorflow.keras import layers, regularizers, optimizers
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor 

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import zscore
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder

In [None]:
def readDataset(filename):
    """Load the CSV at `filename` into a pandas DataFrame.

    Whitespace that directly follows a delimiter is skipped while parsing,
    so headers and values written as ``a, b`` come back without the
    leading blank.
    """
    frame = pd.read_csv(filename, skipinitialspace=True)
    return frame

In [None]:
def _zscore_or_zero(frame):
    """Return the column-wise z-scores of `frame` as a float ndarray.

    A column whose values are all identical has zero variance, for which
    `zscore` yields NaN; such columns are mapped to 0.0 instead. Columns
    that contain NaN are not treated as constant, so missing data still
    propagates as NaN.
    """
    values = np.asarray(frame, dtype=float)
    constant = (values == values[:1]).all(axis=0)
    varying = ~constant
    scores = np.zeros(values.shape, dtype=float)
    if varying.any():
        scores[:, varying] = zscore(values[:, varying], axis=0)
    return scores


def preprocessDataset(dataset, is_test=False):
    """Turn a raw table into z-scored model features plus the target.

    Steps, in order:
      1. one-hot encode the categorical columns;
      2. drop the "id", "l1_ratio", "scale" and "alpha" columns;
      3. replace ``n_jobs == -1`` by 16;
      4. add the product feature ``max_iter_n_samples``;
      5. (training only) drop every row in which any feature has |z| >= 3;
      6. split off the "time" column when it is present;
      7. z-score every remaining column.

    The z-score statistics are computed from the frame that is passed in,
    so separate calls (e.g. train and test) are scaled independently.
    The input frame is not modified.

    Args:
        dataset: DataFrame holding at least the columns "id", "l1_ratio",
            "scale", "alpha", "n_jobs", "max_iter" and "n_samples", plus
            "time" when `is_test` is False.
        is_test: when True, the outlier-removal step is skipped.

    Returns:
        A ``(features, time)`` tuple. `features` is a float DataFrame and
        `time` is the matching "time" Series, or None when the input has
        no "time" column.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Convert categorical variables to numerical. dtype=float keeps the
    # dummy columns numeric (newer pandas would otherwise emit bool), so
    # the whole frame converts to a plain float array for z-scoring.
    dataset = pd.get_dummies(dataset, dtype=float)

    dataset = dataset.drop(["id", "l1_ratio", "scale", "alpha"], axis=1)
    # NOTE(review): -1 presumably means "use all cores" and 16 the core
    # count of the benchmark machine -- confirm.
    dataset.loc[dataset["n_jobs"] == -1, ["n_jobs"]] = 16

    # Interaction feature. NOTE: the original author also tried products
    # with n_features / n_informative * n_classes and dividing features by
    # n_jobs; those experiments were disabled (the latter marked "this is
    # bad").
    max_iter_n_samples = dataset["max_iter"].values * dataset["n_samples"].values
    dataset["max_iter_n_samples"] = max_iter_n_samples

    # Remove outliers (training data only): keep the rows in which every
    # feature lies within 3 standard deviations of its column mean.
    if not is_test:
        z = np.abs(_zscore_or_zero(dataset.drop(["time"], axis=1)))
        dataset = dataset[(z < 3).all(axis=1)]

    time = None
    if "time" in dataset.columns:
        time = dataset["time"]
        dataset = dataset.drop(["time"], axis=1)

    # Z-score normalization; constant columns become 0.0 rather than NaN.
    dataset = pd.DataFrame(
        _zscore_or_zero(dataset),
        index=dataset.index,
        columns=dataset.columns,
    )

    return dataset, time
  
def testPreprocessDataset(dataset):
    """Alternative preprocessing: drop columns, one-hot encode, z-score.

    Compared with `preprocessDataset` this variant also drops
    "random_state" and "n_clusters_per_class", adds no product feature and
    removes no outliers. It is not called by any of the visible notebook
    cells. The input frame is not modified.

    Args:
        dataset: DataFrame holding at least the columns "id", "l1_ratio",
            "alpha", "random_state", "n_clusters_per_class", "scale" and
            "n_jobs"; "time" is optional.

    Returns:
        A ``(features, time)`` tuple. `features` is a float DataFrame of
        z-scored columns and `time` is the "time" Series, or None when the
        input has no "time" column.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    dataset = dataset.drop(
        ["id", "l1_ratio", "alpha", "random_state", "n_clusters_per_class", "scale"],
        axis=1,
    )
    # NOTE(review): -1 presumably means "use all cores" and 16 the core
    # count of the benchmark machine -- confirm.
    dataset.loc[dataset["n_jobs"] == -1, ["n_jobs"]] = 16

    time = None
    if "time" in dataset.columns:
        time = dataset["time"]
        dataset = dataset.drop(["time"], axis=1)

    # dtype=float keeps the dummy columns numeric (newer pandas would
    # otherwise emit bool columns).
    dataset = pd.get_dummies(dataset, dtype=float)

    # Column-wise z-score. Zero-variance columns would come out as NaN, so
    # they are set to 0.0 instead; columns containing NaN are left to
    # propagate it.
    values = np.asarray(dataset, dtype=float)
    varying = ~(values == values[:1]).all(axis=0)
    scores = np.zeros(values.shape, dtype=float)
    if varying.any():
        scores[:, varying] = zscore(values[:, varying], axis=0)
    dataset = pd.DataFrame(scores, index=dataset.index, columns=dataset.columns)
    return dataset, time


# The "test" name prefix matches pytest's default collection pattern; opt out
# so that importing this helper into a test module does not register it as a
# test case.
testPreprocessDataset.__test__ = False
    

In [None]:
def plot_corr(dataset, target, size=10):
    """Return a colour-graded correlation matrix of `dataset`.

    Args:
        dataset: DataFrame of features; it is copied, never modified.
        target: values stored as a "time" column when `dataset` has none.
            Pass None to correlate the dataset columns only.
        size: accepted for backward compatibility; currently unused.

    Returns:
        A pandas Styler wrapping the pairwise correlation matrix of the
        numeric (and boolean) columns, with a background gradient applied.
    """
    dataset_copy = dataset.copy(deep=True)
    if "time" not in dataset_copy.columns and target is not None:
        dataset_copy["time"] = target
    # Correlation is only defined for numeric data; newer pandas raises on
    # string columns instead of silently skipping them, so select the
    # usable columns explicitly.
    numeric = dataset_copy.select_dtypes(include=["number", "bool"])
    corr = numeric.corr()
    return corr.style.background_gradient()

In [None]:
# Load the combined training CSV and preview its first rows
df_full = readDataset(filename="./train-combined.csv")
df_full.head()

In [None]:
# featexp feature exploration -- disabled: the whole cell body is wrapped in a
# string literal, so none of it runs (the notebook only echoes the text).
"""
from featexp import get_univariate_plots, get_trend_stats

df_full = readDataset("/content/drive/My Drive/School/HKUST/MSBD 5001/KaggleCompetition/train-combined.csv")
df_full = df_full.drop(["id", "l1_ratio", "alpha", "random_state", "n_clusters_per_class", "scale"], axis=1)
df_full.loc[df_full["n_jobs"] == -1, ["n_jobs"]] = 16
ohe = OneHotEncoder()
df_full = pd.get_dummies(df_full)
X_train, X_test = train_test_split(df_full, test_size=0.25)

#df_full = df_full[df_full["flip_y"] < 0.09]

# Plots drawn for all features if nothing is passed in feature_list parameter.
#get_univariate_plots(data=df_full, target_col='time', bins=20, features_list=['scale'])
get_univariate_plots(data=X_train, target_col='time', bins=20, features_list=['flip_y'], data_test=X_test)

#get_trend_stats(data=X_train, target_col='time', data_test=X_test)
"""

In [None]:
# Correlation matrix of the raw training table, without the "id" column
plot_corr(dataset=df_full.drop(columns=["id"]), target=None)

In [None]:
# Build the model inputs: normalized features in df, target values in time
df, time = preprocessDataset(dataset=df_full)
df.head()

In [None]:
# Correlation matrix of the preprocessed features together with the target
plot_corr(dataset=df, target=time)

In [None]:
# Plain arrays for Keras, plus a reproducible 80/20 hold-out split
X, y = df.values, time.values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)
X.shape

In [None]:
# Keras sequential model
def build_model(num_units=9, learning_rate=0.01, l2_penalty=0.001):
    """Build and compile the regression network.

    The network is one L2-regularized ReLU hidden layer followed by a
    single linear output unit, trained with Adam on mean squared error.
    Called without arguments it reproduces the original fixed
    configuration (9 units, learning rate 0.01, L2 factor 0.001).

    Args:
        num_units: number of units in the hidden layer.
        learning_rate: step size handed to the Adam optimizer.
        l2_penalty: L2 regularization factor for the hidden-layer kernel.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    # NOTE: deeper stacks (num_units//2, num_units//4), Dropout(0.2) and
    # other optimizer settings (Adam with decay/amsgrad, Adagrad) were tried
    # by the original author and left disabled.
    model = tf.keras.Sequential([
        layers.Dense(
            num_units,
            kernel_regularizer=regularizers.l2(l2_penalty),
            activation='relu',
        ),
        # Add a layer with 1 output unit:
        layers.Dense(1),
    ])

    # `lr` is the keyword spelling this notebook was written against.
    model.compile(optimizer=optimizers.Adam(lr=learning_rate),
                  loss='mse',       # mean squared error
                  metrics=['mse'])  # mean squared error

    return model

In [None]:
# Train a fresh network on every preprocessed training row (no hold-out)
model = build_model()
history = model.fit(x=X, y=y, batch_size=9, epochs=40)
# Options tried earlier and left disabled:
#   validation_data=(X_test, y_test)
#   validation_split=0.2
#   verbose=0

In [None]:
# Summarize the per-epoch mean squared error recorded by model.fit
recorded = history.history
# NOTE(review): some Keras versions store the metric under 'mse' instead of
# 'mean_squared_error' -- fall back to that key when the long one is absent.
train_key = 'mean_squared_error' if 'mean_squared_error' in recorded else 'mse'
val_key = 'val_' + train_key

plt.plot(recorded[train_key][:])
legend_labels = ['train']
# Validation values only exist when fit() was given validation data; skip
# the second curve instead of failing with a KeyError.
if val_key in recorded:
    plt.plot(recorded[val_key][:])
    legend_labels.append('test')

# The plotted quantity is an error, not an accuracy.
plt.title('model mean squared error')
plt.ylabel('mean squared error')
plt.xlabel('epoch')
plt.legend(legend_labels, loc='upper left')
plt.show()

In [None]:
# PREDICTION
df_test = readDataset("./test.csv")
# is_test=True skips outlier removal so that every test row gets a prediction.
# NOTE: this rebinds `time` to whatever the test file provides (None when
# test.csv has no "time" column).
df_test, time = preprocessDataset(df_test, True)
# One-hot encoding is done per file, so a category that is missing from (or
# only present in) test.csv would change the number or order of columns.
# Align with the training features the model was fit on; a column absent
# from the test data is filled with 0.0, the mean of a z-scored feature.
df_test = df_test.reindex(columns=df.columns, fill_value=0.0)
#df_test.head()
X_test = df_test.values

In [None]:
# Collapse the (n_rows, 1) network output into a 1-D vector of predictions
y_pred = np.ravel(model.predict(X_test))

In [None]:
# Output to csv
import datetime

# Date stamp (YYYYMMDD) used in the submission file name
d = datetime.datetime.today().strftime('%Y%m%d')

# Single "time" column; absolute values keep every prediction non-negative
output = pd.DataFrame({"time": y_pred}).abs()

filename = "".join(["./submission-", d, ".csv"])
output.to_csv(path_or_buf=filename, index_label="Id")