In [None]:
# Import libraries
import pandas as pd
import numpy as np
from numpy import matlib
from scipy import stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, silhouette_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
# Data preprocessing functions
def deleteMissingValues(dataFrame):
    """Return the data as a NumPy array with every NaN-containing row removed.

    Parameters
    ----------
    dataFrame : array-like
        Numeric table (e.g. a pandas DataFrame or a 2-D array); it is
        converted with ``np.array``.

    Returns
    -------
    numpy.ndarray
        Copy of the input without the rows that hold at least one NaN.
        When nothing is missing, all rows are returned unchanged.
    """
    dataTr = np.array(dataFrame)

    # characterize missing values
    # NaN never compares equal to anything (itself included), so an equality
    # test cannot locate it; the np.isnan mask is the reliable way to find it.
    nanMask = np.isnan(dataTr)

    # delete missing values
    if nanMask.any():
        # first index of every NaN position is its row; de-duplicate rows that
        # contain more than one NaN
        rowDelete = np.unique(np.argwhere(nanMask)[:, 0])
        dataTr = np.delete(dataTr, rowDelete, axis=0)

    return dataTr

def errorCorrection(data, nSigma=4):
    """Drop rows in which any feature lies too far from its column mean.

    The last column is treated as the target and is never inspected; every
    other column is a feature. A row is removed when at least one of its
    feature values deviates from that feature's mean by more than ``nSigma``
    (population) standard deviations.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array laid out as ``[features..., target]``.
    nSigma : float, optional
        Outlier threshold in standard deviations. Defaults to 4.

    Returns
    -------
    numpy.ndarray
        Copy of ``data`` without the outlier rows.
    """
    dataFeatures = data[:, :-1]
    dataFtMean = np.mean(dataFeatures, axis=0)
    dataFtStd = np.std(dataFeatures, axis=0)

    # Broadcasting compares every row against the per-column threshold, so the
    # threshold does not need to be tiled to the full data shape.
    isOutlier = np.abs(dataFeatures - dataFtMean) > nSigma * dataFtStd
    rowDelete = np.flatnonzero(isOutlier.any(axis=1))

    dataCoE = np.delete(data, rowDelete, axis=0)

    return dataCoE

def rescaleData(data):
    """Min-max scale the feature columns and re-attach the untouched target.

    All columns except the last are passed through a freshly fitted
    ``MinMaxScaler``; the last column is appended back unchanged as the final
    column of the result.
    """
    rowCount, _ = np.shape(data)

    featureBlock = data[:, 0:-1]
    # keep the target as an explicit column vector so it can be joined column-wise
    targetColumn = np.reshape(data[:, -1], [rowCount, 1])

    scaledFeatures = MinMaxScaler().fit_transform(featureBlock)

    return np.concatenate((scaledFeatures, targetColumn), axis=1)

In [None]:
# Data preprocessing
def _frameToCsv(values, columnNames, csvPath):
    """Wrap an array in a DataFrame, write it to csvPath, and return the frame."""
    frame = pd.DataFrame(values, columns=columnNames)
    frame.to_csv(csvPath, index=False, header=True)
    return frame

dfTr = pd.read_csv('train.csv')
# column headers are reused for every derived table
featureNames = dfTr.columns.tolist()

dataTr = deleteMissingValues(dfTr)

# outlier-filtered data
dataTrCoE = errorCorrection(dataTr)
dfCoE = _frameToCsv(dataTrCoE, featureNames, 'train_CoE.csv')

# min-max scaled data
dataTrNormd = rescaleData(dataTr)
dfNormd = _frameToCsv(dataTrNormd, featureNames, 'train_Normd.csv')

# outlier-filtered, then min-max scaled data
dataTrCoENormd = rescaleData(dataTrCoE)
dfCoENormd = _frameToCsv(dataTrCoENormd, featureNames, 'train_CoE_Normd.csv')

In [None]:
# Random forest error calculation
def RFRegressorErrorCalc(X, Y, splitNum, randomState=None):
    """Collect per-sample squared errors of a random forest over K folds.

    For each of the ``splitNum`` consecutive (unshuffled) folds a new
    ``RandomForestRegressor`` is fitted on the training part, and the squared
    prediction errors on the training and on the held-out part are recorded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    Y : pandas.Series
        Target values aligned with the rows of ``X``.
    splitNum : int
        Number of folds handed to ``KFold``.
    randomState : int or None, optional
        Seed forwarded to every ``RandomForestRegressor``. The default
        ``None`` keeps the forests unseeded; pass an integer to make repeated
        calls (e.g. on raw vs. rescaled data) differ only through the data.

    Returns
    -------
    inSampleError : numpy.ndarray
        Squared training errors of all folds, concatenated in fold order.
    outSampleError : numpy.ndarray
        Squared held-out errors of all folds, concatenated in fold order.
    """
    inSampleFolds = []
    outSampleFolds = []
    kf = KFold(n_splits=splitNum)
    for train, test in kf.split(X, Y):
        XTrain, YTrain = X.iloc[train, :], Y.iloc[train]
        XTest, YTest = X.iloc[test, :], Y.iloc[test]

        # squared-error criterion is the estimator's default
        RFRegr = RandomForestRegressor(random_state=randomState)
        RFRegr.fit(XTrain, YTrain)

        inSampleFolds.append(np.ravel((RFRegr.predict(XTrain) - YTrain) ** 2))
        outSampleFolds.append(np.ravel((RFRegr.predict(XTest) - YTest) ** 2))

    # join once at the end instead of re-allocating the result on every fold
    inSampleError = np.concatenate(inSampleFolds)
    outSampleError = np.concatenate(outSampleFolds)

    return inSampleError, outSampleError

In [None]:
# train.csv is read back unfiltered, whereas train_Normd.csv was produced after
# the missing-value step. The paired t-test below matches errors row by row, so
# incomplete rows are dropped from both frames to keep them on the same rows.
dfRaw = pd.read_csv('train.csv').dropna()
dfNormd = pd.read_csv('train_Normd.csv').dropna()
if len(dfRaw) != len(dfNormd):
    raise ValueError(
        "raw and normalized data have different row counts ("
        + str(len(dfRaw)) + " vs " + str(len(dfNormd))
        + "); the paired t-test needs row-aligned samples"
    )


# random forest regression, raw data & normalized data
XRaw = dfRaw.drop(columns='target')
YRaw = dfRaw['target']
XNormd = dfNormd.drop(columns='target')
YNormd = dfNormd['target']
cvNum = 5

# errors (per-sample squared errors, concatenated over the folds)
rawDataRFInSampleError, rawDataRFOutSampleError = RFRegressorErrorCalc(XRaw, YRaw, cvNum)
normdDataRFInSampleError, normdDataRFOutSampleError = RFRegressorErrorCalc(XNormd, YNormd, cvNum)

# paired t-test: index 0 = in-sample comparison, index 1 = out-of-sample comparison
errorCmpRes = [
    stats.ttest_rel(rawDataRFInSampleError, normdDataRFInSampleError),
    stats.ttest_rel(rawDataRFOutSampleError, normdDataRFOutSampleError),
]
print("Raw Data & Normalized Data In-sample Error T-Test: \n", "T-statistic =", round(errorCmpRes[0].statistic, 4), ", pvalue =", round(errorCmpRes[0].pvalue, 4), "\n")
print("Raw Data & Normalized Data Out-of-sample Error T-Test: \n", "T-statistic =", round(errorCmpRes[1].statistic, 4), ", pvalue =", round(errorCmpRes[1].pvalue, 4), "\n")

In [None]:
# K-Means clustering
#
# For every cluster count k: fit K-Means on the features, record the silhouette
# score and inertia, re-express each sample as its distances to the k cluster
# centers, and fit a linear regression on those distances to predict the target.

# file name spelled exactly as the preprocessing cell writes it, so the read
# also works on case-sensitive filesystems
df = pd.read_csv('train_Normd.csv')
X = np.array(df.drop('target', axis=1))
target = np.array(df['target']).reshape(-1,1)
N, dim = np.shape(X)

kList = np.append(np.array([2]), np.arange(5,501,5))
silhouetteAvgScoreList = []
inertiaList = []
rmseTrList = []
rmseTeList = []
for indk, k in enumerate(kList):
    kmeansModel = KMeans(n_clusters=k, random_state=0, n_init="auto")
    KMCl = kmeansModel.fit(X)
    clCenters = KMCl.cluster_centers_
    clLabels = KMCl.labels_

    silhouetteAvgScoreList.append(silhouette_score(X, clLabels))
    inertiaList.append(kmeansModel.inertia_)

    # column i holds the Euclidean distance of every sample to cluster center i
    distDataCtr = np.zeros(shape=[N, k])
    for i in range(k):
        distDataCtr[:, i] = np.linalg.norm(X - clCenters[i, :], axis=1)

    if k == 2:
        # NOTE(review): the two axes use different margins (1.05 vs 1.1) -- kept as-is, confirm intent
        max_dist = np.max([1.05*np.max(distDataCtr[:, 0]), 1.1*np.max(distDataCtr[:, 1])])
        fig1 = plt.figure()
        plt.scatter(distDataCtr[:, 0], distDataCtr[:, 1], c=target, cmap='plasma', s=10)
        plt.colorbar()
        # diagonal: points equally far from both centers
        plt.plot([0, max_dist], [0, max_dist])
        plt.xlabel('Distance from Cluster Center 1')
        plt.ylabel('Distance from Cluster Center 2')
        plt.title('K-Means Clustering')
        plt.xlim((0, max_dist))
        plt.ylim((0, max_dist))
        fig1.show()

    # fixed seed: every k is scored on the same train/test rows and reruns
    # reproduce the same curves
    XTr, XTe, yTr, yTe = train_test_split(distDataCtr, target, test_size=0.2, random_state=0)

    lrModel = LinearRegression()
    lrModel.fit(XTr, yTr)

    yPredTr = lrModel.predict(XTr)
    yPred = lrModel.predict(XTe)

    # take the root explicitly; the `squared=False` switch of
    # mean_squared_error is deprecated in recent scikit-learn releases
    rmseTr = np.sqrt(mean_squared_error(yTr, yPredTr))
    rmseTe = np.sqrt(mean_squared_error(yTe, yPred))
    if k == 2:
        print("Training RMSE = ", rmseTr)
        print("Testing RMSE = ", rmseTe)

    rmseTrList.append(rmseTr)
    rmseTeList.append(rmseTe)

# expose the collected metrics as arrays, one entry per value in kList
silhouetteAvgScoreList = np.array(silhouetteAvgScoreList)
inertiaList = np.array(inertiaList)
rmseTrList = np.array(rmseTrList)
rmseTeList = np.array(rmseTeList)

# Train/test RMSE of the distance-based regression versus cluster count
figRMSE = plt.figure()
axRMSE = figRMSE.add_subplot(111)
axRMSE.plot(kList, rmseTrList, label="Training RMSE")
axRMSE.plot(kList, rmseTeList, label="Testing RMSE")
figRMSE.legend()
axRMSE.set_xlabel('number of clusters')
axRMSE.set_ylabel('RMSE')
figRMSE.show()

# Average silhouette score versus cluster count
figScore = plt.figure()
axScore = figScore.add_subplot(111)
axScore.plot(kList, silhouetteAvgScoreList)
axScore.set_xlabel('number of clusters')
axScore.set_ylabel('silhouette score')
figScore.show()

# K-Means inertia versus cluster count
figInertia = plt.figure()
axInertia = figInertia.add_subplot(111)
axInertia.plot(kList, inertiaList)
axInertia.set_xlabel('number of clusters')
axInertia.set_ylabel('inertia')
figInertia.show()