In [1]:
import os
import glob
import math
import time
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import warnings
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import random
import seaborn as sns; sns.set_theme()
import torch.nn.functional as F
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from matplotlib.pyplot import figure
from IPython import display
from pandas.plotting import scatter_matrix
from sklearn.metrics import r2_score
from sklearn import svm
from numpy import std
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import cm
from sklearn.metrics import confusion_matrix

In [2]:
# Pick the compute device once; later cells move tensors and the model onto it.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [3]:
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including shape-broadcast warnings from the loss functions used below.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: the short pattern 'max_columns' also
# matches 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns', 300)

In [4]:
# Read ./integrate.csv into a DataFrame; the following cells transpose it and
# slice it into train/test rows.
# NOTE(review): the saved run of this cell was interrupted (KeyboardInterrupt),
# so every later cell that uses `train_test_seurat` has no recorded output.
train_test_seurat = pd.read_csv('./integrate.csv')

KeyboardInterrupt: 

In [None]:
# Swap rows and columns of the loaded table.
# NOTE(review): this rebinds the same name, so running the cell twice undoes
# the transpose — run it exactly once after loading.
train_test_seurat = train_test_seurat.transpose()

In [None]:
# Per-column standard deviation of the integrated table.
train_test_seurat_std = train_test_seurat.std()
column_names = list(train_test_seurat.columns)

# Collect the labels of constant (zero-variance) columns.
# The mask is applied to the std Series' own index, so each label is paired with
# its own standard deviation. The previous `std[i]` / `column_names[i]` pairing
# relied on `std[i]` being a positional lookup, and on `std()` returning one
# entry per column in the same order as `columns`.
columns_remove = train_test_seurat_std.index[train_test_seurat_std == 0].tolist()

In [None]:
# Discard the zero-variance columns collected in `columns_remove`.
train_test_seurat = train_test_seurat.drop(columns=columns_remove)

In [None]:
# Re-create the first removed column label, filled with a copy of column 0.
# NOTE(review): the reason for this is not visible in the notebook (it adds a
# feature that exactly duplicates the first column) — confirm it is intended.
# Guarded so the cell no longer raises IndexError when no column was constant.
if columns_remove:
    train_test_seurat[columns_remove[0]] = train_test_seurat.iloc[:, 0]

In [None]:
# Display (rows, columns) of the table after the column clean-up above.
train_test_seurat.shape

In [None]:
# The first 90000 rows become the training part, every remaining row the test part.
# NOTE(review): 90000 is hard-coded; presumably it equals the number of rows in
# MLR_Project_train.csv — verify, since that file is only loaded further down.
train_seurat, test_seurat = (
    train_test_seurat.iloc[:90000],
    train_test_seurat.iloc[90000:],
)

Load train and test data

# 1. Load data and preprocessing

## 1.1 Load train and test data

In [None]:
# Read the project's train and test tables from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./MLR_Project_train.csv', './MLR_Project_test.csv')
)

## 1.2 Show the data format and dimensions

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

## 1.3 Show the maximum return of train and test

In [None]:
# Upper bound on the achievable return: the sum of all strictly positive
# TARGET values, i.e. what a selector that picks only winning rows would earn.
train_max, test_max = (
    np.sum(frame.loc[frame['TARGET'] > 0, 'TARGET'])
    for frame in (train, test)
)

print('Maximum return of training set:', train_max)
print('Maximum return of testing set:', test_max)

In [None]:
# Ridge baseline: regress TARGET on the integrated features, then "select"
# every row whose predicted TARGET is positive and sum its realised TARGET.
# Rows of train_seurat / test_seurat are matched to train / test by position.
reg = Ridge(alpha=0.5)
reg.fit(train_seurat, train['TARGET'])

pred = reg.predict(train_seurat)
pred_test = reg.predict(test_seurat)

train_selected = train.loc[pred > 0, 'TARGET']
test_selected = test.loc[pred_test > 0, 'TARGET']

train_res = np.sum(train_selected)
test_res = np.sum(test_selected)

In [None]:
# Report the baseline return as a percentage of the best achievable return.
# NOTE(review): the label says "naive random selection", but train_res/test_res
# come from the Ridge selection in the previous cell — consider renaming.
train_pct = train_res / train_max * 100
test_pct = test_res / test_max * 100
print(f'Train naive random selection percentage return: {train_pct}%')
print(f'Test naive random selection percentage return: {test_pct}%')

### 1.3.1 Remove the Unnamed columns in dataframe

In [None]:
# Keep only columns whose name does not start with "Unnamed".
train_keep = ~train.columns.str.contains('^Unnamed')
test_keep = ~test.columns.str.contains('^Unnamed')

train = train.loc[:, train_keep]
test = test.loc[:, test_keep]

In [None]:
# Display (rows, columns) of the training table after dropping "Unnamed" columns.
train.shape

In [None]:
# Augment the training table with every pairwise product of its features.
#
# Features are selected by name (everything except TARGET) instead of assuming
# TARGET is the last column, so the label can never leak into the products.
# All products are assembled in a single concat; inserting them one column at a
# time fragments the DataFrame and is very slow for wide tables.
# Column "<i>_<j>_feat" holds feature i times feature j (positional indices).
# Both (i, j) and (j, i) are generated, as before, so the feature layout that
# the model below is sized for stays the same.
# NOTE(review): this cell rebinds `train`; re-running it crosses the already
# crossed table again. Run it once per kernel session.
train_features = train.drop(['TARGET'], axis=1)
n_train_features = train_features.shape[1]

train_products = {
    str(i) + '_' + str(j) + '_feat': train_features.iloc[:, i] * train_features.iloc[:, j]
    for i in range(n_train_features)
    for j in range(n_train_features)
}
if train_products:
    train_ = pd.concat(train_products, axis=1)
else:
    # No feature columns at all: keep an empty frame aligned with `train`.
    train_ = pd.DataFrame(index=train.index)

train_target = pd.DataFrame(train['TARGET'])

# Final layout: original features, crossed features, then TARGET last.
train = pd.concat([train_features, train_], axis=1)
train['TARGET'] = train_target

In [None]:
# Augment the test table with every pairwise product of its features, using
# the same construction and column naming as for the training table.
#
# Features are selected by name (everything except TARGET) instead of assuming
# TARGET is the last column, and all products are assembled in a single concat
# rather than inserted one column at a time (which fragments the DataFrame).
# Column "<i>_<j>_feat" holds feature i times feature j (positional indices);
# both (i, j) and (j, i) are generated, as before.
# NOTE(review): this cell rebinds `test`; re-running it crosses the already
# crossed table again. Run it once per kernel session.
test_features = test.drop(['TARGET'], axis=1)
n_test_features = test_features.shape[1]

test_products = {
    str(i) + '_' + str(j) + '_feat': test_features.iloc[:, i] * test_features.iloc[:, j]
    for i in range(n_test_features)
    for j in range(n_test_features)
}
if test_products:
    test_ = pd.concat(test_products, axis=1)
else:
    # No feature columns at all: keep an empty frame aligned with `test`.
    test_ = pd.DataFrame(index=test.index)

test_target = pd.DataFrame(test['TARGET'])

# Final layout: original features, crossed features, then TARGET last.
test = pd.concat([test_features, test_], axis=1)
test['TARGET'] = test_target

## 5.5 Autoencoder Resnet model

In [None]:
# Model inputs are all columns except TARGET; the label is 1 where TARGET is
# positive and 0 otherwise (sign mapped from {-1, 0, 1} to {0, 0, 1}).
input_features = train.drop(columns=['TARGET']).to_numpy()
output_features = ((np.sign(train['TARGET']) + 1) // 2).to_frame().to_numpy()

X_test = test.drop(columns=['TARGET']).to_numpy()
Y_test = ((np.sign(test['TARGET']) + 1) // 2).to_frame().to_numpy()

X_train, X_val, Y_train, Y_val = train_test_split(
    input_features, output_features, test_size=0.2, random_state=42
)

# Split the DataFrame itself with the same size and seed so that the rows of
# train_data / val_data line up with X_train / X_val; the raw TARGET values
# are needed later to turn predictions into returns.
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
test_data = test

# Best achievable return per split: the sum of the strictly positive TARGETs.
auto_train_max = np.sum(train_data.loc[train_data['TARGET'] > 0, 'TARGET'])
auto_val_max = np.sum(val_data.loc[val_data['TARGET'] > 0, 'TARGET'])
auto_test_max = np.sum(test.loc[test['TARGET'] > 0, 'TARGET'])

for shape_label, split_array in (
    ('Train X shape:', X_train),
    ('Validation X shape:', X_val),
    ('Test X shape:', X_test),
    ('Train Y shape:', Y_train),
    ('Val Y shape:', Y_val),
    ('Test Y shape:', Y_test),
):
    print(shape_label, split_array.shape)

for max_label, max_value in (
    ('train_max:', auto_train_max),
    ('val_max:', auto_val_max),
    ('test_max:', auto_test_max),
):
    print(max_label, max_value)

In [None]:
# Convert the NumPy splits to float32 tensors.
#
# Inputs stay 2-D, (rows, features). The previous version inserted a channel
# axis with unsqueeze(…, 1), which made `train_input.shape[1]` equal to 1, so
# `input_feature` was 1 and the network below was built with zero-width layers.
# It also made the model emit (rows, 1, 1) predictions that broadcast against
# the (rows, 1) targets inside the loss. The model only uses nn.Linear layers,
# so no channel axis is needed.
train_input = torch.from_numpy(X_train).float()
train_output = torch.from_numpy(Y_train).float()
val_input = torch.from_numpy(X_val).float()
val_output = torch.from_numpy(Y_val).float()
test_input = torch.from_numpy(X_test).float()
test_output = torch.from_numpy(Y_test).float()

# Width of the first layer = number of feature columns.
input_feature = train_input.shape[1]
output_feature = 1

# print('input_feature:', input_feature)
# print('output_feature:', output_feature)

In [None]:
# Move every split onto the selected device so training and evaluation run there.
(
    train_input,
    train_output,
    val_input,
    val_output,
    test_input,
    test_output,
) = (
    split_tensor.to(device)
    for split_tensor in (
        train_input,
        train_output,
        val_input,
        val_output,
        test_input,
        test_output,
    )
)

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch (CPU
    and all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hashing in interpreters started after this point
    # (e.g. worker subprocesses), not the already running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

seed_everything()

In [None]:
# auto-encoder model
# base model
class Autoencoder(nn.Module):
    """Feed-forward network mapping a feature vector to one scalar score.

    Despite its name the network has no decoder: `forward` narrows the input
    through three linear layers (n/2, n/4, n/16 units), applies one more
    n/16 -> n/16 layer and ends in a single linear output unit.

    Args:
        n_features: Size of the last dimension of the input. When omitted, the
            notebook-level global `input_feature` is used, which keeps the
            original `Autoencoder()` call working.

    Raises:
        ValueError: If `n_features` is smaller than 16, because the narrowest
            layers (n_features // 16) would then have zero units.
    """

    def __init__(self, n_features=None):
        super(Autoencoder, self).__init__()
        if n_features is None:
            # Backward-compatible fallback to the notebook global.
            n_features = input_feature
        if n_features < 16:
            raise ValueError(
                f'n_features must be at least 16 to build non-empty layers, got {n_features}'
            )

        # Layers are created in the original order so that seeded parameter
        # initialisation and state_dict keys are unchanged.
        self.linear1 = nn.Linear(n_features, n_features//2)
        self.linear2 = nn.Linear(n_features//2, n_features//4)
        self.linear3 = nn.Linear(n_features//4, n_features//16)
        # linear4 and linear5 are not used by forward().
        self.linear4 = nn.Linear(n_features//16, n_features//16)

        self.linear5 = nn.Linear(n_features//16, n_features//16)
        self.linear6 = nn.Linear(n_features//16, n_features//16)

        # Batch-norm layers are defined but currently not applied in forward().
        self.batchnorm_1 = nn.BatchNorm1d(n_features//2)
        self.batchnorm_2 = nn.BatchNorm1d(n_features//4)
        self.batchnorm_3 = nn.BatchNorm1d(n_features//16)
        self.linear = nn.Linear(n_features//16, 1)

        # Constant weight initialisation; biases and linear5/linear6 keep the
        # PyTorch default initialisation.
        nn.init.constant_(self.linear1.weight, 0.1)
        nn.init.constant_(self.linear2.weight, 0.1)
        nn.init.constant_(self.linear3.weight, 0.1)
        nn.init.constant_(self.linear4.weight, 0.1)
        nn.init.constant_(self.linear.weight, 0.1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.15)

        # Not used by forward(); the network returns the raw linear output.
        self.softmax = nn.Softmax()

    def forward(self, x):
        """Return the raw score for each input row.

        Args:
            x: Float tensor whose last dimension has `n_features` entries.

        Returns:
            Float tensor with the same leading dimensions as `x` and a last
            dimension of size 1.
        """
        x = self.linear1(x)
        x = self.relu(x)
        # Dropout is applied after the first hidden layer only.
        x = self.dropout(x)

        x = self.linear2(x)
        x = self.relu(x)

        x = self.linear3(x)
        x = self.relu(x)

        x = self.linear6(x)
        x = self.relu(x)

        output = self.linear(x)

        return output.float()

In [None]:
# Wrap the training tensors in a loader that serves them in their stored order.
# NOTE(review): with batch_size=100000 each epoch is presumably a single
# full-batch step (the training split looks smaller than that) — confirm.
batch_size = 100000
train_ds = TensorDataset(train_input, train_output)
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=False)

In [None]:
%matplotlib inline
def _score_split(model, loss_fn, inputs, targets, frame, decision_threshold):
    """Return (loss, realised return) of `model` on one data split."""
    targets = targets.reshape(targets.shape[0], 1)
    # Evaluation needs no autograd graph; building one for a whole split only
    # wastes memory.
    with torch.no_grad():
        # Force predictions into the target shape so the loss can never
        # silently broadcast, e.g. (N, 1, 1) against (N, 1).
        scores = model(inputs).reshape(targets.shape[0], 1)
        loss = loss_fn(scores, targets)
    # Rows the model "selects"; their raw TARGET values are summed.
    selected = (scores > decision_threshold).reshape(-1).cpu().numpy()
    realised = np.sum(frame['TARGET'][selected])
    return loss.cpu().numpy(), realised


def fit(num_epochs, model, loss_fn, train_input, train_output, val_input, val_output,
        test_input, test_output, model_path, optimizer=None, train_loader=None,
        eval_every=50, decision_threshold=0.0):
    """Train `model` and periodically report loss and realised return.

    Every `eval_every` epochs the model is evaluated on the train, validation
    and test splits. For each split the rows whose prediction exceeds
    `decision_threshold` are selected and their raw TARGET values summed. The
    weights are saved to `model_path` whenever the validation loss improves.

    Besides its arguments, the function reads these notebook globals:
    `train_data`, `val_data`, `test_data` (DataFrames with a TARGET column,
    row-aligned with the corresponding input tensors) and `auto_train_max`,
    `auto_val_max`, `auto_test_max` (denominators of the printed percentages).

    Args:
        num_epochs: Number of passes over `train_loader`.
        model: The torch module to train (modified in place).
        loss_fn: Callable `loss_fn(prediction, target)` returning a scalar tensor.
        train_input, train_output: Training split tensors used for evaluation.
        val_input, val_output: Validation split tensors.
        test_input, test_output: Test split tensors.
        model_path: File path the best state_dict is written to.
        optimizer: Optimizer to step; defaults to the notebook global `opt`.
        train_loader: Iterable of (x, y) batches; defaults to the global `train_dl`.
        eval_every: Evaluate every this many epochs (the original value is 50).
        decision_threshold: A row is selected when its prediction is strictly
            greater than this value. The default 0.0 reproduces the original
            sign-based rule.
            NOTE(review): the labels built in this notebook are 0/1, for which
            0.5 looks like the natural cut-off — confirm before changing.

    Returns:
        Three lists (train, validation, test) with the realised return at each
        evaluation point.

    Raises:
        ValueError: If `eval_every` is not a positive integer.
    """
    if eval_every <= 0:
        raise ValueError(f'eval_every must be positive, got {eval_every}')
    if optimizer is None:
        optimizer = opt
    if train_loader is None:
        train_loader = train_dl

    best_loss = float('inf')
    train_sum = []
    val_sum = []
    test_sum = []

    for epoch in range(num_epochs):
        # Evaluation switches to eval mode, so re-enable training mode each epoch.
        model.train()
        for x, y in train_loader:
            optimizer.zero_grad()
            y = torch.reshape(y, (y.shape[0], 1))
            pred = model(x).reshape(y.shape[0], 1)
            loss = loss_fn(pred, y)
            loss.backward()
            optimizer.step()

        if epoch % eval_every != 0:
            continue

        model.eval()

        train_loss, train_res = _score_split(
            model, loss_fn, train_input, train_output, train_data, decision_threshold)
        val_loss, val_res = _score_split(
            model, loss_fn, val_input, val_output, val_data, decision_threshold)
        test_loss, test_res = _score_split(
            model, loss_fn, test_input, test_output, test_data, decision_threshold)

        train_sum.append(train_res)
        val_sum.append(val_res)
        test_sum.append(test_res)

        print(f'Epoch: {epoch}, Train loss: {train_loss}, Train return: {train_res/auto_train_max*100}%, Val loss: {val_loss}, Val return: {val_res/auto_val_max*100}%, Test loss: {test_loss}, Test return: {test_res/auto_test_max*100}%')

        # Checkpoint on validation-loss improvement only.
        if val_loss < best_loss:
            torch.save(model.state_dict(), model_path)
            best_loss = val_loss

    return train_sum, val_sum, test_sum


In [None]:
# Training configuration: plain SGD with momentum, minimising mean squared error.
num_epochs = 20000
learning_rate = 0.001
loss_fn = F.mse_loss

# Re-seed right before building the model so its initial weights are repeatable.
seed_everything()

model = Autoencoder().to(device)
opt = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# fit() prints progress and writes the best weights to 'model_path_cnn'.
train_sum_1, val_sum_1, test_sum_1 = fit(
    num_epochs=num_epochs,
    model=model,
    loss_fn=loss_fn,
    train_input=train_input,
    train_output=train_output,
    val_input=val_input,
    val_output=val_output,
    test_input=test_input,
    test_output=test_output,
    model_path='model_path_cnn',
)
# fig.savefig("auto_encoder.png", bbox_inches='tight', dpi=600)

In [None]:
# Reload the best checkpoint written by fit():
# model = Autoencoder()
# model.load_state_dict(torch.load('model_path_cnn'))
# model.eval()