In [None]:
# 1501900(16_80_06)_0701(90403_40430)

In [1]:
import numpy as np
import pandas as pd
import random as r

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mad
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

In [2]:
def preprocessing(X_train, X_test, n_components=None):
    """Fit a PCA on the training inputs and project both datasets with it.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training inputs; the PCA is fitted on these only.
    X_test : array-like of shape (n_test_samples, n_features)
        Inputs projected with the PCA fitted on ``X_train``.
    n_components : int, optional
        Number of components handed to ``PCA``. When omitted it defaults to
        ``min(n_samples, ceil(0.02 * n_features))``.

    Returns
    -------
    tuple
        ``(pca.transform(X_train), pca.transform(X_test))``.
    """
    if n_components is None:  # If no parameter has been provided
        # minimum(n_samples, 2% of n_features). np.shape reads the dimensions
        # of DataFrames and plain arrays alike, so both input kinds work.
        n_samples, n_features = np.shape(X_train)
        n_components = min(n_samples, int(np.ceil(0.02 * n_features)))

    # Define PCA and fit it on the training set only, so nothing about the
    # test inputs influences the projection
    pca = PCA(n_components)
    pca.fit(X_train)

    # Calculate and return feature-extracted datasets
    return pca.transform(X_train), pca.transform(X_test)

def predict_super_resolution(X_train, Y_train, X_test):
    """Predict outputs for ``X_test`` with a linear model on reduced features.

    Both input sets are first passed through ``preprocessing`` (default
    component count); a ``LinearRegression`` is then fitted on the reduced
    training inputs against ``Y_train`` and applied to the reduced test inputs.

    Returns whatever ``LinearRegression.predict`` yields for the test inputs.
    """
    # Feature extraction, fitted on the training inputs
    reduced_train, reduced_test = preprocessing(X_train, X_test)

    # Fit the regressor on the reduced training data
    regressor = LinearRegression()
    regressor.fit(reduced_train, Y_train)

    return regressor.predict(reduced_test)

In [3]:
def write_results(results, file_name):
    """Write values to ``file_name`` as a two-column ``ID,predicted`` CSV.

    Parameters
    ----------
    results : array-like
        Values to write; flattened to 1-D in row-major order first. NumPy
        arrays and nested lists are both accepted.
    file_name : str or path-like
        Destination file. Any existing content is replaced.

    Returns
    -------
    int
        Number of data rows written (the header row is not counted).
    """
    values = np.ravel(results)

    # "w" already truncates an existing file, and the context manager closes
    # the handle even if one of the writes raises.
    with open(file_name, "w") as file:
        # Write header row
        file.write("ID,predicted\n")

        # Write results, one row per value, with IDs counting up from 0
        file.writelines(f"{i},{value}\n" for i, value in enumerate(values))

    return values.size

In [4]:
def CV_5_Fold(train_LR_location, train_HR_location, report_metrics=False):
    """Run 5-fold cross-validation of ``predict_super_resolution``.

    Parameters
    ----------
    train_LR_location : str or path-like
        CSV file with the model inputs, one sample per row.
    train_HR_location : str or path-like
        CSV file with the target outputs; rows are matched to the inputs by
        position.
    report_metrics : bool, default False
        When True, also compute the mean absolute error and the Pearson
        correlation for every fold (the figures used for the report). When
        False those two lists are returned empty.

    Returns
    -------
    predictions : numpy.ndarray
        Flattened held-out predictions of all folds, concatenated fold by
        fold. Because the folds are shuffled, this is generally NOT the row
        order of the input CSV.
    MSE_values : list of float
        Mean squared error of each fold.
    MAD_values : list of float
        Mean absolute error of each fold (empty unless ``report_metrics``).
    P_corr_values : list of float
        Pearson correlation of each fold (empty unless ``report_metrics``).
    """
    # Read data
    X_train = pd.read_csv(train_LR_location)
    Y_train = pd.read_csv(train_HR_location)

    # Seeds Python's global `random` module (kept so the function's side
    # effects stay as they were). The fold assignment itself is made
    # reproducible by KFold's random_state.
    r.seed(1)
    kf = KFold(n_splits=5, shuffle=True, random_state=1)

    fold_predictions = []

    MSE_values = []
    MAD_values = []
    P_corr_values = []

    for i_train, i_test in kf.split(X_train): # For each split
        current_X_train = X_train.iloc[i_train] # Get training input
        current_Y_train = Y_train.iloc[i_train] # Get training output
        current_X_test = X_train.iloc[i_test]   # Get testing input

        # Get testing output
        current_ground_truth = Y_train.iloc[i_test].to_numpy().flatten()

        # Calculate predictions
        current_predictions = predict_super_resolution(
            current_X_train, current_Y_train, current_X_test
        ).flatten()
        fold_predictions.append(current_predictions)

        # Calculate performance parameters (arguments in y_true, y_pred order)
        MSE_values.append(mse(current_ground_truth, current_predictions))

        # Used for report
        if report_metrics:
            MAD_values.append(mad(current_ground_truth, current_predictions))
            P_corr_values.append(pearsonr(current_ground_truth, current_predictions)[0])

    # The leading empty float array keeps the result float64 and makes the
    # concatenation valid regardless of the number of folds.
    predictions = np.concatenate([np.array([]), *fold_predictions])
    return predictions, MSE_values, MAD_values, P_corr_values

In [5]:
# Run the 5-fold cross-validation on the training CSVs. The four names receive the
# concatenated held-out predictions and the per-fold MSE / MAD / Pearson lists, in
# the order CV_5_Fold returns them.
predictions, MSE_values, MAD_values, P_corr_values = CV_5_Fold("../data/train_LR.csv", "../data/train_HR.csv")

# Write the predictions to test.csv. The call's return value (the number of rows
# written) is the value this cell displays.
# NOTE(review): these are cross-validated predictions on the training data, despite
# the "test.csv" file name — confirm that is what the file is meant to hold.
write_results(predictions, "test.csv")

6762042

In [12]:
# Display the MSE results of the cross-validation run.
# NOTE(review): CV_5_Fold builds MSE_values as a list with one entry per fold, but the
# output recorded for this cell is a single float, and the execution count is out of
# sequence — the output looks stale; re-run from a fresh kernel to confirm.
MSE_values

0.7187060429565174