In [None]:
# Import modules

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn import metrics


In [2]:

def read_split_input():
    """Read the labelled and unlabelled samples via ``input()``.

    Lines are consumed in this order:

    1. a header of exactly two integers; the second is the number of
       labelled rows that follow (the first is parsed but not used here --
       presumably the feature count, TODO confirm against the data spec),
    2. that many whitespace-separated numeric rows,
    3. a single integer giving the number of unlabelled rows,
    4. that many whitespace-separated numeric rows.

    Returns:
        tuple: ``(features, targets, unlabelled)`` where ``features`` is the
        first two columns of the labelled rows, ``targets`` is their last
        column, and ``unlabelled`` is the float array of the remaining rows.
    """

    def _read_rows(count):
        # Collect `count` tokenised lines and convert them to floats in one go.
        tokenised = []
        for _ in range(count):
            tokenised.append(input().split())
        return np.array(tokenised, float)

    _first_header_value, n_labelled = map(int, input().split())
    labelled = _read_rows(n_labelled)

    # Slice before reading the next section so that malformed labelled data
    # fails before any further input is consumed.
    features = labelled[:, :2]
    targets = labelled[:, -1]

    n_unlabelled = int(input())
    unlabelled = _read_rows(n_unlabelled)

    return features, targets, unlabelled



In [3]:

def feature_engineer(raw_data_X, raw_data_Y, test_raw_data, n_sigma=3):
    """Clean and standardise the two-feature training and test data.

    Steps, in order:

    1. Wrap the arrays in DataFrames (features ``X1``/``X2``, target ``X3``).
    2. Fill missing training feature values with the column mean.
    3. Drop training rows whose ``X1`` and then ``X2`` value lies more than
       ``n_sigma`` sample standard deviations from that column's mean, and
       drop the same rows from the target frame so X and Y stay aligned.
    4. Standardise the training features to zero mean / unit variance and
       apply the *training* mean and standard deviation to the test features.

    Args:
        raw_data_X: 2-D array; columns 0 and 1 are used as features.
        raw_data_Y: 1-D array of targets, one per row of ``raw_data_X``.
        test_raw_data: 2-D array; columns 0 and 1 are used as features.
        n_sigma: Outlier threshold in standard deviations (default 3).

    Returns:
        tuple: ``(train_X_df, train_Y_df, test_df)``. The two training frames
        share the same (original, non-reset) row index.
    """
    train_X_df = pd.DataFrame({'X1': raw_data_X[:, 0], 'X2': raw_data_X[:, 1]})
    train_Y_df = pd.DataFrame({'X3': raw_data_Y})

    test_df = pd.DataFrame({'X1': test_raw_data[:, 0], 'X2': test_raw_data[:, 1]})

    # `describe` has to be called; printing the bare attribute only shows the
    # bound-method repr instead of the summary statistics.
    print(train_X_df.describe())

    # Check if any missing value exists, then fill with the column mean [training set]
    if train_X_df.isnull().values.any():
        train_X_df = train_X_df.fillna(train_X_df.mean())

    # Remove outlier records [training set]. Each column is compared against
    # its OWN mean and standard deviation; the X2 filter runs on the rows that
    # survived the X1 filter.
    for column in ('X1', 'X2'):
        values = train_X_df[column]
        within_range = np.abs(values - values.mean()) <= (n_sigma * values.std())
        train_X_df = train_X_df[within_range]

    # Keep the targets in step with the surviving feature rows; otherwise the
    # two frames differ in length and cannot be fitted together.
    train_Y_df = train_Y_df.loc[train_X_df.index]

    # Normalize data [training and testing sets]. The test set is scaled with
    # the training statistics so both live in the same feature space (and a
    # single test row does not turn into NaN).
    feature_mean = train_X_df.mean()
    feature_std = train_X_df.std()
    train_X_df = (train_X_df - feature_mean) / feature_std
    test_df = (test_df - feature_mean) / feature_std

    print('Train dataframe dimensions: ', train_X_df.shape)
    print('Train dataframe dimensions: ', train_Y_df.shape)
    print('Test dataframe dimensions: ', test_df.shape)

    return train_X_df, train_Y_df, test_df



In [None]:

def fit_regression_model(train_X_df, train_Y_df):
    """Fit a plain (unregularised) linear regression.

    Args:
        train_X_df: Training features.
        train_Y_df: Training targets.

    Returns:
        The fitted ``linear_model.LinearRegression`` estimator.
    """
    ols_estimator = linear_model.LinearRegression()
    ols_estimator.fit(train_X_df, train_Y_df)
    return ols_estimator


In [None]:

def fit_regularized_models(train_X_df, train_Y_df):
    """Fit cross-validated Ridge and Lasso regressions on the training data.

    Both estimators choose their regularisation strength from the same alpha
    grid using 5-fold cross-validation. Only ``alphas`` and ``cv`` are passed
    explicitly: every other argument that used to be spelled out here was the
    library default, and two of them (``normalize``, ``store_cv_values``) are
    no longer accepted by current scikit-learn releases and raise
    ``TypeError``.

    Args:
        train_X_df: Training features.
        train_Y_df: Training targets (a single-column frame or 1-D array).

    Returns:
        tuple: ``(model_ridge, model_lasso)``, both fitted.
    """
    alphas = (0.1, 0.5, 1.0, 5.0, 7.0, 10.0)

    # Linear Regression model with Ridge and k-fold CV
    model_ridge = linear_model.RidgeCV(alphas=alphas, cv=5)
    model_ridge.fit(train_X_df, train_Y_df)

    # LassoCV is a single-output estimator: hand it a 1-D target so it does
    # not have to convert a column vector (and warn about it) on every fit.
    lasso_target = np.asarray(train_Y_df)
    if lasso_target.ndim == 2 and lasso_target.shape[1] == 1:
        lasso_target = lasso_target.ravel()

    # Linear Regression model with LASSO and k-fold CV
    model_lasso = linear_model.LassoCV(alphas=alphas, cv=5)
    model_lasso.fit(train_X_df, lasso_target)

    return model_ridge, model_lasso


In [None]:

def evaluate_models(estimator, X, Y):
    """Score a fitted estimator on the given data.

    Args:
        estimator: Fitted object exposing ``predict(X)``.
        X: Features to predict on.
        Y: True targets; a DataFrame, Series or array-like.

    Returns:
        tuple: ``(y_pred, mse, r_sq)`` -- the predictions, the mean squared
        error and the R-squared score of those predictions against ``Y``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; np.asarray works on
    # every pandas version and also accepts plain arrays or lists.
    y_true = np.asarray(Y)
    y_pred = estimator.predict(X)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r_sq = metrics.r2_score(y_true, y_pred)
    return y_pred, mse, r_sq


In [None]:

def _report_training_fit(fitted_model, features, targets):
    """Evaluate ``fitted_model`` on the given data and print the summary block."""
    predictions, mean_sq_err, r_squared = evaluate_models(fitted_model, features, targets)
    print(fitted_model)
    print("Predictions: ", predictions)
    print("MSE: ", mean_sq_err)
    print("R-Squared: ", r_squared)
    print("=======================================")


def main():
    """Run the pipeline: read input, prepare features, fit and report three models.

    The plain linear regression is fitted and reported first; the Ridge and
    Lasso models are fitted afterwards and reported in that order. Every model
    is evaluated on the same training data it was fitted on.
    """
    raw_data_X, raw_data_Y, test_raw_data = read_split_input()
    train_X_df, train_Y_df, test_df = feature_engineer(raw_data_X, raw_data_Y, test_raw_data)
    print(train_Y_df)

    baseline = fit_regression_model(train_X_df, train_Y_df)
    _report_training_fit(baseline, train_X_df, train_Y_df)

    ridge_and_lasso = fit_regularized_models(train_X_df, train_Y_df)
    for regularized in ridge_and_lasso:
        _report_training_fit(regularized, train_X_df, train_Y_df)


# Entry point: run the whole pipeline when this code is executed directly
# (not when it is imported). main() obtains its data through input() calls.
if __name__ == '__main__':
    main()
