# Package Import

In [None]:
### import
import random as rand
import math
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt


from scipy.stats import iqr

!pip install -U liblinear-official
from liblinear.liblinearutil import *
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Load Data & Preprocess Func
pandas: `y_train_pd`, `x_train_pd`

numpy: `y_train`, `x_train` (with missing values), `x_train_std` (imputed according to the current method)

In [None]:
### missing value
# x should be pandas dataframe
def KNN_Impute(x, k):
    """Fill missing values with a k-nearest-neighbours imputer.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table that may contain NaN entries.
    k : int
        Number of neighbours handed to ``KNNImputer(n_neighbors=k)``.

    Returns
    -------
    numpy.ndarray
        The imputed feature matrix; the column at position 1 is rounded to
        whole numbers after imputation.
    """
    imputed = KNNImputer(n_neighbors=k).fit_transform(x)  # tunable: n_neighbors, weights
    # Snap the column at position 1 back to integers (presumably an
    # integer-valued feature -- TODO confirm against the dataset schema).
    imputed[:, 1] = np.round(imputed[:, 1])
    return imputed

def KNN_Impute_iqrs(x, k):
    """KNN-impute missing values after scaling every column by its IQR.

    Each column is divided by its inter-quartile range (computed while
    ignoring NaN) so that the neighbour distances are not dominated by
    large-scale columns, imputed with ``KNNImputer``, and then multiplied
    back to the original scale.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table that may contain NaN entries.
    k : int
        Number of neighbours handed to ``KNNImputer(n_neighbors=k)``.

    Returns
    -------
    numpy.ndarray
        The imputed feature matrix in the original scale; the column at
        position 1 is rounded to whole numbers after imputation.
    """
    iqrs = x.apply(lambda col: np.nanquantile(col, 0.75) - np.nanquantile(col, 0.25))
    # A column whose quartiles coincide has IQR 0. Dividing by it would turn
    # the column into inf/NaN, which the imputer either rejects (inf) or
    # mistakes for missing data (0/0 = NaN). Leave such columns unscaled.
    iqrs = iqrs.mask(iqrs == 0, 1.0)

    scaled = x / iqrs
    imputer = KNNImputer(n_neighbors=k)  # tunable: n_neighbors, weights
    imputed = imputer.fit_transform(scaled) * iqrs.to_numpy()

    # Snap the column at position 1 back to integers (presumably an
    # integer-valued feature -- TODO confirm against the dataset schema).
    imputed[:, 1] = np.round(imputed[:, 1])
    return imputed

In [None]:
### load training data with pandas
train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv", delimiter=",", header=0)
n_train = train_df.shape[0]

# Target: kept both as a one-column DataFrame and as a flat NumPy vector.
y_train_pd = train_df[['Danceability']].copy()
y_train = y_train_pd.to_numpy().reshape(n_train)

# Features: every float-typed column of the CSV except the target itself.
is_float_col = (train_df.dtypes == float).tolist()
x_train_pd = train_df.iloc[:, is_float_col].copy().drop(columns=['Danceability'])
x_train = x_train_pd.to_numpy()  # raw values, missing entries left in place
x_train_std = KNN_Impute_iqrs(x_train_pd, 5)  # change according to current agreement
#pd.set_option('display.max_columns', 500)
#train_df.head()

# Evaluation Func

In [None]:
### CV Interpretation
def CV_Average(score, msg):
    """Print a short report for a set of cross-validation scores.

    Parameters
    ----------
    score : numpy.ndarray
        Per-fold scores; the number of folds is taken from ``score.shape[0]``.
    msg : str
        Heading printed before the numbers.

    Prints the heading, the mean of the fold scores and the individual
    scores. Nothing is returned.
    """
    n_folds = score.shape[0]
    total = 0
    for fold_score in score:
        total += fold_score
    print(msg)
    print('average: ' + str(total / n_folds))
    print('indiv.: ' + str(score))

In [None]:
### Output Manipulation
def Reg_for_Cla(y):
    """Snap near-integer regression outputs onto the neighbouring integer.

    For every element, the fractional part ``y - floor(y)`` is inspected:
    below 0.1 the value is replaced by ``floor(y)``, above 0.9 by
    ``floor(y) + 1``, and anything in between is left untouched.

    Parameters
    ----------
    y : numpy.ndarray
        Array of predictions. Any dimensionality is accepted (the earlier
        loop-based version only worked on 2-D input).

    Returns
    -------
    numpy.ndarray
        The same array object as ``y``; it is modified in place.

    Notes
    -----
    NaN entries are passed through unchanged, because both threshold
    comparisons are false for NaN.
    """
    lower = np.floor(y)
    frac = y - lower
    snapped = np.where(frac < 0.1, lower, np.where(frac > 0.9, lower + 1, y))
    # Write back through the existing buffer so callers holding a reference
    # to ``y`` observe the update, exactly as with the element-wise loop.
    y[...] = snapped
    return y

# Models

[Hist Gradient Boosting Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html#sklearn.ensemble.HistGradientBoostingClassifier)

[Hist Gradient Boosting Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor)

Best 5-fold CV average: 1.697 (original input with NaN & original output)

Somehow the regressor performs better than the classifier (perhaps because it takes the ordering of the labels into account?)

In [None]:
### Gradient Boosting Decision Tree
def _identity(y):
    """Forward target transform: hand the target through unchanged."""
    return y

# Plain regressor trained on the untouched target.
gbr = make_pipeline(HistGradientBoostingRegressor(loss='absolute_error'))
# Same regressor, but predictions are post-processed by Reg_for_Cla.
# `func` is passed explicitly: newer scikit-learn releases refuse an
# `inverse_func` that comes without a matching `func`, and an explicit
# identity reproduces the behaviour of the former implicit default.
gbr_trans = TransformedTargetRegressor(
    regressor=HistGradientBoostingRegressor(loss='absolute_error'),
    func=_identity,
    inverse_func=Reg_for_Cla,
    check_inverse=False,
)


#   CV: change estimator name
# Scores use "neg_mean_absolute_error", so the printed numbers are negated MAE.
cv_score = cross_val_score(gbr_trans, x_train_std, y_train, cv=5, scoring="neg_mean_absolute_error")
CV_Average(cv_score, "with iqrs impute")
cv_score = cross_val_score(gbr_trans, x_train, y_train, cv=5, scoring="neg_mean_absolute_error")
CV_Average(cv_score, "with NaN")

In [None]:
### Ordered Model
mod_prob = OrderedModel(y_train, x_train_std, distr='logit')
res_prob = mod_prob.fit()
#print(res_prob.summary())

# One row per sample, one column of predicted probability per class.
y_pred_class = np.asarray(res_prob.predict(x_train_std))

# Pick the most probable class per row (first column wins ties, like a
# strict ">" scan) for however many classes the model actually has, instead
# of assuming exactly 10.
best_class = np.argmax(y_pred_class, axis=1)

# Map the column position back to the label it stands for. This assumes the
# model orders its classes by the sorted unique values of y_train
# (NOTE(review): confirm against the statsmodels docs); for labels 0..9 the
# result equals the column index.
class_labels = np.unique(y_train)
y_pred = class_labels[best_class].astype(float)
mean_absolute_error(y_train, y_pred)

# Prediction

In [None]:
### load data
test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv", delimiter=",", header=0)
n_test = test_df.shape[0]

# Submission buffer: column 0 carries the ids, column 1 is left at zero here.
# NOTE: `id` shadows the Python builtin of the same name; the variable name is
# kept so that any other cell referring to it keeps working.
id = test_df[['id']].copy()
id = id.to_numpy()
submit = np.zeros((n_test, 2))
submit[:, 0] = id[:, 0]

# Take exactly the feature columns (and order) used for training rather than
# re-deriving them from the test file's dtypes: a column that pandas happens
# to parse with a different dtype in test.csv would otherwise change the
# feature set. A column missing from test.csv raises a KeyError here.
x_test_pd = test_df[x_train_pd.columns].copy()
x_test = x_test_pd.to_numpy()  # raw values, missing entries left in place
x_test_std = KNN_Impute_iqrs(x_test_pd, 5)  # change according to current agreement
#pd.set_option('display.max_columns', 500)
#x_test_pd.head()

In [None]:
### make prediction
# Train on the raw feature matrix (missing values left in) and predict the
# test rows into the second column of the submission buffer.
gbr = make_pipeline(HistGradientBoostingRegressor(loss='absolute_error')).fit(x_train, y_train)
submit[:, 1] = gbr.predict(x_test)

# ids were stored as floats in the buffer; cast them back to int for the CSV.
df = pd.DataFrame(submit, columns=['id', 'Danceability']).astype({"id": int})
# index=False gets rid of the leading row-index ("0") column
df.to_csv('submission.csv', index=False)