# Package Import

In [2]:
### import
import random as rand
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier, RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, ConfusionMatrixDisplay

from scipy.stats import iqr

from statsmodels.miscmodels.ordinal_model import OrderedModel

# Load Data & Preprocess Func
pandas: y_train_pd, x_train_pd

numpy: y_train, x_train (with missing values), x_train_std (imputed with the currently agreed method)

In [3]:
### missing value
# x should be pandas dataframe
def KNN_Impute(x, k):
    """Fill missing values with k-nearest-neighbour imputation.

    Args:
        x: 2-D feature table that may contain NaN.
        k: Neighbour count forwarded to ``KNNImputer(n_neighbors=...)``.

    Returns:
        The array returned by ``KNNImputer.fit_transform`` with the values
        in column index 1 rounded to whole numbers.
    """
    imputer = KNNImputer(n_neighbors=k)  # other knobs: n_neighbors, weights
    filled = imputer.fit_transform(x)
    # Column 1 is snapped back to whole numbers after imputation
    # (presumably an integer-coded feature -- TODO confirm against the CSV).
    filled[:, 1] = [round(value) for value in filled[:, 1]]
    return filled

def KNN_Impute_iqrs(x, k):
    """KNN-impute missing values after scaling each column by its IQR.

    Every column is divided by its interquartile range (computed while
    ignoring NaN) so that neighbour distances are not dominated by the
    columns with the largest spread. After imputation the scaling is
    undone and column index 1 is rounded to whole numbers.

    Args:
        x: pandas DataFrame of features that may contain NaN (``.apply``
            and column-aligned division are used, so a DataFrame is required).
        k: Neighbour count forwarded to ``KNNImputer(n_neighbors=...)``.

    Returns:
        numpy array in the original units with missing values filled in.
    """
    iqrs = x.apply(lambda col: np.nanquantile(col, 0.75) - np.nanquantile(col, 0.25))
    # A zero IQR would turn the division below into inf/NaN (and KNNImputer
    # rejects inf), so leave such columns unscaled instead.
    iqrs[iqrs == 0] = 1.0

    scaled = x / iqrs
    imputer = KNNImputer(n_neighbors=k)  # other knobs: n_neighbors, weights
    filled = imputer.fit_transform(scaled)
    filled = filled * iqrs.to_numpy()  # back to the original units

    # Column 1 is snapped back to whole numbers after imputation
    # (presumably an integer-coded feature -- TODO confirm against the CSV).
    filled[:, 1] = np.rint(filled[:, 1])
    return filled

In [4]:
### load training data with pandas
train_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/train.csv",
    delimiter=",",
    header=0,
)
n_train = train_df.shape[0]

# Target: kept both as a one-column DataFrame and as a flat vector of length n_train.
y_train_pd = train_df[['Danceability']].copy()
y_train = np.reshape(y_train_pd.to_numpy(), n_train)

# Features: every float-typed column except the target itself.
x_train_pd = (
    train_df.iloc[:, (train_df.dtypes == float).to_numpy()]
    .copy()
    .drop(columns=['Danceability'])
)
x_train = x_train_pd.to_numpy()                # still contains missing values
x_train_std = KNN_Impute_iqrs(x_train_pd, 5)   # change according to current agreement

# Handy when inspecting the frame interactively:
#pd.set_option('display.max_columns', 500)
#train_df.head()

# Evaluation Func

In [5]:
### CV Interpretation
def CV_Average(score, msg):
    """Print a heading, the mean of the fold scores, and the raw scores.

    Args:
        score: Array of per-fold scores; its first axis indexes the folds.
        msg: Heading printed before the numbers.
    """
    n_folds = score.shape[0]
    total = 0
    for idx in range(n_folds):
        total += score[idx]
    print(msg)
    print(f"average: {total / n_folds!s}")
    print(f"indiv.: {score!s}")

In [6]:
### Output Manipulation
def Reg_for_Cla(y, low=0, high=9):
    """Turn regression outputs into class labels by rounding and clipping.

    Works for input of any dimensionality (the previous nested-loop version
    required a 2-D array and failed on 1-D prediction vectors).

    Args:
        y: Array-like of real-valued predictions.
        low: Smallest allowed label (default 0).
        high: Largest allowed label (default 9).

    Returns:
        A new array with the shape of ``y`` whose values are rounded to the
        nearest whole number (ties go to the even neighbour, as with
        ``ndarray.round``) and limited to ``[low, high]``. ``y`` itself is
        not modified.
    """
    return np.clip(np.round(y), low, high)

# Models

[Hist Gradient Boosting Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html#sklearn.ensemble.HistGradientBoostingClassifier)

[Hist Gradient Boosting Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor)

Best 5-fold CV average MAE: 1.687 (original input with NaN, rounded output)

Somehow the regressor does better than the classifier (maybe because it takes the ordering of the labels into account?)

In [7]:
### Gradient Boosting Decision Tree
def _identity(y):
    """Return the target unchanged; explicit forward transform for gbr_round."""
    return y


# Plain regressor trained on absolute error.
gbr = make_pipeline(HistGradientBoostingRegressor(loss='absolute_error'))

# Same regressor, but predictions are rounded to whole numbers on the way out.
# `func` is passed explicitly because recent scikit-learn releases raise in
# fit() when `inverse_func` is given without `func`; the identity keeps the
# training target untouched, so behaviour on older releases is unchanged.
gbr_round = TransformedTargetRegressor(
    regressor=HistGradientBoostingRegressor(loss='absolute_error'),
    func=_identity,
    inverse_func=np.round,
    check_inverse=False,  # rounding is deliberately not an inverse of the identity
)

In [None]:
### Random Forest
# Per the original note, this model can only be used with imputed data,
# which is why it is fitted on x_train_std rather than x_train.
rf = make_pipeline(
    RandomForestRegressor(
        criterion='absolute_error',
        max_depth=2,
        max_samples=0.3,
    )
)
rf.fit(x_train_std, y_train)

In [15]:
### Cross Validation: change estimator name & x
# Score the raw and the rounded regressor with 5-fold CV on negated MAE and
# print a summary for each; cv_score ends up holding the "with rounding" run.
for estimator, label in ((gbr, "original output"), (gbr_round, "with rounding")):
    cv_score = cross_val_score(
        estimator,
        x_train,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
    )
    CV_Average(cv_score, label)

original output
average: -1.6955145926034483
indiv.: [-1.65224929 -1.69621278 -1.67112583 -1.70574927 -1.75223579]
with rounding
average: -1.6870704717530576
indiv.: [-1.64676762 -1.68200349 -1.65492137 -1.6924869  -1.75917298]


In [None]:
### Confusion Matrix on Classifier
# Hold out part of the training data (train_test_split defaults, fixed seed)
# and fit a classifier on the rest.
x_train3, x_eval, y_train3, y_eval = train_test_split(x_train, y_train, random_state=0)
gbr_class = make_pipeline(HistGradientBoostingClassifier())
gbr_model = gbr_class.fit(x_train3, y_train3)

np.set_printoptions(precision=2)

# One plot with raw counts, one normalized over the true labels.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        gbr_model,
        x_eval,
        y_eval,
        display_labels=[str(digit) for digit in range(10)],  # '0' .. '9'
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)
    print(title)
    print(disp.confusion_matrix)

plt.show()

# Prediction

In [None]:
### load data
test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv", delimiter=",", header=0)
n_test = test_df.shape[0]

# NOTE(review): `id` shadows the builtin of the same name for the rest of the
# kernel session; the name is kept only because other cells may reference it.
id = test_df[['id']].to_numpy()

# Two-column submission buffer: column 0 = id, column 1 = prediction (filled later).
submit = np.zeros((n_test, 2))
submit[:, 0] = id[:, 0]

# Take exactly the training feature columns, in training order, rather than
# re-deriving them from the test file's dtypes: a column parsed with a
# different dtype (or stored in a different order) in test.csv would
# otherwise silently change the feature matrix handed to the model.
x_test_pd = test_df.loc[:, list(x_train_pd.columns)].astype(float)
x_test = x_test_pd.to_numpy()                # still contains missing values
x_test_std = KNN_Impute_iqrs(x_test_pd, 5)   # change according to current agreement

In [None]:
### make prediction
# Fit and predict with the SAME estimator. The previous version fitted
# gbr_round but predicted with gbr, which is never fitted in this notebook
# (cross_val_score only fits clones), so it fails on a fresh kernel.
gbr_round.fit(x_train, y_train)
submit[:, 1] = gbr_round.predict(x_test)

# Write the submission file; ids are cast back to int because the buffer is float.
df = pd.DataFrame(submit, columns=['id', 'Danceability'])
df = df.astype({"id": int})
df.to_csv('submission.csv', index=False)