# Package Import

In [None]:
### import
import random as rand
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, ConfusionMatrixDisplay

from scipy.stats import iqr, uniform
!pip install scikit-optimize
from skopt import BayesSearchCV
import xgboost as xgb
import joblib   
#joblib.dump(best_model, 'best_model.joblib')

# Load Data & Preprocess Func


In [3]:
### missing value
# x should be pandas dataframe
def KNN_Impute(x, k):
    knn_impute = KNNImputer(n_neighbors=k) # n_neighbors, weights
    x = knn_impute.fit_transform(x)
    for i in range(x.shape[0]):
        x[i][1] = round(x[i][1])
    return x

def KNN_Impute_iqrs(x, k):
    iqrs = x.apply(lambda x: np.nanquantile(x, 0.75) - np.nanquantile(x, 0.25))
    if x.shape[1] == 17:
        iqrs[13:16] = 1
    x = x / iqrs
    knn_impute = KNNImputer(n_neighbors=k) # n_neighbors, weights
    x = knn_impute.fit_transform(x)
    x = x * iqrs.to_numpy()
    for i in range(x.shape[0]):
        x[i][1] = round(x[i][1])
    return x

In [9]:
### load training data with pandas
train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv", delimiter=",", header=0)
n_train = train_df.shape[0]

# y_train
y_train_pd = train_df[['Danceability']].copy()
y_train    = y_train_pd.to_numpy()
y_train    = np.reshape(y_train, n_train)

train_df = train_df.drop(['Danceability','id','Track','Album','Uri','Url_spotify','Url_youtube','Description','Title','Channel'], axis =1)
album_map = {'album': 3, 'single': 2, 'compilation': 1}
tf_map    = {1: 1, 0: 0}
train_df['Album_type']     = train_df['Album_type'].map(album_map, na_action='ignore')
train_df['official_video'] = train_df['official_video'].map(tf_map, na_action='ignore')
train_df['Licensed']       = train_df['Licensed'].map(tf_map, na_action='ignore')

# x_train125: one hot encoding of artist, composer, album_type
x_train_pd125 = pd.get_dummies(train_df, columns=['Artist','Composer','Album_type'])
x_train125    = x_train_pd125.to_numpy()  
# x_train17: categorical album_type, no artist, composer
x_train_pd17 = train_df.drop(['Composer','Artist'], axis =1)
x_train17 = x_train_pd17.to_numpy()
# x_train14: no album_type, official_video, licensed, artist, composer
x_train_pd14 = x_train_pd17.drop(['Album_type','official_video','Licensed'], axis =1)
x_train14    = x_train_pd14.to_numpy()
#pd.set_option('display.max_columns', 500)                                      
#print(x_train_pd17.head(3))
#print(x_train125.shape)

In [None]:
### Impute
x_train17_knn  = KNN_Impute_iqrs(x_train_pd17, 5)
x_train14_knn  = KNN_Impute_iqrs(x_train_pd14, 5)

# Evaluation Func

In [10]:
### Search Result
def Search_Result(res):
    print("mean")
    print(res.cv_results_['mean_test_score'])
    print("std")
    print(res.cv_results_['std_test_score'])
    print("rank")
    print(res.cv_results_['rank_test_score'])
    print(res.best_params_)
    print(res.best_score_)

In [11]:
### CV Interpretation
def CV_Average(score, msg):
    fold = score.shape[0]
    sum = 0
    for f in range(fold):
        sum += score[f]
    print(msg)
    print('average: ' + str(sum/fold))
    print('indiv.: '+str(score))

In [12]:
### Output Manipulation
# Rounding
def Reg_for_Cla(y):
    y = y.round()
    for i in range(y.shape[0]):
        for j in range (y.shape[1]):
            if   y[i][j] < 0:
                y[i][j] = 0
            elif y[i][j] > 9:
                y[i][j] = 9
    return y

# Decision Stump
def Stump(y_reg, y_cla):
    n = y_reg.shape[0]
    y_reg = y_reg.reshape((n,1))
    y_cla = y_cla.reshape((n,1))
    y = np.concatenate((y_reg, y_cla), axis=1)
    y = y[y[:, 0].argsort()]


    thr = np.empty(9)
    for t in range(9):
        y2 = np.copy(y)
        for i in range (n):
            if y[i][1] <= t:
                y2[i][1] = -1
            else:
                y2[i][1] = 1

        gtrain = np.zeros((n))
        for i in range(n):
            if y2[i][1] == -1:
                gtrain[0] += 1
        for i in range(1, n):
            if y2[i-1][1] == -1:
                gtrain[i] = gtrain[i-1] - 1;
            else:                  
                gtrain[i] = gtrain[i-1] + 1;

        ming = 0
        for i in range(n):
            if gtrain[i] < gtrain[ming]:
                ming = i
        if ming == 0:  
            g = -1;
        else:
            g = (y[ming][0] + y[ming-1][0]) / 2;

        thr[t] = g
        #print(str(t)+': '+str(gtrain[ming]/n))
    return thr

def Stump_Apply(y, stump):
    y_pred = np.copy(y)
    for i in range(y_pred.shape[0]):
        if y_pred[i] < stump[0]:
            y_pred[i] = 0
        elif stump[8] < y_pred[i]:
            y_pred[i] = 9
        else:
            for t in range (1, 9):
                if stump[t-1] < y_pred[i] <= stump[t]:
                    y_pred[i] = t
    return y_pred

# Stump()+Stump_Apply()
def Stump_Set(y_test, y_train_pred, y_train_true):
    stump = Stump(y_train_pred, y_train_true)
    y_test_new = Stump_Apply(y_test, stump)
    return y_test_new

# Models

[Hist Gradient Boosting Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor)

[XGBoost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor)

In [15]:
### XGBoost
x_train = x_train125

xgb_reg = xgb.XGBRegressor(gamma=0.001, learning_rate=0.048391470778895496, max_depth=9, min_child_weight=12.512400985268663, n_estimators=1456)

#consruct bayesSearch object
y_pred = cross_val_predict(xgb_reg, x_train, y_train, cv = 3)

y_pred_int = y_pred.round()
print(mean_absolute_error(y_pred_int, y_train))
#[('gamma', 0.001), ('learning_rate', 0.048391470778895496), ('max_depth', 9), ('min_child_weight', 12.512400985268663), ('n_estimators', 1456)] -------> 1.5835760046592895

In [None]:
### Gradient Boosting Decision Tree
x_train = x_train17 # x_train17_knn

gbr   = make_pipeline(HistGradientBoostingRegressor(loss='absolute_error', categorical_features=[13, 14, 15]))
gbr_p = make_pipeline(HistGradientBoostingRegressor(loss='absolute_error', categorical_features=[13, 14, 15], 
                                                    min_samples_leaf=150, max_leaf_nodes=33, max_depth=14, max_bins=225, learning_rate=0.1, l2_regularization=0.1))
gbr_r = TransformedTargetRegressor(regressor=HistGradientBoostingRegressor(loss='absolute_error', categorical_features=[13, 14, 15], random_state=6211), inverse_func=Reg_for_Cla, check_inverse=False)
gbr_rp= TransformedTargetRegressor(regressor=HistGradientBoostingRegressor(loss='absolute_error', categorical_features=[13, 14, 15], random_state=6211, 
                                                    min_samples_leaf=80, max_leaf_nodes=33, max_depth=40, max_bins=195, l2_regularization=0.03), inverse_func=Reg_for_Cla, check_inverse=False)

In [None]:
### Random Search
param_dist = {'learning_rate': (0.01,0.1,'uniform'),
              'max_depth': (3,10),
              'n_estimators': (100,1500),
              'min_child_weight':(0.5,15), # the smaller, the more easy to overfit
              'gamma': (0.001,0.5)
              }                    
search = BayesSearchCV(estimator=xgb_reg, param_distributions=param_dist, scoring='neg_mean_absolute_error', n_iter=50, cv=5) #Randomized / Bayes
search.fit(x_train, y_train)
Search_Result(search)
#HistGB {'regressor__min_samples_leaf': 80, 'regressor__max_leaf_nodes': 33, 'regressor__max_depth': 40, 'regressor__max_bins': 195, 'regressor__l2_regularization': 0.03} 1.6626

In [None]:
### Grid Search
param_dist = {'regressor__random_state': [1126, 6211, None],
              'regressor__max_depth': [20, 80, 140, 200, None],                     
              'regressor__l2_regularization': [0.01, 0.03, 0.1, 0.3, 1, 3]
              }
search = GridSearchCV(estimator=gbr_r, param_grid=param_dist, scoring='neg_mean_absolute_error', cv=5)
search.fit(x_train, y_train)
Search_Result(search)

In [16]:
### 3:1 Validation
x_train = x_train17
x_train3, x_eval, y_train3, y_eval = train_test_split(x_train, y_train)

xgb_model = xgb_reg.fit(x_train3, y_train3)
y_train_pred = xgb_model.predict(x_train3)
stump = Stump(y_train_pred, y_train3)

y_pred = xgb_model.predict(x_eval)
y_pred_int = y_pred.round()
y_pred_stump = Stump_Apply(y_pred, stump)
print(mean_absolute_error(y_pred, y_eval))  #1.5932098731716182
print(mean_absolute_error(y_pred_int, y_eval))  #1.5804798509201026
print(mean_absolute_error(y_pred_stump, y_eval))  #1.5583508036338225

1.58528304724628
1.5648730491497786
1.5427440018634988


In [None]:
### 5-fold Cross Validation
options = [(gbr, "param 1")]
for model, msg in options:
    cv_score = cross_val_score(model, x_train, y_train, cv=5, scoring="neg_mean_absolute_error")
    CV_Average(cv_score, msg)

In [None]:
### Confusion Matrix on Classifier
x_train = x_train17
x_train3, x_eval, y_train3, y_eval = train_test_split(x_train, y_train)

np.set_printoptions(precision=2)
titles_options = [("Confusion matrix, without normalization", None)]#,
                  #("Normalized confusion matrix", "true"),]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_predictions(y_eval, y_pred,
        display_labels=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
        cmap=plt.cm.Blues,
        normalize=normalize)
    disp.ax_.set_title(title)
plt.show()

# Prediction

In [None]:
### load data
test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv", delimiter=",", header=0)
test_df = test_df.drop(['Track','Album','Uri','Url_spotify','Url_youtube','Description','Title','Channel'], axis =1)
n_test = test_df.shape[0]

id = test_df[['id']].copy()
id = id.to_numpy()
submit = np.zeros((n_test, 2))
submit[:, 0] = id[:, 0]

x_test_pd17 = test_df.drop(['id','Composer','Artist'], axis =1)
x_test_pd14 = x_test_pd17.drop(['Album_type','official_video','Licensed'], axis =1)

album_map = {'album': 3, 'single': 2, 'compilation': 1}
tf_map = {1: 1, 0: 0}
x_test_pd17['Album_type']     = x_test_pd17['Album_type'].map(album_map, na_action='ignore')
x_test_pd17['official_video'] = x_test_pd17['official_video'].map(tf_map, na_action='ignore')
x_test_pd17['Licensed']       = x_test_pd17['Licensed'].map(tf_map, na_action='ignore')
x_test_pd125 = test_df.drop(['id'], axis =1)
x_test_pd125 = pd.get_dummies(test_df, columns=['Artist','Composer','Album_type'])    

x_test17      = x_test_pd17.to_numpy()
x_test14      = x_test_pd14.to_numpy()
x_test125     = x_test_pd125.to_numpy()  
#pd.set_option('display.max_columns', 500)                                      
#print(x_test_pd17.head(3))

In [None]:
### make prediction
x_train = x_train125
x_test  = x_test125 #choose x
model = xgb_reg.fit(x_train, y_train) #change model name
y_train_pred = model.predict(x_train)
y_pred = model.predict(x_test)
y_pred_stump = Stump_Set(y_pred, y_train_pred, y_train)
submit[:, 1] = y_pred_stump
#submit[:, 1] = gbr_rp.predict(x_test) #change model name again

df = pd.DataFrame(submit, columns = ['id','Danceability'])
df = df.astype({"id": int})
df.to_csv('submission.csv', index=False)

# Past Trials


In [None]:
### Parameter Search
param_dist = {'regressor__max_depth': [5, 20, 80, 140, 200, None],                     
              'regressor__l2_regularization': [0.01, 0.03, 0.1, 0.3, 1, 3]}
grid_search = GridSearchCV(estimator=gbr_r, param_grid=param_dist, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(x_train, y_train)

Search_Result(grid_search)
# Result on x_train_17
# 'l2_regularization': 3, 'max_depth': None  cv_socre: 1.6642399534071053
# depth = 5 consistently bad
# depth = none needs at least l2=0.03

In [None]:
### Effect of Random State
param_dist = {'regressor__random_state': [0, 10, 1126, 6211, 100000, None]}                     #255 (<=255)
grid_search = GridSearchCV(estimator=gbr_r, param_grid=param_dist, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(x_train, y_train)

Search_Result(grid_search)
# Done on x_train17
# Result: Use None