# 1

In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.0-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 2.3 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.6.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np

X_train = np.array(pd.read_csv("OnlineAd_X_train.csv", header=None))
Y_train = pd.read_csv("OnlineAd_Y_train.csv", header=None)
Y_train = np.where(Y_train == 1)[1]
print(X_train.shape, Y_train.shape)

(1452, 251) (1452,)


In [3]:
from numpy import unique
# summarize data set
classes = unique(Y_train)
total = len(Y_train)
for c in classes:
    n_examples = len(Y_train[Y_train==c])
    percent = n_examples / total * 100
    print('> Class=%s : %d/%d (%.1f%%)' % ("0 (no click)" if c == 0 else "1 (click A)" if c == 1 else "2 (click B)", n_examples, total, percent))

> Class=0 (no click) : 822/1452 (56.6%)
> Class=1 (click A) : 277/1452 (19.1%)
> Class=2 (click B) : 353/1452 (24.3%)


In [9]:
# common function
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

def shuffle_set(X_train, Y_train):
    train_set = list(zip(X_train, Y_train))
    rd.shuffle(train_set)
    
    X_train, Y_train = zip(*train_set)
    return X_train, Y_train

def k_fold_estimate(k, X_train, Y_train, val_estimators):
    indices = np.array(range(len(X_train)))
    k_indices = np.split(indices,k)

    k_x_groups = [np.array(X_train)[k_indices[i]] for i in range(k)]
    k_y_groups = [np.array(Y_train)[k_indices[i]] for i in range(k)]
    val_scores = []
    for i in range(k):
        x_val = k_x_groups[i]
        y_val = k_y_groups[i]
        x_train = np.concatenate(k_x_groups[:i] + k_x_groups[i+1:])
        y_train = np.concatenate(k_y_groups[:i] + k_y_groups[i+1:])
        val_scores.append(val_estimators[i].score(X_train, Y_train))
    val_scores = np.array(val_scores)
    print("cross-validation mean accuracy:", np.mean(val_scores))
    print("cross-validation accuracies:", val_scores)
    return val_estimators[np.argmax(val_scores)]
    
def k_fold_cv(model, k, X_train, Y_train):
    X_train, Y_train = shuffle_set(X_train, Y_train)
    myscore = make_scorer(roc_auc_score, multi_class='ovo', needs_proba=True)
    cv_result = cross_validate(model, X_train, Y_train, cv=k, scoring=myscore, return_train_score=True, return_estimator=True)

    val_scores = cv_result['test_score']
    val_estimators = cv_result['estimator']
    train_scores = cv_result['train_score']
    
    return val_scores, train_scores, val_estimators, X_train, Y_train

## (a)

### k-Nearest Neighbor

In [7]:
from sklearn.neighbors import KNeighborsClassifier

def k_fold_cv_kNN(k, X_train, Y_train):
    mknn = KNeighborsClassifier()
    return k_fold_cv(mknn, k, X_train, Y_train)

In [16]:
rd.seed(2022)
k = 6

val_scores, train_scores, val_estimators, _, _ =  k_fold_cv_kNN(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)

cross-validation mean score: 0.6423789028094739
cross-validation scores: [0.65314276 0.62302009 0.66291887 0.62454817 0.6205619  0.67008164]
training set scores: [0.83898975 0.83928588 0.83092452 0.84467956 0.83536103 0.83296642]


### Logistic Regression

In [56]:
from sklearn.linear_model import LogisticRegression

def k_fold_cv_LR(k, X_train, Y_train):
    mlgr = LogisticRegression(max_iter=200)
    return k_fold_cv(mlgr, k, X_train, Y_train)

In [38]:
rd.seed(2022)
k = 6

val_scores, train_scores, val_estimators, X_train, Y_train = k_fold_cv_LR(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)

cross-validation mean score: 0.6791401647147784
cross-validation scores: [0.67737637 0.63186559 0.67640261 0.71753519 0.65977127 0.71188996]
training set scores: [0.91525182 0.91991391 0.91159579 0.90340909 0.90876782 0.90199859]


### Linear Discriminant Analysis

In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def k_fold_cv_LDA(k, X_train, Y_train):
    mlda = LinearDiscriminantAnalysis()
    return k_fold_cv(mlda, k, X_train, Y_train)

In [18]:
rd.seed(2022)
k = 6

val_scores, train_scores, val_estimators, _, _ =  k_fold_cv_LDA(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)

cross-validation mean score: 0.6813732666948442
cross-validation scores: [0.69257604 0.63156392 0.70317557 0.7060318  0.65547347 0.6994188 ]
training set scores: [0.87222586 0.8805675  0.87740323 0.86553583 0.87927121 0.87526377]


### Quadratic Discirminant Analysis

In [19]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def k_fold_cv_QDA(k, X_train, Y_train):
    mqda = QuadraticDiscriminantAnalysis()
    return k_fold_cv(mqda, k, X_train, Y_train)

In [21]:
rd.seed(2212)
k = 6

val_scores, train_scores, val_estimators, _, _ =  k_fold_cv_QDA(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)



cross-validation mean score: 0.5363932942467762
cross-validation scores: [0.55250936 0.54957466 0.50274731 0.55689791 0.51873789 0.53789264]
training set scores: [0.98921983 0.98945164 0.98897458 0.98752834 0.98923203 0.98985302]




### Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB

def k_fold_cv_NB(k, X_train, Y_train):
    mnb = GaussianNB()
    return k_fold_cv(mnb, k, X_train, Y_train)

In [24]:
rd.seed(2022)
k = 6

val_scores, train_scores, val_estimators, _, _ =  k_fold_cv_NB(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)

cross-validation mean score: 0.7004772169051768
cross-validation scores: [0.73460441 0.67261962 0.73705572 0.68563607 0.68019995 0.69274753]
training set scores: [0.71053553 0.72244853 0.71242088 0.71922231 0.7180289  0.72011451]


### Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def k_fold_tune_RF(k, X_train, Y_train):
    mrf = RandomForestClassifier(random_state=0, n_jobs=-1)
    params = { 
        'max_depth': [2, 4, 6],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'n_estimators': [100, 150, 200],
        'max_features': ['auto', 'sqrt']
    }
    scorer = make_scorer(roc_auc_score, multi_class='ovo', needs_proba=True)
    grid_cv = GridSearchCV(estimator=mrf, param_grid=params, scoring=scorer, cv=k, refit=True)
    grid_result = grid_cv.fit(X_train, Y_train)

    return grid_result.best_params_

In [26]:
rd.seed(2022)
k = 6

cv_params = k_fold_tune_RF(k, X_train, Y_train)
for key, value in cv_params.items():
    print(key+":", value)

max_depth: 4
max_features: auto
min_samples_leaf: 2
min_samples_split: 10
n_estimators: 150


In [29]:
def k_fold_cv_RF(k, X_train, Y_train):
    mrf = RandomForestClassifier(
        max_depth=4,
        min_samples_split=10,
        min_samples_leaf=2,
        n_estimators=150,
        max_features='auto'
    )
    return k_fold_cv(mrf, k, X_train, Y_train)

In [30]:
rd.seed(2022)
k = 6

val_scores, train_scores, val_estimators, X_train, Y_train =  k_fold_cv_RF(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)

cross-validation mean score: 0.725969363416875
cross-validation scores: [0.74947652 0.70348127 0.76488542 0.70693009 0.69379598 0.7372469 ]
training set scores: [0.84567213 0.85251449 0.85297625 0.85306436 0.84894587 0.85509123]


### Boosting

In [57]:
from sklearn.ensemble import GradientBoostingClassifier

def k_fold_cv_Boost(k, X_train, Y_train):
    n_estimators = np.array([60, 65, 70, 75, 80])
    best_score = 0
    best_learning_rate = 0
    best_n_estimator = 0
    for n_estimator in n_estimators:
        learning_rate = 10 / n_estimator
        mabc = GradientBoostingClassifier(n_estimators=n_estimator, learning_rate=learning_rate)
        val_scores, train_scores, val_estimators, X_train, Y_train = k_fold_cv(mabc, k, X_train, Y_train)
        if(best_score < np.mean(val_scores)):
            best_score = np.mean(val_scores)
            best_val_scores = val_scores
            best_train_scores = train_scores
            best_estimators = val_estimators
            best_n_estimator = n_estimator
            best_learning_rate = learning_rate
            best_X_train = X_train
            best_Y_train = Y_train
    return best_val_scores, best_train_scores, best_estimators, best_n_estimator, best_learning_rate, best_X_train, best_Y_train

In [58]:
import random as rd

rd.seed(2022)
k = 6
val_scores, train_scores, val_estimators, n_estimator, learning_rate, X_train, Y_train  =  k_fold_cv_Boost(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)
print("n_estimator:", n_estimator)
print("learning rate:", learning_rate)

cross-validation mean score: 0.7239251429836396
cross-validation scores: [0.73972452 0.74590032 0.65446805 0.75894031 0.75185619 0.69266146]
training set scores: [0.99779862 0.99874736 0.99808552 0.99877111 0.99711683 0.9988365 ]
n_estimator: 65
learning rate: 0.15384615384615385


In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

def k_fold_tune_XGB1(k, X_train, Y_train):
    param_test1 = {
     'max_depth': [3, 6, 9],
     'min_child_weight': [1, 3, 5]
    }
    mxgb = XGBClassifier(
        learning_rate=0.1, 
        n_estimators=1000, 
        max_depth=5, 
        min_child_weight=1, 
        gamma=0, 
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'binary:logistic', 
        nthread=-1, 
        seed=2022
    )
    
    myscore = make_scorer(roc_auc_score, multi_class='ovo', needs_proba=True)
    gsearch1 = GridSearchCV(
        estimator = mxgb,
        param_grid = param_test1, 
        scoring=myscore,
        n_jobs=-1,
        cv=6, 
        verbose=10
    )
    grid_result = gsearch1.fit(X_train, Y_train)
    return grid_result.best_params_

In [62]:
rd.seed(2022)
k = 6

cv_params = k_fold_tune_XGB1(k, X_train, Y_train)
for key, value in cv_params.items():
    print(key+":", value)

Fitting 6 folds for each of 9 candidates, totalling 54 fits
max_depth: 9
min_child_weight: 1


In [66]:
def k_fold_tune_XGB2(k, X_train, Y_train):
    param_test2 = {
     'gamma':[i/10.0 for i in range(0,5)]
    }
    mxgb = XGBClassifier(
        learning_rate=0.1, 
        n_estimators=1000, 
        max_depth=9, 
        min_child_weight=1, 
        gamma=0, 
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'binary:logistic', 
        nthread=-1, 
        seed=2022
    )
    
    myscore = make_scorer(roc_auc_score, multi_class='ovo', needs_proba=True)
    gsearch2 = GridSearchCV(
        estimator = mxgb,
        param_grid = param_test2, 
        scoring=myscore,
        n_jobs=-1,
        cv=6, 
        verbose=10
    )
    grid_result = gsearch2.fit(X_train, Y_train)
    return grid_result.best_params_

In [67]:
rd.seed(2022)
k = 6

cv_params = k_fold_tune_XGB2(k, X_train, Y_train)
for key, value in cv_params.items():
    print(key+":", value)

Fitting 6 folds for each of 5 candidates, totalling 30 fits
gamma: 0.3


In [86]:
def k_fold_tune_XGB3(k, X_train, Y_train):
    param_test3 = {
     'subsample':[i/10.0 for i in range(6,9)],
     'colsample_bytree':[i/10.0 for i in range(6,9)]
    }

    mxgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000, 
        max_depth=9, 
        min_child_weight=1, 
        gamma=0.3, 
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'binary:logistic', 
        nthread=-1, 
        seed=2022
    )
    
    myscore = make_scorer(roc_auc_score, multi_class='ovo', needs_proba=True)
    gsearch3 = GridSearchCV(
        estimator = mxgb,
        param_grid = param_test3, 
        scoring=myscore,
        n_jobs=-1,
        cv=6, 
        verbose=10
    )
    grid_result = gsearch3.fit(X_train, Y_train)
    return grid_result.best_params_

In [87]:
rd.seed(2022)
k = 6

cv_params = k_fold_tune_XGB3(k, X_train, Y_train)
for key, value in cv_params.items():
    print(key+":", value)

Fitting 6 folds for each of 9 candidates, totalling 54 fits
colsample_bytree: 0.7
subsample: 0.8


In [5]:
def k_fold_cv_XGB(k, X_train, Y_train):
    xgb = XGBClassifier(
        learning_rate=0.1, 
        n_estimators=1000, 
        max_depth=5, 
        min_child_weight=1, 
        gamma=0.3, 
        subsample=0.8, 
        colsample_bytree=0.7,
        objective= 'binary:logistic', 
        nthread=-1, 
        seed=2022
    )
    return k_fold_cv(xgb, k, X_train, Y_train)

In [10]:
import random as rd

rd.seed(2022)
k = 6
val_scores, train_scores, val_estimators, X_train, Y_train = k_fold_cv_XGB(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)

cross-validation mean score: 0.7195497544036722
cross-validation scores: [0.72589001 0.69829325 0.73743113 0.71784359 0.69069007 0.74715049]
training set scores: [1. 1. 1. 1. 1. 1.]


## (b)

In [39]:
rd.seed(2022)
# logistic classifier accuracy without regularization term
k = 6
k_fold_estimate(k, X_train, Y_train, val_estimators)

cross-validation mean accuracy: 0.7310606060606061
cross-validation accuracies: [0.73415978 0.73829201 0.73140496 0.7238292  0.73553719 0.7231405 ]


### Logistic Regression + Lasso

In [47]:
from sklearn.linear_model import LogisticRegression

def k_fold_cv_LR_Lasso(k, X_train, Y_train):
    best_score = 0.0
    best_c = 0
    c_list = [rd.uniform(1e-5,1e2) for _ in range(20)]
    for c in c_list:
        mlgrlasso = LogisticRegression(max_iter=200, penalty='l1', solver='liblinear', C=c)
        val_scores, train_scores, val_estimators, X_train, Y_train = k_fold_cv(mlgrlasso, k, X_train, Y_train)
        if(best_score < np.mean(val_scores)):
            best_score = np.mean(val_scores)
            best_val_scores = val_scores
            best_train_scores = train_scores
            best_estimators = val_estimators
            best_c = c
            best_X_train = X_train
            best_Y_train = Y_train
    return best_val_scores, best_train_scores, best_estimators, best_c, best_X_train, best_Y_train

In [48]:
import random as rd

rd.seed(2022)
k = 6
val_scores, train_scores, val_estimators, c, X_train, Y_train =  k_fold_cv_LR_Lasso(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)
print("regularization strength:", 1/c)

cross-validation mean score: 0.66336519226951
cross-validation scores: [0.69350731 0.68543705 0.61934737 0.65562856 0.65963321 0.66663766]
training set scores: [0.89186569 0.89374422 0.8987719  0.9038029  0.90105394 0.89885405]
regularization strength: 0.26317734500973883


In [49]:
rd.seed(2022)
k = 6
k_fold_estimate(k, X_train, Y_train, val_estimators)

cross-validation mean accuracy: 0.7208448117539027
cross-validation accuracies: [0.71763085 0.71969697 0.71625344 0.72520661 0.7238292  0.72245179]


### Logistic Regression + Ridge

In [50]:
from sklearn.linear_model import LogisticRegression

def k_fold_cv_LR_Ridge(k, X_train, Y_train):
    best_score = 0.0
    best_c = 0
    c_list = [rd.uniform(1e-5,1e2) for _ in range(20)]
    for c in c_list:
        mlgrridge = LogisticRegression(max_iter=200, penalty='l2', solver='liblinear', C=c)
        val_scores, train_scores, val_estimators, X_train, Y_train = k_fold_cv(mlgrridge, k, X_train, Y_train)
        if(best_score < np.mean(val_scores)):
            best_score = np.mean(val_scores)
            best_val_scores = val_scores
            best_train_scores = train_scores
            best_estimators = val_estimators
            best_c = c
            best_X_train = X_train
            best_Y_train = Y_train
    return best_val_scores, best_train_scores, best_estimators, best_c, best_X_train, best_Y_train

In [53]:
import random as rd

rd.seed(2022)
k = 6
val_scores, train_scores, val_estimators, c, X_train, Y_train =  k_fold_cv_LR_Ridge(k, X_train, Y_train)
print("cross-validation mean score:", np.mean(val_scores))
print("cross-validation scores:", val_scores)
print("training set scores:", train_scores)
print("regularization strength:", 1/c)

cross-validation mean score: 0.6643869387299887
cross-validation scores: [0.6470984  0.67496822 0.66376247 0.68504842 0.67238048 0.64306364]
training set scores: [0.90782447 0.90012607 0.89970552 0.90079919 0.90213888 0.90873176]
regularization strength: 0.012761200171725823


In [54]:
rd.seed(2022)
k = 6
k_fold_estimate(k, X_train, Y_train, val_estimators)

cross-validation mean accuracy: 0.7289944903581267
cross-validation accuracies: [0.73071625 0.73140496 0.72520661 0.72727273 0.73415978 0.72520661]


# 2

## (a)

In [16]:
import random as rd
# X
rd.seed(2022)
k = 6
best_estimator = k_fold_estimate(k, X_train, Y_train, val_estimators)

cross-validation mean accuracy: 0.9382460973370064
cross-validation accuracies: [0.93595041 0.93801653 0.93870523 0.94077135 0.93870523 0.93732782]


## (b)

In [17]:
X_test = np.array(pd.read_csv("OnlineAd_X_test.csv", header=None))
Y_predict = np.zeros((300,3)).astype('int64')
Y_pred = best_estimator.predict(X_test)

In [18]:
# Y_pred = best_estimator.predict(X_test)
for i, value in enumerate(Y_pred):
    Y_predict[i, value] = 1

In [19]:
import pandas as pd

dataframe = pd.DataFrame(Y_predict)
dataframe.to_csv("2021-11780_pred.csv", header=None, index=False)