## Homework 1 for the Machine Learning lecture

In [452]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.svm as svm
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
import scipy.spatial.distance
from collections import Counter
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [453]:
## Implementation of a KNN classifier without sklearn, which takes k and metric params
class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier built on SciPy distances.

    Each query row is compared against every stored training row with
    ``scipy.spatial.distance.cdist``; the ``k`` closest training rows vote
    and the most frequent label wins.  When several labels are tied, the
    label of the closest neighbour among the tied ones is returned.

    Parameters
    ----------
    k : int, default=5
        Number of neighbours that vote on each prediction (must be >= 1).
    metric : str, default='euclidean'
        Metric name passed unchanged to ``cdist``.

    Attributes
    ----------
    X_train, y_train
        Training data exactly as passed to ``fit`` (``None`` before fitting).
    X_test
        The last feature matrix passed to ``predict``.
    y_test
        Initialised to ``None``; no method of this class assigns it.
    classes
        Unique labels seen in ``fit`` (``np.unique(y_train)``).
    """

    def __init__(self, k=5, metric='euclidean'):
        self.k = k
        self.metric = metric
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.classes = None

    def get_params(self, deep=True):
        """Return the constructor hyper-parameters as a ``{name: value}`` dict.

        ``deep`` is accepted for compatibility with the scikit-learn
        estimator protocol; this estimator has no nested estimators, so the
        flag has no effect.
        """
        return {'k': self.k, 'metric': self.metric}

    def set_params(self, **parameters):
        """Set attributes from keyword arguments and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X_train, y_train):
        """Memorise the training data and return ``self``.

        The inputs are stored as given; conversion to NumPy arrays happens
        lazily in ``predict`` so the stored attributes keep their original
        type.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.classes = np.unique(y_train)
        return self

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Anything ``np.asarray`` turns into a 2-D array (ndarray,
            DataFrame, nested list).

        Returns
        -------
        list
            One predicted label per row of ``X_test``.

        Raises
        ------
        ValueError
            If the model has not been fitted, ``k`` is not a positive
            integer, or ``X_test`` is not 2-dimensional.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError(
                "KNNClassifier must be fitted before calling predict()")
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(
                "k must be a positive integer, got {!r}".format(self.k))

        self.X_test = X_test

        # Work on NumPy views so pandas objects, ndarrays and lists all behave
        # the same; labels are then indexed by position, never by index label.
        train_X = np.asarray(self.X_train)
        train_y = np.asarray(self.y_train).ravel()
        queries = np.asarray(X_test)
        if queries.ndim != 2:
            raise ValueError(
                "X_test must be 2-dimensional, got {} dimension(s)".format(
                    queries.ndim))

        y_pred = []
        for row in queries:
            # Distance from this query row to every training row.
            distances = scipy.spatial.distance.cdist(train_X,
                                                     row.reshape(1, -1),
                                                     metric=self.metric)
            # A stable sort keeps equally distant neighbours in training
            # order, so the selection of the k nearest is deterministic.
            nearest = np.argsort(distances.ravel(), kind='stable')[:self.k]
            # Counter preserves insertion order, so on a tied vote the label
            # of the closest neighbour wins.
            votes = Counter(train_y[nearest])
            y_pred.append(votes.most_common(1)[0][0])
        return y_pred

    def score(self, X_test, y_test):
        """Return the accuracy of ``predict(X_test)`` against ``y_test``.

        Both sides are converted to NumPy arrays before comparing, so the
        comparison is always element-wise (comparing two plain lists with
        ``==`` would yield a single bool instead).
        """
        predicted = np.asarray(self.predict(X_test))
        expected = np.asarray(y_test).ravel()
        return np.mean(predicted == expected)


In [454]:
# Load the semicolon-separated bank data, discard rows with missing values
# and remove the two calendar columns.
bank_data = (
    pd.read_csv('bank.csv', sep=';')
    .dropna()
    .drop(columns=['month', 'day'])
)

# Split into the feature frame and the target column 'y'.
X = bank_data.drop(columns=['y'])
Y = bank_data['y']
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,329,5,-1,0,unknown
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,153,1,-1,0,unknown
4518,57,technician,married,secondary,no,295,no,no,cellular,151,11,-1,0,unknown
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,129,4,211,3,other


In [455]:
# Encoders for the categorical columns.
ordinalEncoder = OrdinalEncoder()
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# keyword was later removed; try the new name first and fall back so the cell
# runs on both old and new releases.
try:
    oneHotEncoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    oneHotEncoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Not used by the cells visible here; kept in case other cells rely on it.
labelEncoder = LabelEncoder()

# 'education' and 'poutcome' become integer codes, the listed columns are
# one-hot encoded, and every remaining column is passed through unchanged.
column_transformer = make_column_transformer(
    (ordinalEncoder, ['education', 'poutcome']), (oneHotEncoder, [
        'job',
        'housing',
        'marital',
        'default',
        'loan',
        'contact',
    ]),
    remainder='passthrough')

# Hold out 20% of the rows for testing.  The fixed seed makes the split, and
# therefore every metric reported afterwards, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
y_test


2578     no
3704     no
1595     no
1913    yes
3591     no
       ... 
221      no
3763     no
201      no
4091     no
1209     no
Name: y, Length: 905, dtype: object

In [456]:
# Baseline model: encode the categorical columns, standardise all features,
# then classify with the custom KNN implementation.
pipeline = Pipeline([('column_transformer', column_transformer),
                     ('scaler', StandardScaler()),
                     ('classifier', KNNClassifier(k=4, metric='minkowski'))])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy : ", accuracy_score(y_test, y_pred))
print('confusion matrix : \n', cm)

# Compare distance metrics with 10 random 70/30 splits of the training data.
# NOTE(review): 'minkowski' is passed to cdist without an explicit `p`; if the
# SciPy default (p=2) applies it duplicates 'euclidean' - confirm whether a
# different `p` was intended.
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
grid = GridSearchCV(pipeline,
                    param_grid={
                        'classifier__metric':
                        ['euclidean', 'cityblock', 'cosine', 'minkowski']
                    },
                    cv=cv)
grid.fit(X_train, y_train)

# The test set is scored only once here: each score() call re-runs the
# brute-force neighbour search over the whole test set, so calls whose result
# is discarded are pure overhead.
print('best params', grid.best_params_)
print('best score:', grid.best_score_)
print('test score:', grid.score(X_test, y_test))


Accuracy :  0.8795580110497238
confusion matrix : 
 [[770  26]
 [ 83  26]]
best params {'classifier__metric': 'euclidean'}
best score: 0.8802764976958526
test score: 0.8795580110497238
