# Similarity & kNN

## Euclidean distance

In [1]:
import pandas as pd
from sklearn.metrics import euclidean_distances

x = pd.DataFrame({'age':[23,40],
                  'year':[2,10],
                  'resident':[2,1]})
euclidean_distances(x)

array([[ 0.        , 18.81488772],
       [18.81488772,  0.        ]])

## Similarity

### Part 1: Load Data

In [2]:
# load bank-data "bank-data.csv
bankData = pd.read_csv("bank-data.csv", sep=';')

### Part 2: Preprocess Data

In [3]:
#Binary encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
bankData['default'] = le.fit_transform(bankData['default'])
bankData['housing'] = le.fit_transform(bankData['housing'])
bankData['loan'] = le.fit_transform(bankData['loan'])
bankData['y'] = le.fit_transform(bankData['y'])
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0


In [4]:
#Convert categorical variables into dummy columns
bankData = pd.concat([bankData,pd.get_dummies(bankData['job'],prefix='job')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['marital'],prefix='marital')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['education'],prefix='education')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['contact'],prefix='contact')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['month'],prefix='month')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['poutcome'],prefix='poutcome')],axis=1)

bankData = bankData.drop(columns=['job', 'marital', 'education','contact', 'month', 'poutcome'])

In [5]:
bankData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  4521 non-null   int64
 1   default              4521 non-null   int32
 2   balance              4521 non-null   int64
 3   housing              4521 non-null   int32
 4   loan                 4521 non-null   int32
 5   day                  4521 non-null   int64
 6   duration             4521 non-null   int64
 7   campaign             4521 non-null   int64
 8   pdays                4521 non-null   int64
 9   previous             4521 non-null   int64
 10  y                    4521 non-null   int32
 11  job_admin.           4521 non-null   uint8
 12  job_blue-collar      4521 non-null   uint8
 13  job_entrepreneur     4521 non-null   uint8
 14  job_housemaid        4521 non-null   uint8
 15  job_management       4521 non-null   uint8
 16  job_retired          452

#### Data prep

In [6]:
#Train/Test separation (hold out method)
from sklearn.model_selection import train_test_split
y = bankData['y']
X = bankData.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)
X_train.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
1143,57,0,501,1,1,7,59,2,-1,0,...,0,0,0,0,0,0,0,0,0,1
1083,64,0,768,0,0,15,250,2,182,1,...,0,0,0,0,0,0,0,0,1,0
1398,34,0,341,0,1,29,136,1,262,3,...,0,0,0,0,0,0,1,0,0,0
3699,50,0,253,1,1,5,475,2,-1,0,...,0,0,1,0,0,0,0,0,0,1
3743,42,0,-247,1,1,21,519,1,166,1,...,0,0,0,0,1,0,0,1,0,0


### Part 3: Data processing

#### 1. Distances

In [7]:
# Euclidean distance is suitable for most of numeric data
euclidean_distances(bankData[0:3])

array([[   0.        , 3024.4973136 ,  558.4012894 ],
       [3024.4973136 ,    0.        , 3439.19670854],
       [ 558.4012894 , 3439.19670854,    0.        ]])

In [8]:
# Manhattan distance is less sensitive to outlier
from sklearn.metrics.pairwise import manhattan_distances
manhattan_distances(bankData[0:3])

array([[   0., 3508.,  894.],
       [3508.,    0., 3502.],
       [ 894., 3502.,    0.]])

In [9]:
# Distance with scaling data
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

bankData_scaled = pd.DataFrame(
                    mms.fit_transform(bankData),
                    columns=bankData.columns)

euclidean_distances(bankData_scaled[0:3])

array([[0.        , 3.20226295, 3.3410116 ],
       [3.20226295, 0.        , 3.00755954],
       [3.3410116 , 3.00755954, 0.        ]])

In [10]:
manhattan_distances(bankData_scaled[0:3])

array([[ 0.        , 10.94766062, 11.63406998],
       [10.94766062,  0.        ,  9.38414555],
       [11.63406998,  9.38414555,  0.        ]])

#### 2. Look-alike

In [11]:
d = euclidean_distances(bankData_scaled,
                        bankData_scaled[0:1])

result = bankData
result['d'] = d
result.sort_values(by='d')

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,d
0,30,0,1787,0,0,19,79,1,-1,0,...,0,0,0,1,0,0,0,0,1,0.000000
3083,41,0,819,0,0,9,244,3,-1,0,...,0,0,0,0,0,0,0,0,1,1.463590
2041,73,0,154,0,0,15,103,1,-1,0,...,0,0,0,1,0,0,0,0,1,1.555054
108,56,0,3391,0,0,21,243,1,-1,0,...,0,0,0,0,0,0,0,0,1,1.775964
2043,52,0,255,0,1,10,374,3,-1,0,...,0,0,0,0,0,0,0,0,1,1.790613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4213,30,0,-522,1,1,5,670,2,286,6,...,0,1,0,0,0,1,0,0,0,3.663789
2714,47,0,477,1,0,7,973,1,366,1,...,0,1,0,0,0,1,0,0,0,3.672753
1223,49,0,2370,1,1,17,56,1,103,2,...,0,0,1,0,0,0,1,0,0,3.755431
3652,29,0,1070,1,0,19,30,1,357,1,...,0,1,0,0,0,0,1,0,0,3.764402


#### 3. kNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5,
                           metric='euclidean',
                           n_jobs=-1)
clf.fit(X_train,y_train)

KNeighborsClassifier(metric='euclidean', n_jobs=-1)

In [13]:
res = clf.predict(X_test)
pd.crosstab(y_test, res)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1559,52
1,160,38


In [14]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

print("Accuracy:\t %.3f" %accuracy_score(y_test, res))
print(classification_report(y_test, res))

Accuracy:	 0.883
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1611
           1       0.42      0.19      0.26       198

    accuracy                           0.88      1809
   macro avg       0.66      0.58      0.60      1809
weighted avg       0.85      0.88      0.86      1809



#### Weighted voting kNN

In [15]:
clf1 = KNeighborsClassifier(n_neighbors=5,
                            metric='euclidean',
                            n_jobs=-1,
                            weights='distance')
clf1.fit(X_train,y_train)

KNeighborsClassifier(metric='euclidean', n_jobs=-1, weights='distance')

In [16]:
#test and evaluate
res1 = clf1.predict(X_test)
pd.crosstab(y_test, res1)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1555,56
1,157,41


In [17]:
print("Accuracy:\t %.3f" %accuracy_score(y_test, res1))
print(classification_report(y_test, res1))

Accuracy:	 0.882
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1611
           1       0.42      0.21      0.28       198

    accuracy                           0.88      1809
   macro avg       0.67      0.59      0.61      1809
weighted avg       0.86      0.88      0.86      1809



In [18]:
bankData_scaled.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0.161765,0.0,0.068455,0.0,0.0,0.6,0.024826,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.205882,0.0,0.10875,1.0,1.0,0.333333,0.0715,0.0,0.389908,0.16,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.235294,0.0,0.06259,1.0,0.0,0.5,0.059914,0.0,0.379587,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.161765,0.0,0.064281,1.0,1.0,0.066667,0.064548,0.061224,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.588235,0.0,0.044469,1.0,0.0,0.133333,0.073486,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# bankData_scaled

In [19]:
#Train/Test separation (hold out method)
# bankData_scaled
from sklearn.model_selection import train_test_split
y_scaled = bankData_scaled['y']
X_scaled = bankData_scaled.drop(columns='y')
X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled,y_scaled,test_size=0.4)
X_scaled_train.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
131,0.264706,0.0,0.044469,1.0,0.0,0.833333,0.00662,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4210,0.470588,0.0,0.045516,1.0,0.0,0.866667,0.238001,0.020408,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2830,0.161765,0.0,0.080415,1.0,0.0,0.566667,0.006289,0.081633,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
650,0.205882,0.0,0.362089,1.0,0.0,0.5,0.064548,0.020408,0.168578,0.08,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1976,0.117647,0.0,0.045986,0.0,0.0,0.7,0.011586,0.326531,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier
clf_scaled = KNeighborsClassifier(n_neighbors=5,
                           metric='euclidean',
                           n_jobs=-1)
clf_scaled.fit(X_scaled_train,y_scaled_train)

KNeighborsClassifier(metric='euclidean', n_jobs=-1)

In [21]:
res_scaled = clf_scaled.predict(X_scaled_test)
pd.crosstab(y_scaled_test, res_scaled)

col_0,0.0,1.0
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1563,20
1.0,206,20


In [22]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

print("Accuracy:\t %.3f" %accuracy_score(y_scaled_test, res_scaled))
print(classification_report(y_scaled_test, res_scaled))

Accuracy:	 0.875
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.93      1583
         1.0       0.50      0.09      0.15       226

    accuracy                           0.88      1809
   macro avg       0.69      0.54      0.54      1809
weighted avg       0.84      0.88      0.83      1809



#### Weighted voting kNN

In [23]:
clf2 = KNeighborsClassifier(n_neighbors=5,
                            metric='euclidean',
                            n_jobs=-1,
                            weights='distance')
clf2.fit(X_scaled_train,y_scaled_train)

KNeighborsClassifier(metric='euclidean', n_jobs=-1, weights='distance')

In [24]:
#test and evaluate
res2 = clf2.predict(X_scaled_test)
pd.crosstab(y_scaled_test, res2)

col_0,0.0,1.0
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1551,32
1.0,203,23


In [25]:
print("Accuracy:\t %.3f" %accuracy_score(y_scaled_test, res2))
print(classification_report(y_scaled_test, res2))

Accuracy:	 0.870
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93      1583
         1.0       0.42      0.10      0.16       226

    accuracy                           0.87      1809
   macro avg       0.65      0.54      0.55      1809
weighted avg       0.83      0.87      0.83      1809



#### Summary between bankData vs bankData_scaled