In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import matplotlib.patches as patches

In [2]:
# Load the banknote measurements; pandas assigns a default integer index.
bn = pd.read_csv('Banknote-authentication-dataset.csv')
print(bn.shape)  # (rows, columns)
bn.head()

(1372, 2)


Unnamed: 0,V1,V2
0,3.6216,8.6661
1,4.5459,8.1674
2,3.866,-2.6383
3,3.4566,9.5228
4,0.32924,-4.4552


In [3]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column.
bn.describe()

Unnamed: 0,V1,V2
count,1372.0,1372.0
mean,0.433735,1.922353
std,2.842763,5.869047
min,-7.0421,-13.7731
25%,-1.773,-1.7082
50%,0.49618,2.31965
75%,2.821475,6.814625
max,6.8248,12.9516


In [4]:
# Pull out the two feature columns and stack them side by side into a
# two-column array.
v1, v2 = bn['V1'], bn['V2']
rd = np.column_stack([v1, v2])

# Column-wise mean and standard deviation.  NumPy's std defaults to the
# population estimate (ddof=0), so these values differ slightly from the
# sample std that DataFrame.describe() reports.
means = rd.mean(axis=0)
std = rd.std(axis=0)

In [5]:
# Show the per-feature means, then the per-feature standard deviations.
print(means, std, sep='\n')

[0.43373526 1.92235312]
[2.84172641 5.86690749]


In [6]:
# First entry of the per-column means array.
means[0]

0.43373525728862977

In [7]:
# Visualise the data.
# A scatter plot suits two numeric features.  The second scatter marks the
# overall mean, and the shaded ellipse is centred on it with width/height of
# twice the per-feature std (one standard deviation either side of the mean).
fig, graph = plt.subplots()
graph.scatter(bn['V1'], bn['V2'])
graph.scatter(means[0], means[1])
ellipse = patches.Ellipse(
    xy=(means[0], means[1]),
    width=2 * std[0],
    height=2 * std[1],
    alpha=0.3,
)
graph.add_patch(ellipse)
graph.set_xlabel('V1')
graph.set_ylabel('V2')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'V2')

In [8]:
# Normalising the dataset: gather the per-feature mean, maximum and minimum.
v1m, v2m = v1.mean(), v2.mean()
v1max, v2max = v1.max(), v2.max()
v1min, v2min = v1.min(), v2.min()

In [9]:
# Mean normalisation: centre each feature on its mean and scale by its range.
v1n = (v1 - v1m) / (v1max - v1min)
v2n = (v2 - v2m) / (v2max - v2min)

In [10]:
# Combine the normalised features into a two-column frame for clustering.
v12n = np.column_stack([v1n, v2n])
data = pd.DataFrame(data=v12n, columns=['v1', 'v2'])
data

Unnamed: 0,v1,v2
0,0.229890,0.252341
1,0.296545,0.233681
2,0.247515,-0.170653
3,0.217991,0.284398
4,-0.007536,-0.238639
...,...,...
1367,-0.001990,-0.021447
1368,-0.131423,-0.254433
1369,-0.301728,-0.575533
1370,-0.288272,-0.385600


In [11]:
# Fit a 2-cluster k-means model on the normalised features.
# - random_state pins the centroid initialisation so the 0/1 cluster labels
#   are the same on every run; later cells interpret those labels by value.
#   NOTE(review): with a fixed seed the labels may be swapped relative to the
#   outputs saved in this notebook -- re-check after re-running.
# - n_init=10 is given explicitly so the number of restarts does not depend
#   on the installed scikit-learn version's default.
# - Only the feature columns are selected, so re-running this cell after a
#   label column has been added to `data` still clusters on v1/v2 alone.
km_res = KMeans(n_clusters=2, n_init=10, random_state=42).fit(data[['v1', 'v2']])

In [12]:
# Coordinates of the fitted cluster centres (one row per cluster).
clusters=km_res.cluster_centers_

In [13]:
# Display the cluster-centre array.
clusters

array([[ 0.13368173,  0.10968233],
       [-0.17098826, -0.14029136]])

In [23]:
# Record each row's k-means cluster assignment in a 'label' column.
data.loc[:, 'label'] = km_res.labels_
data

Unnamed: 0,v1,v2,label
0,0.229890,0.252341,0
1,0.296545,0.233681,0
2,0.247515,-0.170653,0
3,0.217991,0.284398,0
4,-0.007536,-0.238639,1
...,...,...,...
1367,-0.001990,-0.021447,0
1368,-0.131423,-0.254433,1
1369,-0.301728,-0.575533,1
1370,-0.288272,-0.385600,1


In [56]:
from matplotlib.lines import Line2D

# One colour per k-means label, indexed by the label value, so the scatter
# points and the legend entries always agree.  Previously the points were
# coloured through the default colormap (label 0 dark purple, label 1 yellow
# under matplotlib's default viridis map) while the legend listed yellow for
# cluster 0 and purple for cluster 1, i.e. the legend was inverted.
colors = ['#8F00FF', '#ffd343']  # index 0 -> purple, index 1 -> yellow
point_colors = [colors[label] for label in km_res.labels_]

plt.figure(figsize=(7,4))
plt.scatter(data['v1'],data['v2'],c=point_colors)
# Large translucent markers show the two cluster centres.
plt.scatter(clusters[:,0],clusters[:,1],s=700,alpha=0.45)
plt.xlabel('V1 - Norm')
plt.ylabel('V2 - Norm')

# Create a list of legend elements: one round marker per cluster, drawn in
# the same colour used for that cluster's points.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', label='Cluster {}'.format(i),
           markerfacecolor=mcolor, markersize=5)
    for i, mcolor in enumerate(colors)
]
plt.legend(handles=legend_elements, loc='lower right')
# NOTE(review): the title assumes yellow (cluster 1) = real notes and
# purple (cluster 0) = forged; k-means is unsupervised, so confirm this
# mapping against ground-truth classes.
plt.title('''K-Means Clustering distinguishing:
The real bank notes with Yellow color and purple for forged notes.''')
plt.show()

<IPython.core.display.Javascript object>

In [17]:
# Build the row -> cluster lookup explicitly: `cluster_map` was not defined
# anywhere above, so this cell failed with NameError on a fresh kernel.
# It shares `data`'s index so the join below aligns row for row.
cluster_map = pd.Series(km_res.labels_, index=data.index, name='cluster')

# Join onto the feature columns only, so the result is always
# (v1, v2, cluster) regardless of any extra columns `data` has picked up.
clustered_data = data[['v1', 'v2']].join(cluster_map)
clustered_data

Unnamed: 0,v1,v2,cluster
0,0.229890,0.252341,0
1,0.296545,0.233681,0
2,0.247515,-0.170653,0
3,0.217991,0.284398,0
4,-0.007536,-0.238639,1
...,...,...,...
1367,-0.001990,-0.021447,0
1368,-0.131423,-0.254433,1
1369,-0.301728,-0.575533,1
1370,-0.288272,-0.385600,1


In [18]:
# Rows that k-means assigned to cluster 1.
one = clustered_data.loc[clustered_data['cluster'].eq(1)]
one

Unnamed: 0,v1,v2,cluster
4,-0.007536,-0.238639,1
7,0.119599,-0.326752,1
18,0.073136,-0.254512,1
21,-0.007538,-0.238639,1
25,0.019613,-0.277663,1
...,...,...,...
1366,-0.205074,0.068137,1
1368,-0.131423,-0.254433,1
1369,-0.301728,-0.575533,1
1370,-0.288272,-0.385600,1


In [19]:
# Rows that k-means assigned to cluster 0.
zero = clustered_data.loc[clustered_data['cluster'].eq(0)]
zero

Unnamed: 0,v1,v2,cluster
0,0.229890,0.252341,0
1,0.296545,0.233681,0
2,0.247515,-0.170653,0
3,0.217991,0.284398,0
5,0.283745,0.289973,0
...,...,...,...
1351,0.063790,-0.000773,0
1352,0.076143,0.056369,0
1353,-0.022919,0.048627,0
1359,-0.026510,0.021293,0


In [20]:
# Summary statistics of the normalised features for each subset
# (d1 from `one`, d2 from `zero`), with the index axis named 'Measures'.
d1 = one[['v1', 'v2']].describe().rename_axis(index='Measures')
d2 = zero[['v1', 'v2']].describe().rename_axis(index='Measures')

# Place the two summaries side by side; the suffixes keep the clashing
# v1/v2 column names apart until they are given readable labels.
# NOTE(review): `one` is labelled "legit" and `zero` "forged" here; nothing
# in this cell verifies that mapping -- confirm against ground truth.
data1 = d1.join(d2, lsuffix='_left', rsuffix='_right').rename(
    index={'std': 'deviation'},
    columns={
        'v1_left': 'legit v1',
        'v2_left': 'legit v2',
        'v1_right': 'forged v1',
        'v2_right': 'forged v2',
    },
)

# Show only the rows at positions 1 and 2 (mean and deviation).
data1.iloc[1:3]

Unnamed: 0_level_0,legit v1,legit v2,forged v1,forged v2
Measures,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mean,-0.169555,-0.13949,0.134536,0.11068
deviation,0.135632,0.19658,0.140927,0.167841


In [21]:
# Display the full side-by-side summary table.
data1

Unnamed: 0_level_0,legit v1,legit v2,forged v1,forged v2
Measures,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
count,607.0,607.0,765.0,765.0
mean,-0.169555,-0.13949,0.134536,0.11068
deviation,0.135632,0.19658,0.140927,0.167841
min,-0.539114,-0.587301,-0.229008,-0.274875
25%,-0.253624,-0.306816,0.032897,-0.011228
50%,-0.162663,-0.095626,0.150644,0.138443
75%,-0.08184,0.001976,0.248813,0.253194
max,0.166062,0.287339,0.460886,0.412699
