# K-means clustering

In [77]:
import pandas as pd
import numpy as np

In [78]:
data = pd.read_csv('data/processed_data.csv')
data = data[['text','y']]

data.head()

Unnamed: 0,text,y
0,rock destined 21st century new conan he going ...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tootepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerges something rare issue thats honest keen...,1


# Build K-means clustering from scratch with Python

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cdist

In [81]:
# vectorize data to dictionary
def vectorize_data(data):
    vectorizer = CountVectorizer()
    x = vectorizer.fit_transform(data)
    vectorized_data = x.toarray()
    return vectorized_data

In [80]:
# plit data to train and test set
def train_test(data):
    np.random.seed(3) 
    msk = np.random.rand(len(data)) < 0.8
    train_data = data[msk]
    test_data = data[~msk]
    return train_data,test_data

In [82]:
# initialize centers for dataset
def init_centers(vectorized_data):
    return np.array([np.mean(vectorized_data[-5331:],axis = 0),np.mean(vectorized_data[0:5330],axis = 0)])

In [83]:
# assign new label for all data point
def assign_labels(vectorized_data,centers):
    distance = cdist(centers,vectorized_data)
    return np.argmin(distance, axis = 0)

In [84]:
# update center
def update_centers(vectorized_data,labels):
    centers = np.zeros((2,vectorized_data.shape[1]))
    pos_data = vectorized_data[labels == 1,:]
    pos_center = np.mean(pos_data,axis = 0)
    centers[1, :] = pos_center
    neg_data = vectorized_data[labels == 0,:]
    neg_center = np.mean(neg_data,axis = 0)
    centers[0, :] = neg_center
    return centers

In [85]:
#  check if no changing then stop 
def check_converge(centers,new_centers):
    return (set([tuple(a) for a in centers]) == set([tuple(a) for a in new_centers]))
       

In [86]:
# create data 
vectorized_data = vectorize_data(data['text'])
v_train,v_test = train_test(vectorized_data)
y_train,y_test = train_test(data['y'])

# init center
centers = init_centers(v_train)
iter = 0

while True:
    # assign labels
    labels = assign_labels(v_train,centers)
    
    # update center
    new_centers = update_centers(v_train,labels)
    
    # breaking condition
    if check_converge(centers,new_centers):
        break
    
    # final center
    centers = new_centers
    iter += 1
print("centers founded after %d iteration : \n"%(iter),centers)
    

centers founded after 8 iteration : 
 [[0.         0.00067751 0.00067751 ... 0.         0.         0.        ]
 [0.         0.         0.00014134 ... 0.00014134 0.00014134 0.00014134]]


In [87]:
y_test = y_test
test_labels = assign_labels(v_test,centers)
print("some prediction : ",test_labels[0:10])
print(1 - np.absolute(np.sum(test_labels - y_test)) / len(y_test))


some prediction :  [0 1 1 1 1 0 0 1 1 1]
0.6707721459024159


# Use K-means model in Scikit-Learn

In [88]:
# use scikit-learn
from sklearn.cluster import KMeans
init_centers = init_centers(vectorized_data)
kmeans = KMeans(n_clusters=2,init = init_centers, random_state=0).fit(v_train)
print('Centers found by scikit-learn:')
print(kmeans.cluster_centers_)
pred_label = kmeans.predict(v_test)

  return_n_iter=True)


Centers found by scikit-learn:
[[ 0.00000000e+00  6.77506775e-04  6.77506775e-04 ...  1.21972744e-19
   1.21972744e-19  1.21972744e-19]
 [ 0.00000000e+00 -1.92445886e-17  1.41342756e-04 ...  1.41342756e-04
   1.41342756e-04  1.41342756e-04]]


In [89]:
print("some prediction : ",pred_label[0:10])
print("accuracy : ",1 - np.absolute(np.sum(pred_label - y_test)) / len(y_test))

some prediction :  [0 1 1 1 1 0 0 1 1 1]
accuracy :  0.6707721459024159


# Conclusion

My model build from scratch and sklearn model have same result