In [0]:
import numpy as np
import random
from collections import defaultdict
import sys
import time


class Point:
    """One labelled document stored as a sparse tf-idf vector of unit L2 norm.

    Attributes:
        label:  the label passed by the caller (stored unchanged).
        doc_id: the document identifier passed by the caller (stored unchanged).
        tfidf:  the mapping ``feature index -> weight`` given to the
                constructor, rescaled so that its Euclidean norm is 1.
    """

    def __init__(self, label, doc_id, tfidf):
        """Store the metadata and L2-normalise ``tfidf``.

        Args:
            label:  label of the document.
            doc_id: identifier of the document.
            tfidf:  mutable mapping ``feature index -> weight``.  It is
                    rescaled IN PLACE and the same object is kept as
                    ``self.tfidf`` (no copy is made).

        A vector whose norm is 0 (empty mapping or all-zero weights) is left
        untouched instead of raising ``ZeroDivisionError``.
        """
        self.label = label
        self.doc_id = doc_id

        # Euclidean norm: square root of the sum of squared weights.
        squared_sum = 0.0
        for weight in tfidf.values():
            squared_sum += weight**2
        norm = squared_sum**0.5

        # Dividing by a zero norm is undefined; a zero vector stays a zero vector.
        if norm > 0:
            for index in tfidf:
                tfidf[index] = tfidf[index]/norm

        self.tfidf = tfidf


class Cluster:
    """A single cluster: its centroid and the points currently assigned to it."""

    def __init__(self):
        """Start with no centroid, no member points and a zero squared-norm slot."""
        self.set_centroid(None)
        self.reset_points()
        self.centroid_l2_square = 0.0

    def set_centroid(self, new_centroid):
        """Replace the stored centroid with ``new_centroid``."""
        self.centroid = new_centroid

    def add_point(self, point):
        """Append ``point`` to the member list."""
        self.points.append(point)

    def reset_points(self):
        """Drop all members by binding ``points`` to a brand-new empty list."""
        self.points = list()

def Load_data(pathin):
    """Read a tf-idf file and return its documents as a list of ``Point``.

    Each non-blank line must have the form::

        <label><<>><doc_id><<>><index>:<weight> <index>:<weight> ...

    Args:
        pathin: path of the text file to read.

    Returns:
        A list with one ``Point`` per non-blank line, in file order.  Labels
        and document ids are kept as the strings found in the file.

    Blank (empty or whitespace-only) lines are skipped; previously they made
    the field lookup fail with ``IndexError``.
    """
    def tfidf(doc):
        # Parse "index:weight" tokens separated by whitespace into a sparse mapping.
        tf_idf = defaultdict(int)
        for token in doc.split():
            parts = token.split(':')
            tf_idf[int(parts[0])] = float(parts[1])
        return tf_idf

    with open(pathin, 'r') as f:
        d_lines = f.read().splitlines()

    data = []
    for d in d_lines:
        if not d.strip():
            # Tolerate blank lines (e.g. extra newlines at the end of the file).
            continue
        fea = d.split('<<>>')
        label = fea[0]
        doc_id = fea[1]
        data.append(Point(label, doc_id, tfidf(fea[2])))
    return data

# Load the training and test sets as lists of Point objects.
# NOTE(review): the absolute paths look like a Google Drive mount inside Colab;
# they must exist before this cell is run — confirm for other environments.
X_train = Load_data("/content/drive/My Drive/Project2/Datasets/train_tfidf_df=3")
X_test = Load_data("/content/drive/My Drive/Project2/Datasets/test_tfidf")



In [0]:

class Kmeans:
    """K-means clustering of sparse tf-idf points using dense centroids.

    Points are objects exposing ``tfidf`` (mapping ``feature index -> weight``)
    and ``label``; clusters are ``Cluster`` objects defined elsewhere in this
    notebook.

    Attributes:
        list_clusters:  clusters (centroid + members) of the last completed
                        iteration.
        new_clusters:   clusters built by the most recent ``asign_cluster``.
        k_cluster:      number of clusters requested in ``fit``.
        n_doc:          number of points passed to ``fit``.
        it:             0-based index of the last iteration executed by ``fit``.
        dim:            length of the dense centroid vectors; every feature
                        index of every point must be smaller than this.
        max_simirality: kept for compatibility; not used by this class.
    """

    def __init__(self, dim=20167):
        """Create an unfitted model.

        Args:
            dim: dimensionality of the centroid vectors.  Defaults to 20167,
                 the value that was previously hard-coded.
        """
        self.list_clusters = []
        self.k_cluster = 0
        self.n_doc = 0
        self.new_clusters = []
        self.max_simirality = 0.0
        self.it = 0
        self.dim = dim

    def dist_between_x_y(self, x, y):
        """Return ``2 - 2 * dot(x, y)`` for a sparse point and a dense vector.

        Args:
            x: a point (only ``x.tfidf`` is read).
            y: a dense vector indexable by feature index.

        The value equals the squared Euclidean distance only when both ``x``
        and ``y`` have unit length.  Centroids produced by ``update_centroid``
        are plain means and are not re-normalised, so in general this orders
        centroids by decreasing dot product with ``x``.
        """
        dot = 0.0
        # Only the non-zero coordinates of x contribute to the dot product.
        for pos, weight in x.tfidf.items():
            dot += weight*y[pos]
        return (2-2*dot)

    def _new_cluster_from_point(self, point):
        """Build a Cluster whose centroid is the dense copy of ``point.tfidf``."""
        centroid = np.zeros(self.dim)
        for pos, weight in point.tfidf.items():
            centroid[pos] = weight
        cluster = Cluster()
        cluster.set_centroid(centroid)
        return cluster

    def init_centroid(self, X):
        """Append ``k_cluster`` clusters seeded from distinct random points of X."""
        for i in random.sample(range(self.n_doc), self.k_cluster):
            self.list_clusters.append(self._new_cluster_from_point(X[i]))

    def init_centroid_v1(self, X):
        """Append ``k_cluster`` clusters chosen by farthest-first traversal.

        The first centroid is a uniformly random point.  Each further centroid
        is the point whose smallest distance to the centroids chosen so far is
        the largest.
        """
        # Pick the first centroid.  randrange covers exactly 0 .. n_doc-1;
        # randint(1, n_doc) could return n_doc (out of range) and never 0.
        self.list_clusters.append(
            self._new_cluster_from_point(X[random.randrange(self.n_doc)]))

        # Smallest distance from each point to any centroid chosen so far.
        dist_min_between_x_centroid = np.full(self.n_doc, float(sys.maxsize))

        # Choose the remaining centroids as far as possible from the existing ones.
        for _ in range(self.k_cluster-1):
            # Only the newest centroid can lower a point's minimum distance.
            newest = self.list_clusters[-1].centroid
            for i in range(self.n_doc):
                d = self.dist_between_x_y(X[i], newest)
                dist_min_between_x_centroid[i] = min(dist_min_between_x_centroid[i], d)
            idx_new_centroid = int(np.argmax(dist_min_between_x_centroid))
            self.list_clusters.append(self._new_cluster_from_point(X[idx_new_centroid]))

    def update_centroid(self):
        """Set the centroid of every cluster in ``new_clusters``.

        A non-empty cluster gets the mean of its members.  An empty cluster
        keeps the centroid of the cluster at the same position in
        ``list_clusters`` (or a zero vector if there is none); dividing by a
        member count of 0 would otherwise turn the centroid into NaN.
        """
        for idx, clr in enumerate(self.new_clusters):
            if not clr.points:
                previous = None
                if idx < len(self.list_clusters):
                    previous = self.list_clusters[idx].centroid
                if previous is None:
                    clr.set_centroid(np.zeros(self.dim))
                else:
                    # Copy so the old and new cluster never share one array.
                    clr.set_centroid(np.array(previous, dtype=float))
                continue

            new_centroid = np.zeros(self.dim)
            for pnt in clr.points:
                for pos, weight in pnt.tfidf.items():
                    new_centroid[pos] += weight
            clr.set_centroid(new_centroid/len(clr.points))

    def asign_cluster(self, X):
        """Rebuild ``new_clusters`` by assigning each point of X to a centroid.

        Each point goes to the cluster of ``list_clusters`` (first
        ``k_cluster`` entries) with the smallest ``dist_between_x_y``; ties
        keep the lowest index.  The new clusters have members but no centroid.
        """
        self.new_clusters.clear()
        for _ in range(self.k_cluster):
            self.new_clusters.append(Cluster())

        # Hoist the centroid lookups out of the per-point loop.
        centroids = [self.list_clusters[i].centroid for i in range(self.k_cluster)]

        for x in X:
            dis_min = sys.maxsize
            asign_clr = -1
            for i, centroid in enumerate(centroids):
                dis = self.dist_between_x_y(x, centroid)
                if dis < dis_min:
                    asign_clr = i
                    dis_min = dis
            self.new_clusters[asign_clr].add_point(x)

    def check_stop(self, label_criteria):
        """Return True when fewer than ``label_criteria`` points changed cluster.

        A point counts as unchanged when the same object is a member of the
        cluster at the same position in both ``list_clusters`` and
        ``new_clusters``.
        """
        count_labels_change = self.n_doc
        for i in range(self.k_cluster):
            # Identity set gives O(1) membership instead of scanning a list.
            kept = {id(pnt) for pnt in self.new_clusters[i].points}
            count_labels_change -= sum(
                1 for pnt in self.list_clusters[i].points if id(pnt) in kept)
        return count_labels_change < label_criteria

    def fit(self, X, k_cluster, max_iter=100, label_criteria=10):
        """Cluster X into ``k_cluster`` groups.

        Args:
            X:              sequence of points.
            k_cluster:      number of clusters (>= 1).
            max_iter:       maximum number of assign/update rounds
                            (default 100, the former hard-coded value).
            label_criteria: stop once fewer than this many points change
                            cluster in a round (default 10, the former
                            hard-coded value).

        Raises:
            ValueError: if X is empty or ``k_cluster`` is smaller than 1.

        After the call ``list_clusters`` holds the final clusters and ``it``
        the 0-based index of the last round executed, whether or not the stop
        criterion was met.
        """
        if k_cluster < 1:
            raise ValueError("k_cluster must be at least 1")
        if len(X) == 0:
            raise ValueError("X must contain at least one point")

        self.n_doc = len(X)
        self.k_cluster = k_cluster
        # Start from a clean state so that calling fit again does not keep
        # the centroids of a previous run in front of the new ones.
        self.list_clusters = []
        self.it = 0
        self.init_centroid_v1(X)

        for i in range(max_iter):
            self.it = i
            self.asign_cluster(X)
            self.update_centroid()
            # Compare against the previous round before promoting the new one.
            converged = self.check_stop(label_criteria)
            self.list_clusters = list(self.new_clusters)
            if converged:
                break

    def predict(self, X_test):
        """Assign X_test to the fitted centroids and score the clusters.

        For every cluster the most frequent ``int(label)`` among its members
        is counted as correct.

        Returns:
            The string ``"<correct>/<len(X_test)>"``.

        Side effect: ``new_clusters`` is replaced by the assignment of X_test.
        Labels are tallied in a dict, so their values need not be smaller
        than ``k_cluster``.
        """
        self.asign_cluster(X_test)
        n_test = len(X_test)
        count_predict_true = 0
        for clr in self.new_clusters:
            votes = {}
            for x in clr.points:
                lbl = int(x.label)
                votes[lbl] = votes.get(lbl, 0) + 1
            # An empty cluster contributes nothing.
            if votes:
                count_predict_true += max(votes.values())
        return str(count_predict_true)+'/'+str(n_test)


In [3]:
# Run the experiment 10 times.  Every run fits a fresh model, and the first
# centroid is drawn at random inside fit, so the numbers differ between runs.
for i in range(10):
  t=time.time()
  model= Kmeans()
  # Cluster the training set into 20 clusters.
  model.fit(X_train,20)
  # Wall-clock training time in seconds and the iteration index stored by fit.
  print("time train =",time.time()-t,'; iter =',model.it,end='; ')
  # predict returns "<points matching their cluster's majority label>/<total>".
  ans_train=model.predict(X_train)
  print('pre train =', ans_train,end=';')
  ans_test=model.predict(X_test)
  print('pre test= ',ans_test)

time train = 271.4844000339508 ; iter = 24; pre train = 4958/11314;pre test=  3233/7532
time train = 192.11541604995728 ; iter = 17; pre train = 6005/11314;pre test=  3935/7532
time train = 225.0636281967163 ; iter = 20; pre train = 5527/11314;pre test=  3575/7532
time train = 265.3886787891388 ; iter = 24; pre train = 5132/11314;pre test=  3297/7532
time train = 224.64220190048218 ; iter = 20; pre train = 5856/11314;pre test=  3849/7532
time train = 186.50940895080566 ; iter = 16; pre train = 5098/11314;pre test=  3327/7532
time train = 405.68641996383667 ; iter = 37; pre train = 6489/11314;pre test=  4199/7532
time train = 300.1251771450043 ; iter = 27; pre train = 5661/11314;pre test=  3911/7532
time train = 278.77742314338684 ; iter = 25; pre train = 5843/11314;pre test=  3735/7532
time train = 245.94400119781494 ; iter = 22; pre train = 6026/11314;pre test=  3890/7532
