In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

In [2]:
from sklearn.datasets import fetch_20newsgroups
cats = ['comp.graphics', 'comp.os.ms-windows.misc']
newsgroups_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'), categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test',remove=('headers', 'footers', 'quotes'), categories=cats)

In [66]:
newsgroups_train = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

In [3]:
target_data = newsgroups_test.data

In [4]:
vectorizer = TfidfVectorizer(stop_words='english')
base_scaled = vectorizer.fit_transform(newsgroups_train.data)

In [5]:
target_scaled = vectorizer.transform(target_data)
target_scaled = np.asarray(target_scaled.todense())

In [6]:
target_scaled[0].size

41342

In [7]:
#learn number of clusters using silhouette_score

from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

num_clusters = 0
max_silhouette = -100

for n_clusters in range(2,10):

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters,init='k-means++',max_iter=1000)
    cluster_labels = clusterer.fit_predict(base_scaled)

    silhouette_avg = silhouette_score(base_scaled, cluster_labels)
    #print("For n_clusters =", n_clusters,"The average silhouette_scinit='k-means++',ore is :", silhouette_avg)
    
    if(silhouette_avg > max_silhouette):
        max_silhouette = silhouette_avg
        num_clusters = n_clusters

print("Optimum Number of Clusters: ",num_clusters)


KeyboardInterrupt: 

In [8]:
num_clusters = 2

In [9]:
#learn centers from source/base model
import numpy as np
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=num_clusters,init='k-means++')
kmeans.fit(base_scaled)

base_centroids = kmeans.cluster_centers_
#labels = kmeans.labels_
    
#print(kmeans.cluster_centers_)
print(kmeans.inertia_)

1109.641885059994


In [158]:
###spectral clustering
from sklearn.cluster import SpectralClustering
clustering = SpectralClustering(n_clusters=2,
         assign_labels="discretize",
         random_state=0,affinity='nearest_neighbors',n_neighbors=27).fit(target_scaled)

In [10]:
num_attr = base_centroids[0].size

In [11]:
base_scaled.shape

(1175, 41342)

In [12]:
def cal_centroids(alpha,beta):

    target_centroids = []
    for i in range(num_clusters):
        val = []
        for j in range(num_attr):
            val.append(alpha[j]*base_centroids[i][j] + beta[j])
        
        target_centroids.append(val)
             

    target_centroids = np.array(target_centroids)
    #print("new centroids: ", target_centroids)
    return target_centroids
    #print(target_centroids)

In [13]:
def kmeans_custom(k,dataItems,centroids,maxIter,num_attr):
    
    #print(k,dataItems,centroids,maxInter,num_attr)
    old_centroids = []
    groups = []
    
    for i in range(k):
        groups.append([])
        
    iter = 0
    while(iter < maxIter):
        old_centroids = centroids
        for item in dataItems:
            row = []
            for centroid in centroids:
                diff = 0
                for i in range(num_attr):
                    diff += abs(item[i] - centroid[i])
                    
                row.append(diff)
                
            idx = row.index(min(row))
            groups[idx].append(item)
                    
            
        iter += 1
        
    ss = 0    
    for i in range(k):
        for item in groups[i]:
            for j in range(num_attr):
            
                diff = abs(item[j]- old_centroids[i][j])
                ss += pow(diff,2)
            
        
    return (groups,ss/2)
        

In [14]:
def cal_gradient(clusters,reframed_centroids,old_alpha,old_beta):
    
    gradient_alpha = []
    gradient_beta = []
    gradient = 0
    
    for i in range(num_attr):
        gradient_alpha.append(0)
        gradient_beta.append(0)
        
    
    for i in range(num_clusters):
                
        for member in clusters[i]:
                      
            for j in range(num_attr):
                
                gradient =  (reframed_centroids[i][j] - member[j])
                gradient_alpha[j] = gradient_alpha[j] + gradient*base_centroids[i][j]
                gradient_beta[j] = gradient_beta[j] + gradient

    #print("******")
    #print(gradient_alpha,gradient_beta)
    
    new_alpha = []
    new_beta = []
    
    for i in range(num_attr):       
        new_alpha.append(old_alpha[i]-.1*gradient_alpha[i])
        new_beta.append(old_beta[i]-.1*gradient_beta[i])
        
    return [new_alpha,new_beta]

In [15]:
def learn_parameters(alpha,beta,target_avail):
    reframed_centroids =  cal_centroids(alpha,beta)
    km = kmeans_custom(num_clusters, target_avail, reframed_centroids, 1, num_attr)
    #kmeans.fit(target_scaled)
    best_error = round(km[1],5)
    #centroids = kmeans.cluster_centers_
    #labels = kmeans.labels_

    count = 0
    best_alpha = alpha
    best_beta = beta
    while(1):
        #print(centroids)
        #print("best error: ",best_error)

        #clusters = find_members(centroids,labels)

        #reframed_centroids = closest_centroids(reframed_centroids,centroids)

        new_alphabeta = cal_gradient(km[0],reframed_centroids,alpha,beta)

        alpha = new_alphabeta[0]
        beta = new_alphabeta[1]

        #print("new alpha beta", alpha, beta)

        reframed_centroids =  cal_centroids(alpha,beta)

        km = kmeans_custom(num_clusters, target_avail, reframed_centroids, 1, num_attr)
        #kmeans.fit(target_scaled)
        new_error = round(km[1],5)

        print("compare ",best_error,new_error)
        if(new_error < best_error):
            best_alpha = alpha
            best_beta = beta
            best_error = new_error
            count = 0

        elif(new_error == best_error):
            if(count<5):
                count += 1
                continue
            else:
                break;

        else:
            break; 

        #base_centroids = kmeans.cluster_centers_
        #labels = kmeans.labels_

        #print(kmeans.cluster_centers_)
        #print(old_error,new_error)

    #print("finalparameters", best_alpha,best_beta)
    
    target_cent = cal_centroids(best_alpha,best_beta)
    #km = kmeans_custom(num_clusters, target_scaled, target_cent, 1, num_attr)
    kmeans = KMeans(n_clusters=num_clusters, init=target_cent, max_iter=1)
    kmeans.fit(target_scaled)
    #print("Reframing results on ",kmeans.inertia_)
    return kmeans

In [33]:
def find_scores(class_labels, clustering_result):
    
    cluster_labels = clustering_result.labels_
    
    print("Reframing results on ",n," data: ")
    
    print("SSE: ", clustering_result.inertia_)
    
    conf_mat = confusion_matrix(class_labels, cluster_labels)
    m=0
    for i in range(num_clusters):
        m = m+np.max(conf_mat[i])
    
    print("purity: ", m/len(class_labels))  
    
    NMI = normalized_mutual_info_score(class_labels, cluster_labels)
    print("NMI: ",NMI)
    
    RI = accuracy_score(class_labels, cluster_labels)
    print("RI: ",RI)
    
    
    

In [36]:
elements = [30,50,70,90,120,140,160,180,200]
for n in elements:
    alpha = []
    beta = []
    avg_m = []
    avg_d = []
    sum_d = 0
    for i in range(num_attr):
        sum_m = 0
        for j in range(num_clusters):
            sum_m = sum_m + base_centroids[j][i]
        #print(sum_m)
        avg_m.append(sum_m/num_clusters)
    
    target_avail = vectorizer.transform(target_data[:n])
    target_avail = np.asarray(target_avail.todense())
    for i in range(num_attr):
        sum_d = 0
        for j in range(len(target_avail)):
            sum_d = sum_d + target_avail[j][i]
        #print(sum_d)
        avg_d.append(sum_d/len(target_avail))


    for i in range(num_attr):
        alpha.append(avg_d[i]/avg_m[i])
        beta.append(0)
    
    #print(len(alpha))
    #print(alpha,beta)
    #print("#####")
    result = learn_parameters(alpha,beta,target_avail)
    find_scores(newsgroups_test.target,result)
    #print("NMI: ",normalized_mutual_info_score(newsgroups_test.target, result.labels_))
    
    #print("Reframing results on ",n," data: ",result.inertia_)
    #print(result)



compare  12.80283 12.88368


  return_n_iter=True)


Reframing results on  30  data: 
SSE:  741.089706982
purity:  0.78288633461
NMI:  0.252018824908
RI:  0.78288633461
compare  22.51795 22.62286
Reframing results on  50  data: 
SSE:  740.650269441
purity:  0.761174968072
NMI:  0.250731263657
RI:  0.761174968072
compare  32.69472 43.57696
Reframing results on  70  data: 
SSE:  741.151283138
purity:  0.79054916986
NMI:  0.262857182661
RI:  0.79054916986
compare  42.04126 66.40617
Reframing results on  90  data: 
SSE:  740.786720794
purity:  0.789272030651
NMI:  0.277758740409
RI:  0.789272030651
compare  55.31959 106.60894
Reframing results on  120  data: 
SSE:  740.483569126
purity:  0.779054916986
NMI:  0.287250342007
RI:  0.779054916986
compare  65.14086 144.02664
Reframing results on  140  data: 
SSE:  740.403157874
purity:  0.777777777778
NMI:  0.29160363874
RI:  0.777777777778
compare  75.13099 202.1769
Reframing results on  160  data: 
SSE:  740.543104911
purity:  0.787994891443
NMI:  0.297655105559
RI:  0.787994891443
compare  84.

In [150]:
nj = np.zeros(2) ##for class
nl = np.zeros(2) ##for cluster

In [151]:
nj[1] = np.count_nonzero(newsgroups_test.target)
nj[0] = len(newsgroups_test.target) - nj[1]

nl[1] = np.count_nonzero(result.labels_)
nl[0] = len(result.labels_) - nl[1]

In [152]:
print(nj,nl)

[ 389.  394.] [ 378.  405.]


In [153]:
njl = np.zeros((2,2))

In [154]:
for i in range(len(newsgroups_test.target)):
    x = newsgroups_test.target[i]
    y = result.labels_[i]
    njl[x][y] = njl[x][y]+1
    
njl

array([[ 286.,  103.],
       [  92.,  302.]])

In [155]:
sum_xy = 0
sum_x = 0
sum_y = 0
flag = 0
N = len(newsgroups_test.target)
for x in range(2):
    
    sum_x = sum_x+ nj[x] * np.log10(nj[x]/N)
    
    for y in range(2):
        
        sum_xy = sum_xy+ njl[x][y] * np.log10((N*njl[x][y])/(nj[x]*nl[y]))
        
        if(flag==0):
            sum_y = sum_y+ nl[y] * np.log10(nl[y]/N)
            flag = 1
    flag = 0
               

In [156]:
sum_xy

44.861865144370839

In [157]:
NMI = sum_xy/np.sqrt(sum_x*sum_y)
NMI

0.18897661896689369