In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np

In [68]:
from sklearn.datasets import fetch_20newsgroups

# Only these two newsgroup categories are loaded.
cats = ['comp.graphics', 'comp.os.ms-windows.misc']

# Train and test splits are fetched with the same settings: headers, footers
# and quoted replies are removed from every post.
newsgroups_train = fetch_20newsgroups(
    categories=cats,
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_test = fetch_20newsgroups(
    categories=cats,
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)

In [66]:
# Small hand-written corpus (cat sentences and Google sentences) for quick
# experiments. It is bound to its own name: assigning it to `newsgroups_train`
# would replace the fetched dataset object with a plain list, and the later
# `newsgroups_train.data` access would then fail with AttributeError.
toy_documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

In [79]:
# Raw documents of the test split; they are vectorised and sliced further down.
target_data = newsgroups_test.data

In [80]:
# TF-IDF vectoriser with English stop words removed. It is fitted on the
# training documents, so `newsgroups_train` must expose a `.data` attribute;
# `base_scaled` is the resulting document-term matrix for those documents.
vectorizer = TfidfVectorizer(stop_words='english')
base_scaled = vectorizer.fit_transform(newsgroups_train.data)

In [81]:
# Project the target documents onto the already-fitted vocabulary and
# materialise the result directly as a dense 2-D ndarray.
target_scaled = vectorizer.transform(target_data).toarray()

In [82]:
# Number of entries in one dense row, i.e. the width of the feature space.
target_scaled[0].size

41342

In [83]:
# Learn the number of clusters: try k = 2..9 and keep the k with the highest
# mean silhouette score on the training matrix.

from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

num_clusters = 0
max_silhouette = -100

for n_clusters in range(2,10):

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility. (The seed has to be passed explicitly;
    # without random_state every run can select a different k.)
    clusterer = KMeans(n_clusters=n_clusters,init='k-means++',max_iter=1000,random_state=10)
    cluster_labels = clusterer.fit_predict(base_scaled)

    # Mean silhouette coefficient over all samples for this k.
    silhouette_avg = silhouette_score(base_scaled, cluster_labels)
    #print("For n_clusters =", n_clusters,"The average silhouette_score is :", silhouette_avg)

    # Strict '>' keeps the smallest k when several values tie for the best score.
    if(silhouette_avg > max_silhouette):
        max_silhouette = silhouette_avg
        num_clusters = n_clusters

print("Optimum Number of Clusters: ",num_clusters)


Optimum Number of Clusters:  3


In [86]:
# Learn the cluster centres of the source/base model.
import numpy as np
from sklearn.cluster import KMeans

# k-means++ seeding is random; fixing random_state (10, the seed named in the
# silhouette-search cell's comment) makes the centroids, and every result
# derived from them, repeatable after a kernel restart.
kmeans = KMeans(n_clusters=num_clusters,init='k-means++',max_iter=1000,random_state=10)
kmeans.fit(base_scaled)

# One row per cluster, one column per TF-IDF feature.
base_centroids = kmeans.cluster_centers_

# Within-cluster sum of squared distances of the fitted base model.
print(kmeans.inertia_)

1105.5843047778483


In [87]:
# Feature dimensionality, read off the length of the first base centroid.
num_attr = base_centroids[0].size

In [88]:
# Display the (rows, columns) shape of the matrix the base model was fitted on.
base_scaled.shape

(1175, 41342)

In [89]:
def cal_centroids(alpha,beta):
    """Reframe the base centroids with a per-feature affine map.

    Entry ``[i][j]`` of the result is
    ``alpha[j] * base_centroids[i][j] + beta[j]``.

    Reads the module-level names ``base_centroids``, ``num_clusters`` and
    ``num_attr``; only the first ``num_clusters`` rows and the first
    ``num_attr`` features are used.

    Parameters
    ----------
    alpha : sequence of numbers
        Per-feature scale factors (at least ``num_attr`` entries).
    beta : sequence of numbers
        Per-feature offsets (at least ``num_attr`` entries).

    Returns
    -------
    numpy.ndarray
        New array of shape ``(num_clusters, num_attr)``; ``base_centroids``
        itself is not modified.
    """
    base = np.asarray(base_centroids)[:num_clusters, :num_attr]
    scale = np.asarray(alpha)[:num_attr]
    shift = np.asarray(beta)[:num_attr]
    # Broadcasting applies the same scale/shift vector to every centroid row,
    # replacing the per-element Python double loop.
    return base * scale + shift

In [90]:
def kmeans_custom(k,dataItems,centroids,maxIter,num_attr):
    """Assign items to fixed centroids and report the resulting squared error.

    Each item goes to the centroid with the smallest sum of absolute
    per-feature differences (the first such centroid on ties). The centroids
    are never recomputed, so every pass yields the same assignment; passes
    after the first are redundant.

    Parameters
    ----------
    k : int
        Number of groups to return; ``centroids`` must not hold more than
        ``k`` rows, otherwise the group index can go out of range.
    dataItems : iterable of indexable rows
        Items to assign; each needs at least ``num_attr`` numeric entries.
    centroids : sequence of indexable rows
        Centroids the items are compared against.
    maxIter : number
        Number of assignment passes. With ``0`` no item is assigned and the
        error is 0.
    num_attr : int
        Number of leading features used for distances and for the error.

    Returns
    -------
    tuple
        ``(groups, error)``: ``groups[i]`` lists the items assigned to
        centroid ``i`` and ``error`` is half the sum of squared per-feature
        differences between each item and its centroid.
    """
    groups = [[] for _ in range(k)]
    assigned_centroids = []

    passes_done = 0
    while passes_done < maxIter:
        assigned_centroids = centroids
        # Rebuild the groups on every pass. Appending to the previous pass's
        # groups would list each item once per pass and inflate the error.
        groups = [[] for _ in range(k)]
        for item in dataItems:
            distances = []
            for centroid in centroids:
                dist = 0
                for j in range(num_attr):
                    dist += abs(item[j] - centroid[j])
                distances.append(dist)
            # list.index returns the first minimum, so ties go to the
            # lowest-numbered centroid.
            groups[distances.index(min(distances))].append(item)
        passes_done += 1

    # NOTE(review): assignment uses absolute differences while the error uses
    # squared differences; confirm the mix of metrics is intentional.
    ss = 0
    for i in range(k):
        for item in groups[i]:
            for j in range(num_attr):
                ss += pow(abs(item[j] - assigned_centroids[i][j]), 2)

    return (groups, ss/2)
        

In [94]:
def cal_gradient(clusters,reframed_centroids,old_alpha,old_beta,learning_rate=.01):
    """Take one gradient-descent step on the per-feature scale and offset.

    For every member of every cluster the residual
    ``reframed_centroids[i] - member`` is accumulated per feature: weighted by
    ``base_centroids[i]`` for the alpha gradient, unweighted for the beta
    gradient. The new parameters are ``old - learning_rate * gradient``.

    Reads the module-level names ``num_clusters``, ``num_attr`` and
    ``base_centroids``.

    Parameters
    ----------
    clusters : sequence of sequences of rows
        ``clusters[i]`` holds the items currently assigned to centroid ``i``.
    reframed_centroids : sequence of rows
        Centroids the members are compared against.
    old_alpha, old_beta : sequences of numbers
        Current per-feature scale and offset (at least ``num_attr`` entries).
    learning_rate : float, optional
        Step size; the default ``.01`` is the value that used to be hard-coded.

    Returns
    -------
    list
        ``[new_alpha, new_beta]``, each a list of ``num_attr`` values.
    """
    gradient_alpha = np.zeros(num_attr)
    gradient_beta = np.zeros(num_attr)

    for i in range(num_clusters):
        base_row = np.asarray(base_centroids[i])[:num_attr]
        centroid_row = np.asarray(reframed_centroids[i])[:num_attr]
        # Members are still accumulated one at a time, in the same order as
        # the scalar loops; only the per-feature loop is vectorised.
        for member in clusters[i]:
            residual = centroid_row - np.asarray(member)[:num_attr]
            gradient_alpha += residual * base_row
            gradient_beta += residual

    new_alpha = np.asarray(old_alpha)[:num_attr] - learning_rate * gradient_alpha
    new_beta = np.asarray(old_beta)[:num_attr] - learning_rate * gradient_beta

    # Callers receive plain lists, as before.
    return [list(new_alpha), list(new_beta)]

In [95]:
def learn_parameters(alpha,beta,target_avail):
    """Tune the per-feature reframing parameters and score them on the target set.

    Starting from ``alpha``/``beta``, the loop repeatedly
      1. reframes the base centroids (``cal_centroids``),
      2. assigns ``target_avail`` to them in a single pass (``kmeans_custom``),
      3. takes one gradient step (``cal_gradient``),
    and keeps the parameters with the lowest error, rounded to 5 decimals.
    It stops as soon as the error gets worse, or once an unchanged error has
    been seen more than five times since the last improvement.

    Reads the module-level names ``num_clusters``, ``num_attr`` and
    ``target_scaled``.

    Parameters
    ----------
    alpha, beta : sequences of numbers
        Initial per-feature scale and offset.
    target_avail : sequence of rows
        Target items used while tuning the parameters.

    Returns
    -------
    float
        ``inertia_`` of a scikit-learn ``KMeans`` fitted on ``target_scaled``
        (not on ``target_avail``) for one iteration, initialised at the best
        reframed centroids.
    """
    reframed_centroids = cal_centroids(alpha,beta)
    clusters, error = kmeans_custom(num_clusters, target_avail, reframed_centroids, 1, num_attr)
    best_error = round(error,5)

    count = 0
    best_alpha = alpha
    best_beta = beta
    while True:
        # Gradient step from the current assignment, then re-assign against
        # the freshly reframed centroids.
        alpha, beta = cal_gradient(clusters,reframed_centroids,alpha,beta)
        reframed_centroids = cal_centroids(alpha,beta)
        clusters, error = kmeans_custom(num_clusters, target_avail, reframed_centroids, 1, num_attr)
        new_error = round(error,5)

        print("compare ",best_error,new_error)
        if new_error < best_error:
            best_alpha = alpha
            best_beta = beta
            best_error = new_error
            count = 0
        elif new_error == best_error and count < 5:
            # Plateau: tolerate a few unchanged steps before giving up.
            count += 1
        else:
            # Error increased, or the plateau lasted too long.
            break

    target_cent = cal_centroids(best_alpha,best_beta)
    # n_init=1: explicit initial centres allow only one initialisation anyway.
    # Stating it avoids the RuntimeWarning scikit-learn raises when n_init is
    # left at a larger default.
    kmeans = KMeans(n_clusters=num_clusters, init=target_cent, n_init=1, max_iter=1)
    kmeans.fit(target_scaled)
    return kmeans.inertia_

In [96]:
# Sizes of the target subset that is treated as available for adaptation.
elements = [10]

# Per-feature mean of the base centroids. It does not depend on n, so it is
# computed once instead of inside the loop.
avg_m = np.asarray(base_centroids)[:num_clusters, :num_attr].sum(axis=0) / num_clusters

for n in elements:
    # Dense TF-IDF rows of the first n target documents.
    target_avail = np.asarray(vectorizer.transform(target_data[:n]).todense())

    # Per-feature mean of the available target rows.
    avg_d = target_avail[:, :num_attr].sum(axis=0) / len(target_avail)

    # Initial guess: scale each feature by the ratio of target mean to base
    # centroid mean, with no offset.
    alpha = list(avg_d / avg_m)
    beta = [0] * num_attr

    print(len(alpha))
    result = learn_parameters(alpha,beta,target_avail)
    print("Reframing results on ",n," data: ",result)



41342
compare  3.97192 3.95445
compare  3.95445 3.94031
compare  3.94031 3.92885
compare  3.92885 3.91957
compare  3.91957 3.98671


  return_n_iter=True)


Reframing results on  10  data:  740.405279352
