In [21]:
#function to write in csv
import csv
def write_list_in_file(final, name):
    with open(name, "w", newline="",encoding="utf8") as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerows(final)


In [22]:
#Function to read csv files
from csv import reader
# Load a CSV file\n",
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset


In [23]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

#load base data
base_data = load_csv('season1.csv')    
base_data = np.array(base_data[1:])
base_size = len(base_data)

#no of target elements and load target data
elements = [10,20,30,40,50]
target_data = load_csv('season2.csv')
target_data = np.array(target_data[1:])

#normalize data using a min max normalizer
mixed = np.concatenate((base_data,target_data))
scaler = MinMaxScaler()
mixed_scaled = scaler.fit_transform(mixed)

#normalized base and target data
base_scaled = mixed_scaled[:base_size]
target_scaled = mixed_scaled[base_size:]




In [46]:
# number of attributes
num_attr = mixed_scaled[0].size

In [47]:
#learn number of clusters using silhouette_score

from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

num_clusters = 0
max_silhouette = -1

for n_clusters in range(2,10):

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(base_scaled)

    silhouette_avg = silhouette_score(base_scaled, cluster_labels)
    #print("For n_clusters =", n_clusters,"The average silhouette_score is :", silhouette_avg)
    
    if(silhouette_avg > max_silhouette):
        max_silhouette = silhouette_avg
        num_clusters = n_clusters

print("Optimum Number of Clusters: ",num_clusters)


Optimum Number of Clusters:  7


In [48]:
#learn centers from source/base model
import numpy as np
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(base_scaled)

base_centroids = kmeans.cluster_centers_
#labels = kmeans.labels_
    
#print(kmeans.cluster_centers_)
#print(kmeans.inertia_)

In [52]:
#base model results
kmeans = KMeans(n_clusters=num_clusters, init=base_centroids, max_iter=1)
kmeans.fit(target_scaled)
print("applying base model in target result: ", kmeans.inertia_)

applying base model in target result:  7.99637305263


  return_n_iter=True)


In [53]:
#retraining results


for n in elements:

    #Read Available Limited Target Data
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(target_scaled[:n])
    
    #Result on whole Target Data
    kmeans = KMeans(n_clusters=num_clusters, init=kmeans.cluster_centers_, max_iter=1)
    kmeans.fit(target_scaled)

    print("Retraining results on ", n," data ",kmeans.inertia_)

Retraining results on  10  data  9.63067393178
Retraining results on  20  data  9.25081005174
Retraining results on  30  data  6.45792655246
Retraining results on  40  data  6.00573365415
Retraining results on  50  data  5.92548512222


  return_n_iter=True)


In [94]:
def cal_centroids(alpha,beta):

    target_centroids = []
    for i in range(num_clusters):
        val = []
        for j in range(num_attr):
            val.append(alpha[j]*base_centroids[i][j] + beta[j])
        
        target_centroids.append(val)
             

    target_centroids = np.array(target_centroids)
    print("new centroids: ", target_centroids)
    return target_centroids
    #print(target_centroids)

In [95]:
def kmeans(k,dataItems,centroids,maxIter,num_attr):
    
    #print(k,dataItems,centroids,maxInter,num_attr)
    old_centroids = []
    groups = []
    
    for i in range(k):
        groups.append([])
        
    iter = 0
    while(iter < maxIter):
        old_centroids = centroids
        for item in dataItems:
            row = []
            for centroid in centroids:
                diff = 0
                for i in range(num_attr):
                    diff += abs(item[i] - centroid[i])
                    
                row.append(diff)
                
            idx = row.index(min(row))
            groups[idx].append(item)
                    
            
        iter += 1
        
    ss = 0    
    for i in range(k):
        for item in groups[i]:
            for j in range(num_attr):
            
                diff = abs(item[j]- old_centroids[i][j])
                ss += pow(diff,2)
            
        
    return (groups,ss)
        

In [96]:
def cal_gradient(clusters,reframed_centroids,old_alpha,old_beta):
    
    gradient_alpha = []
    gradient_beta = []
    gradient = 0
    
    for i in range(num_attr):
        gradient_alpha.append(0)
        gradient_beta.append(0)
        
    
    for i in range(num_clusters):
                
        for member in clusters[i]:
                      
            for j in range(num_attr):
                
                gradient =  (reframed_centroids[i][j] - member[j])
                gradient_alpha[j] = gradient_alpha[j] + gradient*reframed_centroids[i][j]
                gradient_beta[j] = gradient_beta[j] + gradient

    print("******")
    #print(gradient_alpha,gradient_beta)
    
    new_alpha = []
    new_beta = []
    
    for i in range(num_attr):       
        new_alpha.append(old_alpha[i]-.01*gradient_alpha[i])
        new_beta.append(old_beta[i]-.01*gradient_beta[i])
        
    return [new_alpha,new_beta]

In [97]:
alpha = []
beta = []
avg_m = []
avg_d = []
sum_d = 0
for i in range(num_attr):
    sum_m = 0
    for j in range(num_clusters):
        sum_m = sum_m + base_centroids[j][i]
        
    avg_m.append(sum_m/num_clusters)
    

for i in range(num_attr):
    sum_d = 0
    for j in range(len(target_scaled[:10])):
        sum_d = sum_d + target_scaled[j][i]
        
    avg_d.append(sum_d/len(target_scaled[:10]))
    

for i in range(num_attr):
    alpha.append(avg_m[i]/avg_d[i])
    beta.append(0)

print(alpha,beta)
        


[0.99130737776314515, 1.0092333021369835, 1.0133235003375738, 1.084878812312928] [0, 0, 0, 0]


In [98]:
reframed_centroids =  cal_centroids(alpha,beta)
km = kmeans(num_clusters, target_scaled, reframed_centroids, 1, num_attr)
#kmeans.fit(target_scaled)
best_error = km[1]
#centroids = kmeans.cluster_centers_
#labels = kmeans.labels_

count = 0
best_alpha = alpha
best_beta = beta
while(1):
    #print(centroids)
    print("best error: ",best_error)
    
    #clusters = find_members(centroids,labels)
    
    #reframed_centroids = closest_centroids(reframed_centroids,centroids)
    
    new_alphabeta = cal_gradient(km[0],reframed_centroids,alpha,beta)
    
    alpha = new_alphabeta[0]
    beta = new_alphabeta[1]
    
    print("new alpha beta", alpha, beta)
    
    reframed_centroids =  cal_centroids(alpha,beta)
    
    km = kmeans(num_clusters, target_scaled, reframed_centroids, 1, num_attr)
    #kmeans.fit(target_scaled)
    new_error = km[1]
    
    print("compare ",best_error,new_error)
    if(new_error < best_error):
        best_alpha = alpha
        best_beta = beta
        best_error = new_error
        count = 0
        
    elif(new_error == best_error):
        if(count<5):
            count += 1
            continue
        else:
            break;
                
    else:
        break; 

    #base_centroids = kmeans.cluster_centers_
    #labels = kmeans.labels_

    #print(kmeans.cluster_centers_)
    #print(old_error,new_error)
    
print("finalparameters", best_alpha,best_beta)

new centroids:  [[ 0.16355839  0.16618691  0.51973753  0.33176255]
 [ 0.25062838  0.27185965  0.78294091  0.19427337]
 [ 0.22420695  0.1920063   0.53833174  0.69046429]
 [ 0.37941289  0.39117382  0.82384862  0.48504881]
 [ 0.56073282  0.58725431  0.67499621  0.30453968]
 [ 0.33858762  0.35073781  0.54491268  0.36139946]
 [ 0.43577614  0.43716693  0.40052711  0.79777621]]
best error:  16.5600784138
******
new alpha beta [1.1192940321919618, 1.1218834262491943, 1.0152502682233744, 1.0716244921632381] [0.23461581771456513, 0.19731735862399774, 0.0031676513432944511, -0.012145656443268909]
new centroids:  [[ 0.41929106  0.38205397  0.52389342  0.31556365]
 [ 0.51760257  0.49952185  0.78759727  0.17975421]
 [ 0.48776989  0.41075531  0.54252299  0.66988301]
 [ 0.66301431  0.63215381  0.82858276  0.46697715]
 [ 0.86774426  0.85012071  0.67944732  0.28867336]
 [ 0.61691813  0.58720435  0.54911645  0.34483846]
 [ 0.72665456  0.68328064  0.40445634  0.77588386]]
compare  16.5600784138 8.21798731

In [99]:
target_cent = cal_centroids(best_alpha,best_beta)

new centroids:  [[ 0.3915733   0.42088934  0.55259603  0.23920662]
 [ 0.48828151  0.54004504  0.81931665  0.10811937]
 [ 0.45893535  0.45000307  0.57143872  0.58120607]
 [ 0.63132184  0.6745827   0.86077101  0.38535538]
 [ 0.832713    0.89568141  0.70992946  0.21325133]
 [ 0.5859774   0.62898739  0.57810761  0.26746354]
 [ 0.69392422  0.72644414  0.43179259  0.68352121]]


In [100]:
kmeans = KMeans(n_clusters=num_clusters, init=target_cent, max_iter=1)
kmeans.fit(target_scaled)
print(kmeans.inertia_)

5.95162535738


  return_n_iter=True)
