In [89]:
import pandas as pd

# preprocessing
df = pd.read_csv("features_training.csv",  names= ["nouncount","adjectivecount","pronouncount","adverbcount","pos_words","neg_words","y"], nrows=100)

print(df)

accuracies = [[],[],[]]
loops = []

    nouncount  adjectivecount  pronouncount  adverbcount  pos_words  \
0    0.105139        0.098198      0.041470     0.030055   2.044192   
1    0.222596        0.028947      0.044684     0.000000   0.376309   
2    0.194861        0.048089      0.019363     0.000000   0.000000   
3    0.138421        0.078070      0.036644     0.030747   0.413919   
4    0.090290        0.095891      0.026909     0.040074   1.442656   
..        ...             ...           ...          ...        ...   
95   0.322793        0.100079      0.036494     0.000000   0.721328   
96   0.342374        0.167080      0.000000     0.000000   0.000000   
97   0.085392        0.102310      0.047823     0.056139   0.376309   
98   0.124915        0.164216      0.022342     0.128872   0.000000   
99   0.207336        0.000000      0.034049     0.049130   0.540090   

    neg_words  y  
0    0.000000  4  
1    0.000000  4  
2    0.630709  4  
3    0.811947  4  
4    0.000000  4  
..        ... ..  
95   0.601536 

In [168]:
# K-means model

K = 3 #(0 = negative, 2 = neutral, 4 = positive)
labels = [0, 2, 4]

#select random sample as initial centroids
import random


Centroids = []
for i in range(K):
    sublist =  df[df["y"] == labels[i]]
    sample = sublist.iloc[random.randint(0, len(sublist)-1),:]
    Centroids.append(sample.values.tolist())
    print(Centroids[i])

[0.12393758523356815, 0.06431144172808746, 0.16439545367644132, 0.020884972753633355, 0.0, 2.3452217537777056, 0.0]
[0.5311956274685616, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0]
[0.19831281388600286, 0.0, 0.0, 0.060756284374206125, 0.0, 0.0, 4.0]


In [169]:
nloop = 0
while True:
    cluster = [[] for i in range(K)]
    new_center = [[0 for i in range(len(df.columns)-1)] for j in range(K)]

    for index, row in df.iterrows():
        min_ = float('inf')
        min_i = 0

        d = [0 for i in range(K)]
        for c in range(K):
            dist = 0 #eulerian distance
            for i in range(len(df.columns)-1):
                dist += (row[i] - Centroids[c][i])*(row[i] - Centroids[c][i])
            d[c] = dist
        
        #find the closest centroid
        min_ = min(d)
        min_i = d.index(min_)
        cluster[min_i].append(index)
        new_center[min_i] += row[:-1]

    #calculate new centroids
    for k in range(K):
        new_center[k] = [n / len(cluster[k]) for n in new_center[k]]
        
    #if converged, quit loop
    if sum([Centroids[i][j] - new_center[i][j] for i in range(K) for j in range(len(Centroids)-1)]) < 0.00000001:
        break
    Centroids = new_center
    nloop+=1

loops.append(nloop)
print("Cluster with row indices: ",cluster)
print("Centroids: ", new_center)
print("Converged in ", nloop," loops")

Cluster with row indices:  [[14, 16, 24, 35, 36, 39, 40, 43, 92, 94, 97], [8, 19, 28, 48, 50, 55, 57, 67, 71, 76, 80, 83], [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15, 17, 18, 20, 21, 22, 23, 25, 26, 27, 29, 30, 31, 32, 33, 34, 37, 38, 41, 42, 44, 45, 46, 47, 49, 51, 52, 53, 54, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 72, 73, 74, 75, 77, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 93, 95, 96, 98, 99]]
Centroids:  [[0.17500769047277276, 0.08189930244685999, 0.03551757373315354, 0.03125581987198917, 0.6338128371531165, 1.963045749373073], [0.5120990146753578, 0.002677178873287228, 0.0, 0.0031329999999020613, 0.05012799999843299, 0.03891066831209444], [0.22660264307074338, 0.04182966241027795, 0.021273249375380037, 0.026295005731532515, 0.3432264477996259, 0.2707596204107492]]
Converged in  0  loops


In [170]:
#test
cluster = [{0:0, 2:0, 4:0} for i in range(K)]
new_center = [[0 for i in range(len(df.columns)-1)] for j in range(K)]

#find the closest centroids
for index, row in df.iterrows():
    min_ = float('inf')
    min_i = 0

    d = [0 for i in range(K)]
    for c in range(K):
        dist = 0
        for i in range(len(df.columns)-1):
            dist += (row[i] - Centroids[c][i])*(row[i] - Centroids[c][i])
        d[c] = dist

    min_ = min(d)
    min_i = d.index(min_)
    cluster[min_i][row[-1]] += 1

#calculating the highest accuracy percentage among labels from the percentage.
accuracy_list = []
for d in cluster:
    accuracy_list.append(max([n/sum(d.values()) for n in d.values()]))
accuracy_list = sorted(accuracy_list)

for k in range(K):
    accuracies[k].append(accuracy_list[k])
print(accuracy_list)

[0.4675324675324675, 0.5, 0.7272727272727273]


In [171]:
print("average accuracy: ")
from statistics import mean
for k in range(K):
    print(mean(accuracies[k]))
print("") 

print("average loop: ")
print(mean(loops))

print("after running k-means clustering" , len(loops) , "times.")

average accuracy: 
0.44854086068387344
0.6485945547821682
0.8069028925591774

average loop: 
1.1
after running k-means clustering 20 times.
