# Clustering Based Method

# Importing Required Libraries

In [32]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import warnings
warnings.filterwarnings('ignore')

# Train and Test Classifier

In [33]:
def train_and_test(file,classifier):
    start = time.time()
    data = pd.read_csv('extracted-features/'+file)
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1:]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=30)
    model = eval(classifier+'(n_clusters=2)')
    model.fit(X_train)
    prediction = model.fit_predict(X_test)
    end = time.time()
    print('File:',file)
    print('Execution Time:',round(end - start, 2),'sec')
    print('Accuracy: [',str(classifier),':',accuracy_score(y_test,prediction.round()).round(3),']')
    print('RMS Error: [',str(classifier),':',mean_squared_error(y_test,prediction.round()).round(3),']\n')

# Main

In [34]:
train_and_test('stylometry.csv','KMeans')
train_and_test('stylometry.csv','AgglomerativeClustering')

File: stylometry.csv
Execution Time: 0.08 sec
Accuracy: [ KMeans : 0.348 ]
RMS Error: [ KMeans : 0.652 ]

File: stylometry.csv
Execution Time: 0.02 sec
Accuracy: [ AgglomerativeClustering : 0.383 ]
RMS Error: [ AgglomerativeClustering : 0.617 ]



In [35]:
train_and_test('word_1_gram.csv','KMeans')
train_and_test('word_1_gram.csv','AgglomerativeClustering')

File: word_1_gram.csv
Execution Time: 0.82 sec
Accuracy: [ KMeans : 0.447 ]
RMS Error: [ KMeans : 0.553 ]

File: word_1_gram.csv
Execution Time: 0.42 sec
Accuracy: [ AgglomerativeClustering : 0.589 ]
RMS Error: [ AgglomerativeClustering : 0.411 ]



In [36]:
train_and_test('word_2_gram.csv','KMeans')
train_and_test('word_2_gram.csv','AgglomerativeClustering')

File: word_2_gram.csv
Execution Time: 0.87 sec
Accuracy: [ KMeans : 0.511 ]
RMS Error: [ KMeans : 0.489 ]

File: word_2_gram.csv
Execution Time: 0.38 sec
Accuracy: [ AgglomerativeClustering : 0.539 ]
RMS Error: [ AgglomerativeClustering : 0.461 ]



In [37]:
train_and_test('word_3_gram.csv','KMeans')
train_and_test('word_3_gram.csv','AgglomerativeClustering')

File: word_3_gram.csv
Execution Time: 0.76 sec
Accuracy: [ KMeans : 0.532 ]
RMS Error: [ KMeans : 0.468 ]

File: word_3_gram.csv
Execution Time: 0.34 sec
Accuracy: [ AgglomerativeClustering : 0.532 ]
RMS Error: [ AgglomerativeClustering : 0.468 ]



In [38]:
train_and_test('char_3_gram.csv','KMeans')
train_and_test('char_3_gram.csv','AgglomerativeClustering')

File: char_3_gram.csv
Execution Time: 0.97 sec
Accuracy: [ KMeans : 0.56 ]
RMS Error: [ KMeans : 0.44 ]

File: char_3_gram.csv
Execution Time: 0.62 sec
Accuracy: [ AgglomerativeClustering : 0.553 ]
RMS Error: [ AgglomerativeClustering : 0.447 ]



In [39]:
train_and_test('char_4_gram.csv','KMeans')
train_and_test('char_4_gram.csv','AgglomerativeClustering')

File: char_4_gram.csv
Execution Time: 0.97 sec
Accuracy: [ KMeans : 0.454 ]
RMS Error: [ KMeans : 0.546 ]

File: char_4_gram.csv
Execution Time: 0.65 sec
Accuracy: [ AgglomerativeClustering : 0.532 ]
RMS Error: [ AgglomerativeClustering : 0.468 ]



In [40]:
train_and_test('char_5_gram.csv','KMeans')
train_and_test('char_5_gram.csv','AgglomerativeClustering')

File: char_5_gram.csv
Execution Time: 1.11 sec
Accuracy: [ KMeans : 0.447 ]
RMS Error: [ KMeans : 0.553 ]

File: char_5_gram.csv
Execution Time: 0.54 sec
Accuracy: [ AgglomerativeClustering : 0.603 ]
RMS Error: [ AgglomerativeClustering : 0.397 ]



In [41]:
train_and_test('char_6_gram.csv','KMeans')
train_and_test('char_6_gram.csv','AgglomerativeClustering')

File: char_6_gram.csv
Execution Time: 1.05 sec
Accuracy: [ KMeans : 0.411 ]
RMS Error: [ KMeans : 0.589 ]

File: char_6_gram.csv
Execution Time: 0.5 sec
Accuracy: [ AgglomerativeClustering : 0.433 ]
RMS Error: [ AgglomerativeClustering : 0.567 ]



In [42]:
train_and_test('char_7_gram.csv','KMeans')
train_and_test('char_7_gram.csv','AgglomerativeClustering')

File: char_7_gram.csv
Execution Time: 0.9 sec
Accuracy: [ KMeans : 0.504 ]
RMS Error: [ KMeans : 0.496 ]

File: char_7_gram.csv
Execution Time: 0.53 sec
Accuracy: [ AgglomerativeClustering : 0.553 ]
RMS Error: [ AgglomerativeClustering : 0.447 ]



In [43]:
train_and_test('char_8_gram.csv','KMeans')
train_and_test('char_8_gram.csv','AgglomerativeClustering')

File: char_8_gram.csv
Execution Time: 1.42 sec
Accuracy: [ KMeans : 0.475 ]
RMS Error: [ KMeans : 0.525 ]

File: char_8_gram.csv
Execution Time: 0.47 sec
Accuracy: [ AgglomerativeClustering : 0.546 ]
RMS Error: [ AgglomerativeClustering : 0.454 ]



In [44]:
train_and_test('char_9_gram.csv','KMeans')
train_and_test('char_9_gram.csv','AgglomerativeClustering')

File: char_9_gram.csv
Execution Time: 1.82 sec
Accuracy: [ KMeans : 0.518 ]
RMS Error: [ KMeans : 0.482 ]

File: char_9_gram.csv
Execution Time: 0.38 sec
Accuracy: [ AgglomerativeClustering : 0.56 ]
RMS Error: [ AgglomerativeClustering : 0.44 ]



In [45]:
train_and_test('char_10_gram.csv','KMeans')
train_and_test('char_10_gram.csv','AgglomerativeClustering')

File: char_10_gram.csv
Execution Time: 0.92 sec
Accuracy: [ KMeans : 0.496 ]
RMS Error: [ KMeans : 0.504 ]

File: char_10_gram.csv
Execution Time: 0.35 sec
Accuracy: [ AgglomerativeClustering : 0.539 ]
RMS Error: [ AgglomerativeClustering : 0.461 ]

