<font size="5">Import required libraries</font>

In [1]:
#import libraries
import time
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tikkanr1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/tikkanr1/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

<font size="5">Preprocessing data </font>

In [3]:
#load dataset
df = pd.read_csv('abstractdata5.csv', sep='\n', header=None)

#separate data columns and combine title and text
df = df[0].str.split('#', expand=True)
df['text'] = df[2]+df[3]
df = df.drop([2,3], axis=1)

#rename columns
df = df.rename(columns={0:'id', 1:'class'})

#set index
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,1,Anomaly detection in wide area imagery [Geniş ...
id2,1,Person re-identification with deep kronecker-p...
id3,1,Crack detection in images of masonry using cnn...
id4,5,Towards an energy efficient code generator for...
id5,5,Sub-polyhedral scheduling using (Unit-)two-var...


In [4]:
#preprosessing
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
stemmer = nltk.stem.snowball.EnglishStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
stop = stopwords.words('english')
for index, row in df.iterrows():
    
    #tokenize text (also removes punctuation)
    tokens = tokenizer.tokenize(row['text'])
    
    #filtered words list
    filtered_words = []
    
    #remove non-english words and stopwords
    for word in tokens:
        word = word.lower()
        if word.isascii() and word.isalpha() and word not in stop:
            filtered_words.append(word)

    #stemming
    final_words = [stemmer.stem(word.strip()) for word in filtered_words]
    #lemmatization
    #final_words = [lemmatizer.lemmatize(word.strip(), pos='a') for word in filtered_words]
    row['text'] = " ".join(final_words)
df.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,1,anomali detect wide area imageri alan anomali ...
id2,1,person identif deep kroneck product match grou...
id3,1,crack detect imag masonri use cnnswhile signif...
id4,5,toward energi effici code generat mobil phones...
id5,5,sub polyhedr schedul use unit two variabl per ...


<font size="5">Calculating tfidf</font>

In [5]:
# create tfidf matrix and normalize data
#‘l2’: Sum of squares of vector elements is 1. 
#The cosine similarity between two vectors is their 
#dot product when l2 norm has been applied.
def tfidf(df, norml):
    count = TfidfVectorizer(norm=norml)
    data = count.fit_transform(df['text'])
    bag = pd.DataFrame(data.toarray(), columns=count.get_feature_names(), index=df.index)
    return bag

count_df = tfidf(df,'l2')
count_df.head()

Unnamed: 0_level_0,aan,aanmdof,aatcc,ab,abadi,abandon,abattoir,abc,abdomen,abdomin,...,zipper,zipperw,zno,zolb,zone,zsl,zucca,zupt,zybo,zynq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<font size="5">KMeans clustering</font>

In [7]:
#cluster using KMeans K=5
K = 5

#seed for reproducing results 
# DO NOT CHANGE!!!!
seed = 11 

#clustering
kmeans_5 = KMeans(n_clusters=5, random_state=seed).fit(count_df)

<font size="5">Constructing confusion matrix</font>

In [8]:
#confusion matrix
def confusion_matrix(df,kmeans):
    labels = kmeans.labels_
    confusion_df = pd.DataFrame({'Classes': df['class'].astype(int).values, 'Clusters': labels})
    confusion_table = pd.crosstab(confusion_df['Classes'], confusion_df['Clusters'])
    return confusion_table

confusion_matrix(df, kmeans_5)

Clusters,0,1,2,3,4
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,1,351,1
2,0,2,165,32,3
3,164,1,0,97,1
4,0,1,1,44,193
5,0,234,5,26,10


<font size="5">Calculating NMI score</font>

In [10]:
#NMI
#NMI score with geometric average as in Strehl and Ghosh
def nmi(df,kmeans):
    truelabels = df['class'].astype(int).values
    predlabels = kmeans.labels_
    nmi = metrics.normalized_mutual_info_score(truelabels, predlabels, average_method='geometric')
    return nmi

nmi(df, kmeans_5)

0.676632763629533