<font size="5">Importing required libraries</font>

In [1]:
#import libraries
#import sys
#!{sys.executable} -m pip install scikit-learn-extra
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
import nltk
nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tikkanr1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/tikkanr1/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tikkanr1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<font size="5">Preprocessing the data</font>

In [2]:
#load dataset
df = pd.read_csv('abstractdata5.csv', sep='\n', header=None)

#separate data columns and combine title and text
df = df[0].str.split('#', expand=True)
df['text'] = df[2]+df[3]
df = df.drop([2,3], axis=1)

#rename columns
df = df.rename(columns={0:'id', 1:'class'})

#set index
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,1,Anomaly detection in wide area imagery [Geniş ...
id2,1,Person re-identification with deep kronecker-p...
id3,1,Crack detection in images of masonry using cnn...
id4,5,Towards an energy efficient code generator for...
id5,5,Sub-polyhedral scheduling using (Unit-)two-var...


In [3]:
#preprosessing
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
stemmer = nltk.stem.snowball.EnglishStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
stop = nltk.corpus.stopwords.words('english')
for index, row in df.iterrows():
    
    #tokenize text
    tokens = tokenizer.tokenize(row['text'])
    
    #filtered words list
    filtered_words = []
    
    #remove non-english words and stopwords
    for word in tokens:
        word = word.lower()
        if word.isascii() and word.isalpha() and word not in stop:
            filtered_words.append(word)

    #stemming
    final_words = [stemmer.stem(word.strip()) for word in filtered_words]
    #lemmatization
    #final_words = [lemmatizer.lemmatize(word.strip(), pos='n') for word in filtered_words]
    row['text'] = " ".join(final_words)
df.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,1,anomali detect wide area imageri alan anomali ...
id2,1,person identif deep kroneck product match grou...
id3,1,crack detect imag masonri use cnnswhile signif...
id4,5,toward energi effici code generat mobil phones...
id5,5,sub polyhedr schedul use unit two variabl per ...


<font size="5">Computing the tfidf matrix</font>

In [4]:
# create tfidf matrix with n_gram(1,3) and normalize data 
def tfidf(df, norml, mindf):
    count = TfidfVectorizer(norm=norml,min_df=mindf, ngram_range=(1,3))
    data = count.fit_transform(df['text'])
    bag = pd.DataFrame(data.toarray(), columns=count.get_feature_names(), index=df.index)
    return bag


<font size="5">Calculating the NMI score</font>

In [5]:
#NMI score with geometric average as in Strehl and Ghosh
def nmi(truelabels, predlabels):
    nmi = metrics.normalized_mutual_info_score(truelabels, predlabels, average_method='geometric')
    return nmi

<font size="5"> Clustering using the Buckshot method</font>

In [6]:
#Buckshot method
def buckshot(df, K, norm, minfr, affinity, linkage, seed):
    
    #define data
    data = tfidf(df, norm, minfr)

    #choose sqrt(Kn) samples
    n = len(data)
    n_samples = int(np.sqrt(K*n))
    samples = data.sample(n_samples, random_state=seed)

    #create K seeds with hierarchical clustering
    clustering = AgglomerativeClustering(n_clusters=K, affinity=affinity, linkage=linkage)
    clustering = clustering.fit(samples)

    #concatenate word occurrence in documents for each K cluster
    samples['results_of_clustering'] = clustering.labels_
    concatenated = samples.groupby(samples['results_of_clustering']).sum()
    init = concatenated.values
    
    #normalize word occurrence values
    init = normalize(init, norm='l1')

    #apply K-means to seeds
    gather = KMeans(n_clusters=K, init=init, n_init=1)
    result = gather.fit(data)
    
    return result.labels_, data


<font size="5">Comparing the clustering results</font>

In [7]:
#compare buckshot with different parameters
norms = ['l2']
mins = [5, 4, 3, 2]
links = ['ward', 'complete', 'average', 'single']
all_affs = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']

#define truelabels
truelabels = df['class'].astype(int).values

#df for results
results = pd.DataFrame()

#placeholder
best = 0
bestname = ''

#SEED FOR REPRODUCING RESULTS
seed = 11

#compute NMI for all parameters
for norm in norms:
    for minf in mins:
        for link in links:
            if link == 'ward':
                affs = ['euclidean']
            else:
                affs = all_affs
            for aff in affs:
                name = norm + '-' + str(minf) + '-' + link + '-' + aff
                buckshot_labels, data = buckshot(df, 5, norm, minf, aff, link, seed)
                buckshot_nmi = np.round(nmi(truelabels, buckshot_labels),4)
                results[name] = [buckshot_nmi]
                if buckshot_nmi > best:
                    best = buckshot_nmi
                    bestname = name
                print("current nmi: ", str(buckshot_nmi), "current best: ", bestname, str(best))


    

current nmi:  0.7828 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.7616 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.4981 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.7616 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.4981 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.7616 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.7061 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.4722 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.7061 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.4722 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.7073 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.5342 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.4722 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.5342 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.4722 current best:  l2-5-ward-euclidean 0.7828
current nmi:  0.5342 current best:  l2-5-ward-euclidean

In [8]:
#print results in ascending order
results.iloc[0].sort_values(ascending=False)

l2-4-ward-euclidean        0.8120
l2-3-ward-euclidean        0.8045
l2-3-complete-cosine       0.7941
l2-3-complete-euclidean    0.7941
l2-3-complete-l2           0.7941
                            ...  
l2-3-complete-manhattan    0.4451
l2-3-complete-l1           0.4451
l2-2-single-euclidean      0.3247
l2-2-single-l2             0.3247
l2-2-single-cosine         0.3247
Name: 0, Length: 64, dtype: float64

<font size="5">Constructing the confusion matrix </font>

In [9]:
#confusion matrix
def confusion_matrix(truelabels,labels):
    confusion_df = pd.DataFrame({'Classes': truelabels, 'Clusters': labels})
    confusion_table = pd.crosstab(confusion_df['Classes'], confusion_df['Clusters'])
    return confusion_table

#truelabels
truelabels = df['class'].astype(int).values

#best buckshot clustering labels
buckshot_labels, data = buckshot(df, 5, 'l2', 4, 'euclidean', 'ward', 11)

#result matrix
confusion_matrix(truelabels, buckshot_labels)

Clusters,0,1,2,3,4
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,346,2,1,0,4
2,7,5,185,2,3
3,27,0,0,3,233
4,4,227,1,7,0
5,5,5,5,253,7


<font size="5">Finding topics for the clusters</font>

In [10]:
df['cluster'] = buckshot_labels
df.head()

Unnamed: 0_level_0,class,text,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id1,1,anomali detect wide area imageri alan anomali ...,0
id2,1,person identif deep kroneck product match grou...,0
id3,1,crack detect imag masonri use cnnswhile signif...,0
id4,5,toward energi effici code generat mobil phones...,3
id5,5,sub polyhedr schedul use unit two variabl per ...,3


In [11]:
#Get information for topics of clusters
from collections import Counter

cluster_relative_freq = []

for i in range(0,5):
    
    cluster = df[df['cluster']==i]
    #print(cluster.head())
    
    count = Counter(" ".join(cluster["text"]).split()).most_common(100)
    print("Cluster " + str(i) + " frequencies: ")
    print(count)
    topten = count[:20]
    relative_frequency = []
    for j in range(0,20):
        sum_of_appearances = 0
        for index, row in cluster.iterrows():
            if topten[j][0] in row['text']:
                sum_of_appearances += 1
        relative_frequency.append((topten[j][0],sum_of_appearances,sum_of_appearances/len(cluster)))
    cluster_relative_freq.append(relative_frequency)
    
    relative_frequency.sort(key=lambda x:x[2],reverse=True)
    print("\nRelative frequencies inside cluster (keyword, in how many docs does it appear, how much is that compared to docs in cluster): ")
    print(relative_frequency)
    print("\n")
    

Cluster 0 frequencies: 
[('use', 681), ('imag', 639), ('base', 538), ('method', 536), ('detect', 470), ('model', 463), ('propos', 450), ('comput', 396), ('vision', 365), ('learn', 364), ('system', 340), ('perform', 316), ('network', 315), ('data', 295), ('result', 292), ('object', 266), ('algorithm', 265), ('deep', 251), ('dataset', 244), ('featur', 238), ('approach', 233), ('paper', 213), ('time', 211), ('accuraci', 207), ('train', 203), ('studi', 200), ('video', 194), ('process', 192), ('applic', 191), ('visual', 187), ('track', 187), ('differ', 185), ('research', 185), ('measur', 179), ('techniqu', 178), ('improv', 177), ('inform', 173), ('develop', 169), ('estim', 160), ('evalu', 160), ('segment', 160), ('neural', 154), ('structur', 153), ('test', 150), ('camera', 149), ('show', 148), ('two', 145), ('high', 143), ('work', 141), ('present', 140), ('achiev', 136), ('recognit', 134), ('provid', 133), ('challeng', 132), ('effect', 131), ('real', 130), ('analysi', 130), ('design', 127),