In [23]:
import os
import re
import logging
from optparse import OptionParser
import sys
from time import time
import glob
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd
from time import time
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from datetime import datetime

In [24]:
#change this to where you have the dataset in your local machine 

data_folder = 'C:\\Data\\DataSetForPaper2023\\'
#'C:\\Users\\boo34\\OneDrive - University of Brighton\\Desktop\\Research Papers\\Publication\\DataSetForPaper2023\\'

In [51]:
t0 = time()
collection_list = ["crisis3", "NG3", "crisis4", "R4", "NG5", "R5", "NG6", "R6"]
#collection_list = ["NG3"]

for collectionName in collection_list:

    container_path = Path(data_folder + collectionName)

    dataset = sklearn.datasets.load_files(container_path,  description=None, categories=None, load_content=True,
                                          shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0, allowed_extensions=None)

    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()    

    labels = dataset.target
    true_k = np.unique(labels).shape[0] 

    useHashing = False
    useIDF = False
    nFeature = 1000

    if useHashing:
        if useIDF:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(
                n_features=nFeature,
                stop_words="english",
                alternate_sign=False,
                norm=None,
            )
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(
                n_features=nFeature,
                stop_words="english",
                alternate_sign=False,
                norm="l2",
            )
    else:
        print("tfidf vectorizer")
        vectorizer = TfidfVectorizer(
            max_df=0.5,
            max_features= nFeature,
            min_df=2,
            stop_words="english",
            use_idf= True 
        )
        
    X = vectorizer.fit_transform(dataset.data)
    tvm = vectorizer.fit_transform(dataset.data).toarray()
     
    print("n_samples: %d, n_features: %d" % X.shape)
    numDocs = X.shape[0]   
    
    ag =  AgglomerativeClustering(n_clusters = None, distance_threshold= 1.70). fit(tvm) 
        
    ag_v = metrics.v_measure_score(labels, ag.labels_)
    ag_h = metrics.homogeneity_score(labels, ag.labels_)
    ag_c = metrics.completeness_score(labels, ag.labels_)
    ag_adjustedRand = metrics.adjusted_rand_score(labels, ag.labels_)
        
    labelLength = ag.labels_
    uniqueLabel=len(np.unique(labelLength))

    print(f"ag v {ag_v}  ag rand {ag_adjustedRand} ag unique label length {uniqueLabel}")
        
    sc= SpectralClustering(n_clusters = true_k, affinity ='nearest_neighbors').fit(tvm)

    sc_v = metrics.v_measure_score(labels, sc.labels_)
    sc_h = metrics.homogeneity_score(labels, sc.labels_)
    sc_c = metrics.completeness_score(labels, sc.labels_)
    sc_adjustedRand = metrics.adjusted_rand_score(labels, sc.labels_)

    print(f"spectral v {sc_v} spectral adjustred rand {sc_adjustedRand}")

    # K - means clustering runs
    for i in range(11):  

        print()

        km = KMeans(
            n_clusters=true_k,
            init="k-means++",
            max_iter=100,
            n_init=1,
            verbose= False
        )

        print(f"kMeans ++ run number: {i}")
        print(f"Clustering sparse data with {km}") 
        km.fit(X)

        # %%
        # Performance metrics
        # -------------------

        km_v = metrics.v_measure_score(labels, km.labels_)
        km_h = metrics.homogeneity_score(labels, km.labels_)
        km_c = metrics.completeness_score(labels, km.labels_)
        km_adjustedRand = metrics.adjusted_rand_score(labels, km.labels_)        
        
        print(f"K Means V-measure: {km_v:.5f} Homogeneity: {km_h:.2f} Completeness: {km_c:.2f} Adjusted Rand-Index: {km_adjustedRand:.2f}")
                
        filePath = "results/resultsKpython4c.csv"
        resultsFile = open(filePath, "a")

        if os.path.getsize(filePath) == 0:
            resultsFile.write("index, km_v, km_h, km_c, km_adjustRand, ag_v, ag_adjustedRand, sc_v, sc_adjustedRand, nFeature, numDocs, useHashing, date \n")

        resultsFile.write(f"{collectionName}, {km_v}, {km_h}, {km_c}, {km_adjustedRand}, {ag_v}, {ag_adjustedRand}, {sc_v}, {sc_adjustedRand}, {nFeature}, {numDocs}, {useHashing}, {datetime.now()}  \n")    

        print()
        resultsFile.close()  
 
print("total time %fs" % (time() - t0))
    

1500 documents
3 categories

tfidf vectorizer
n_samples: 1500, n_features: 1000
ag v 0.2425812366191477  ag rand 0.010373549799069197 ag unique label length 279




spectral v 0.03979519291518766 spectral adjustred rand 0.003036851561645567

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.60147 Homogeneity: 0.58 Completeness: 0.62 Adjusted Rand-Index: 0.52


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.51309 Homogeneity: 0.48 Completeness: 0.55 Adjusted Rand-Index: 0.38


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.61527 Homogeneity: 0.59 Completeness: 0.64 Adjusted Rand-Index: 0.55


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.34157 Homogeneity: 0.31 Completeness: 0.38 Adjusted Rand-Index: 0.24


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=Fa



spectral v 0.8130196076236885 spectral adjustred rand 0.8691482374378939

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.87555 Homogeneity: 0.88 Completeness: 0.88 Adjusted Rand-Index: 0.92


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.83384 Homogeneity: 0.83 Completeness: 0.84 Adjusted Rand-Index: 0.85


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.85002 Homogeneity: 0.85 Completeness: 0.85 Adjusted Rand-Index: 0.89


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
K Means V-measure: 0.82302 Homogeneity: 0.82 Completeness: 0.82 Adjusted Rand-Index: 0.85


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False



spectral v 0.06673473479749487 spectral adjustred rand 0.0022926944098440693

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.57504 Homogeneity: 0.54 Completeness: 0.62 Adjusted Rand-Index: 0.38


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.62925 Homogeneity: 0.60 Completeness: 0.66 Adjusted Rand-Index: 0.49


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.62514 Homogeneity: 0.60 Completeness: 0.66 Adjusted Rand-Index: 0.48


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.38572 Homogeneity: 0.35 Completeness: 0.42 Adjusted Rand-Index: 0.26


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=F



spectral v 0.43311982379883335 spectral adjustred rand 0.2518157413220778

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.61337 Homogeneity: 0.55 Completeness: 0.69 Adjusted Rand-Index: 0.44


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.59980 Homogeneity: 0.55 Completeness: 0.66 Adjusted Rand-Index: 0.42


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.55066 Homogeneity: 0.50 Completeness: 0.61 Adjusted Rand-Index: 0.42


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
K Means V-measure: 0.63230 Homogeneity: 0.58 Completeness: 0.69 Adjusted Rand-Index: 0.47


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=Fals



spectral v 0.6099308914347776 spectral adjustred rand 0.5169405972607832

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.70376 Homogeneity: 0.69 Completeness: 0.72 Adjusted Rand-Index: 0.59


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.75888 Homogeneity: 0.75 Completeness: 0.77 Adjusted Rand-Index: 0.70


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.76725 Homogeneity: 0.76 Completeness: 0.77 Adjusted Rand-Index: 0.72


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.75434 Homogeneity: 0.75 Completeness: 0.76 Adjusted Rand-Index: 0.70


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False



spectral v 0.30244325434372066 spectral adjustred rand 0.13978698541144677

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.52933 Homogeneity: 0.50 Completeness: 0.56 Adjusted Rand-Index: 0.44


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.59486 Homogeneity: 0.57 Completeness: 0.63 Adjusted Rand-Index: 0.48


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.42103 Homogeneity: 0.38 Completeness: 0.47 Adjusted Rand-Index: 0.29


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=False)
K Means V-measure: 0.52838 Homogeneity: 0.51 Completeness: 0.54 Adjusted Rand-Index: 0.46


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=5, n_init=1, verbose=Fal



spectral v 0.6390894595075094 spectral adjustred rand 0.5436943492754908

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.66164 Homogeneity: 0.65 Completeness: 0.67 Adjusted Rand-Index: 0.57


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.69934 Homogeneity: 0.68 Completeness: 0.71 Adjusted Rand-Index: 0.58


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.70655 Homogeneity: 0.69 Completeness: 0.72 Adjusted Rand-Index: 0.59


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.68252 Homogeneity: 0.67 Completeness: 0.69 Adjusted Rand-Index: 0.60


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False



spectral v 0.3257699970352392 spectral adjustred rand 0.0848087658358895

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.60837 Homogeneity: 0.56 Completeness: 0.67 Adjusted Rand-Index: 0.45


kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.63004 Homogeneity: 0.60 Completeness: 0.66 Adjusted Rand-Index: 0.47


kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.52446 Homogeneity: 0.45 Completeness: 0.62 Adjusted Rand-Index: 0.32


kMeans ++ run number: 3
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False)
K Means V-measure: 0.70552 Homogeneity: 0.70 Completeness: 0.71 Adjusted Rand-Index: 0.65


kMeans ++ run number: 4
Clustering sparse data with KMeans(max_iter=100, n_clusters=6, n_init=1, verbose=False