In [1]:
import os
import re
import logging
from optparse import OptionParser
import sys
from time import time
import glob
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd
from time import time
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD

In [2]:
#change this to where you have the dataset in your local machine 

data_folder = 'C:\\Data\\DataSetForPaper2023\\'
#'C:\\Users\\boo34\\OneDrive - University of Brighton\\Desktop\\Research Papers\\Publication\\DataSetForPaper2023\\'

In [3]:
#collection_list = ["crisis3", "NG3", "crisis4", "R4", "NG5", "R5", "NG6", "R6"]
collection_list = ["NG3"]

for collectionName in collection_list:

    container_path = Path(data_folder + collectionName)

    dataset = sklearn.datasets.load_files(container_path,  description=None, categories=None, load_content=True,
                                          shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0, allowed_extensions=None)

    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()
    
     # Feature Extraction
    # ------------------

    labels = dataset.target
    true_k = np.unique(labels).shape[0]

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()

    useHashing = True
    useIDF = True
    nFeature = 5000

    if useHashing:
        if useIDF:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(
                n_features=nFeature,
                stop_words="english",
                alternate_sign=False,
                norm=None,
            )
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(
                n_features=nFeature,
                stop_words="english",
                alternate_sign=False,
                norm="l2",
            )
    else:
        print("tfidf vectorizer")
        vectorizer = TfidfVectorizer(
            max_df=0.5,
            max_features= 5000, #  opts.n_features,
            min_df=2,
            stop_words="english",
            use_idf= True #opts.use_idf,
        )

1200 documents
3 categories

Extracting features from the training dataset using a sparse vectorizer


In [4]:
# Agglomerative Clustering Ward Method  - clustering runs
for i in range(11):
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    numDocs = X.shape[0]

    print()

    ag =  AgglomerativeClustering(metric = 'euclidean', linkage = 'ward')

    print("Agglomerative Clustering run number: " + str(i))
    print("Clustering sparse data with %s" % ag)
    X = X.toarray()
    t0 = time()
    ag.fit(X)
    print("done in %0.3fs" % (time() - t0))

    # %%
    # Performance metrics
    # -------------------

    v = metrics.v_measure_score(labels, ag.labels_)
    h = metrics.homogeneity_score(labels, ag.labels_)
    c = metrics.completeness_score(labels, ag.labels_)
    adjustedRand = metrics.adjusted_rand_score(labels, ag.labels_)

    print("V-measure: %0.3f" % v)
    print("Homogeneity: %0.3f" % h)
    print("Completeness: %0.3f" % c)

    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, ag.labels_))
  # print("Silhouette Coefficient: %0.3f" %
        #  metrics.silhouette_score(X, ag.labels_, sample_size=1000))

    filePath = "resultsKpython.csv"
    resultsFile = open(filePath, "a")

    if os.path.getsize(filePath) == 0:
        resultsFile.write("index, v, h, c, adjustRand, numDocs \n")

    resultsFile.write(collectionName + ", " + str(v) +
                      ", " + str(h) + ", " + str(c) +  ", " + str(adjustedRand) + ", " + str(numDocs) + "\n")

    print()
    resultsFile.close()

done in 0.184741s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 0
Clustering sparse data with AgglomerativeClustering(metric='euclidean')
done in 1.807s
V-measure: 0.419
Homogeneity: 0.315
Completeness: 0.626
Adjusted Rand-Index: 0.282

done in 1.980016s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 1
Clustering sparse data with AgglomerativeClustering(metric='euclidean')
done in 1.772s
V-measure: 0.419
Homogeneity: 0.315
Completeness: 0.626
Adjusted Rand-Index: 0.282

done in 1.944101s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 2
Clustering sparse data with AgglomerativeClustering(metric='euclidean')
done in 1.784s
V-measure: 0.419
Homogeneity: 0.315
Completeness: 0.626
Adjusted Rand-Index: 0.282

done in 1.955814s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 3
Clustering sparse data with AgglomerativeClustering(metric='euclidean')
done in 1.787s
V-measure: 0.419
Homogeneity: 0.

In [4]:
lau = ag.labels_
len(np.unique(lau))

NameError: name 'ag' is not defined

In [6]:
# K - means clustering runs
for i in range(11):
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    numDocs = X.shape[0]

    print()

    km = KMeans(
        n_clusters=true_k,
        init="k-means++",
        max_iter=100,
        n_init=1,
        verbose= False
    )

    print("kMeans ++ run number: " + str(i))
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))

    # %%
    # Performance metrics
    # -------------------

    v = metrics.v_measure_score(labels, km.labels_)
    h = metrics.homogeneity_score(labels, km.labels_)
    c = metrics.completeness_score(labels, km.labels_)
    adjustedRand = metrics.adjusted_rand_score(labels, km.labels_)

    print("V-measure: %0.3f" % v)
    print("Homogeneity: %0.3f" % h)
    print("Completeness: %0.3f" % c)

    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, km.labels_, sample_size=1000))

    filePath = "resultsKpython.csv"
    resultsFile = open(filePath, "a")

    if os.path.getsize(filePath) == 0:
        resultsFile.write("index, v, h, c, adjustRand, numDocs \n")

    resultsFile.write(collectionName + ", " + str(v) +
                      ", " + str(h) + ", " + str(c) +  ", " + str(adjustedRand) + ", " + str(numDocs) + "\n")

    print()
    resultsFile.close()

done in 3.404462s
n_samples: 1200, n_features: 5000

kMeans ++ run number: 0
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
done in 0.056s
V-measure: 0.868
Homogeneity: 0.867
Completeness: 0.868
Adjusted Rand-Index: 0.904
Silhouette Coefficient: 0.012

done in 0.306756s
n_samples: 1200, n_features: 5000

kMeans ++ run number: 1
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
done in 0.015s
V-measure: 0.865
Homogeneity: 0.865
Completeness: 0.866
Adjusted Rand-Index: 0.904
Silhouette Coefficient: 0.013

done in 0.260939s
n_samples: 1200, n_features: 5000

kMeans ++ run number: 2
Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
done in 0.018s
V-measure: 0.431
Homogeneity: 0.341
Completeness: 0.589
Adjusted Rand-Index: 0.291
Silhouette Coefficient: 0.006

done in 0.264957s
n_samples: 1200, n_features: 5000

kMeans ++ run number: 3
Clustering sparse data with KMeans(max_it

In [None]:
# Agglomerative Clustering Average Method - clustering runs
for i in range(11):
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    numDocs = X.shape[0]

    print()

    ag =  AgglomerativeClustering(metric = 'euclidean', linkage='average')

    print("Agglomerative Clustering run number: " + str(i))
    print("Clustering sparse data with %s" % ag)
    X = X.toarray()
    t0 = time()
    ag.fit(X)
    print("done in %0.3fs" % (time() - t0))

    # %%
    # Performance metrics
    # -------------------

    v = metrics.v_measure_score(labels, ag.labels_)
    h = metrics.homogeneity_score(labels, ag.labels_)
    c = metrics.completeness_score(labels, ag.labels_)
    adjustedRand = metrics.adjusted_rand_score(labels, ag.labels_)

    print("V-measure: %0.3f" % v)
    print("Homogeneity: %0.3f" % h)
    print("Completeness: %0.3f" % c)

    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, ag.labels_))
    #print("Silhouette Coefficient: %0.3f" %
          #metrics.silhouette_score(X, ag.labels_, sample_size=1000))

    filePath = "resultsKpython.csv"
    resultsFile = open(filePath, "a")

    if os.path.getsize(filePath) == 0:
        resultsFile.write("index, v, h, c, adjustRand, numDocs \n")

    resultsFile.write(collectionName + ", " + str(v) +
                      ", " + str(h) + ", " + str(c) +  ", " + str(adjustedRand) + ", " + str(numDocs) + "\n")

    print()
    resultsFile.close()

In [8]:
# Agglomerative Clustering Complete Method - clustering runs
for i in range(11):
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    numDocs = X.shape[0]

    print()

    ag =  AgglomerativeClustering(metric = 'euclidean', linkage='complete')

    print("Agglomerative Clustering run number: " + str(i))
    print("Clustering sparse data with %s" % ag)
    X = X.toarray()
    t0 = time()
    ag.fit(X)
    print("done in %0.3fs" % (time() - t0))

    # %%
    # Performance metrics
    # -------------------

    v = metrics.v_measure_score(labels, ag.labels_)
    h = metrics.homogeneity_score(labels, ag.labels_)
    c = metrics.completeness_score(labels, ag.labels_)
    adjustedRand = metrics.adjusted_rand_score(labels, ag.labels_)

    print("V-measure: %0.3f" % v)
    print("Homogeneity: %0.3f" % h)
    print("Completeness: %0.3f" % c)

    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, ag.labels_))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, ag.labels_, sample_size=1000))

    filePath = "resultsKpython.csv"
    resultsFile = open(filePath, "a")

    if os.path.getsize(filePath) == 0:
        resultsFile.write("index, v, h, c, adjustRand, numDocs \n")

    resultsFile.write(collectionName + ", " + str(v) +
                      ", " + str(h) + ", " + str(c) +  ", " + str(adjustedRand) + ", " + str(numDocs) + "\n")

    print()
    resultsFile.close()

done in 1.972646s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 0
Clustering sparse data with AgglomerativeClustering(linkage='complete', metric='euclidean')
done in 1.789s
V-measure: 0.088
Homogeneity: 0.066
Completeness: 0.128
Adjusted Rand-Index: 0.064
Silhouette Coefficient: 0.002

done in 2.052081s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 1
Clustering sparse data with AgglomerativeClustering(linkage='complete', metric='euclidean')
done in 1.846s
V-measure: 0.088
Homogeneity: 0.066
Completeness: 0.128
Adjusted Rand-Index: 0.064
Silhouette Coefficient: 0.003

done in 2.077305s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 2
Clustering sparse data with AgglomerativeClustering(linkage='complete', metric='euclidean')
done in 1.759s
V-measure: 0.088
Homogeneity: 0.066
Completeness: 0.128
Adjusted Rand-Index: 0.064
Silhouette Coefficient: 0.003

done in 1.998310s
n_samples: 1200, n_features: 5000

Agglomer

In [9]:
# Agglomerative Clustering Single Method - clustering runs
for i in range(11):
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    numDocs = X.shape[0]

    print()

    ag =  AgglomerativeClustering(metric = 'euclidean', linkage='single')

    print("Agglomerative Clustering run number: " + str(i))
    print("Clustering sparse data with %s" % ag)
    X = X.toarray()
    t0 = time()
    ag.fit(X)
    print("done in %0.3fs" % (time() - t0))

    # %%
    # Performance metrics
    # -------------------

    v = metrics.v_measure_score(labels, ag.labels_)
    h = metrics.homogeneity_score(labels, ag.labels_)
    c = metrics.completeness_score(labels, ag.labels_)
    adjustedRand = metrics.adjusted_rand_score(labels, ag.labels_)

    print("V-measure: %0.3f" % v)
    print("Homogeneity: %0.3f" % h)
    print("Completeness: %0.3f" % c)

    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, ag.labels_))
    #print("Silhouette Coefficient: %0.3f" %
         # metrics.silhouette_score(X, ag.labels_, sample_size=1000))

    filePath = "resultsKpython.csv"
    resultsFile = open(filePath, "a")

    if os.path.getsize(filePath) == 0:
        resultsFile.write("index, v, h, c, adjustRand, numDocs \n")

    resultsFile.write(collectionName + ", " + str(v) +
                      ", " + str(h) + ", " + str(c) +  ", " + str(adjustedRand) + ", " + str(numDocs) + "\n")

    print()
    resultsFile.close()

done in 2.054746s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 0
Clustering sparse data with AgglomerativeClustering(linkage='single', metric='euclidean')
done in 3.695s
V-measure: 0.002
Homogeneity: 0.001
Completeness: 0.136
Adjusted Rand-Index: 0.000

done in 3.869785s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 1
Clustering sparse data with AgglomerativeClustering(linkage='single', metric='euclidean')
done in 3.661s
V-measure: 0.002
Homogeneity: 0.001
Completeness: 0.136
Adjusted Rand-Index: 0.000

done in 3.836673s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 2
Clustering sparse data with AgglomerativeClustering(linkage='single', metric='euclidean')
done in 3.671s
V-measure: 0.002
Homogeneity: 0.001
Completeness: 0.136
Adjusted Rand-Index: 0.000

done in 3.844595s
n_samples: 1200, n_features: 5000

Agglomerative Clustering run number: 3
Clustering sparse data with AgglomerativeClustering(linkage='sing