In [1]:
from pathlib import Path
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
import os


# Root folder of the corpus on the local machine.
data_folder = "C:/Data/DataSetForPaper2023/crisis3"

# Read every document under data_folder. Text is decoded as UTF-8 with
# undecodable bytes ignored, and the documents are shuffled with a fixed
# seed so their order is the same on every run.
dataset = sklearn.datasets.load_files(
    data_folder,
    description=None,
    categories=None,
    load_content=True,
    shuffle=True,
    encoding="utf-8",
    decode_error="ignore",
    random_state=0,
)

# Summarise what was loaded.
n_documents = len(dataset.data)
n_categories = len(dataset.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)
print()

1500 documents
3 categories



In [2]:
# Ground-truth category ids; the number of distinct ids is used as the
# target number of clusters.
labels = dataset.target
true_k = len(np.unique(labels))

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

print("tfidf vectorizer")
# TF-IDF settings: English stop words removed, terms must occur in at least
# 2 documents and in at most half of them, vocabulary capped at 2000 terms.
tfidf_options = dict(
    stop_words="english",
    use_idf=True,
    min_df=2,
    max_df=0.5,
    max_features=2000,  # 10000
)
vectorizer = TfidfVectorizer(**tfidf_options)

Extracting features from the training dataset using a sparse vectorizer
tfidf vectorizer


In [3]:
# Restart the timer right before the work being measured. Relying on a t0
# set in an earlier cell would also count the time spent between cell
# executions, so the printed duration would not reflect the vectorization.
t0 = time()

# Learn the vocabulary/IDF weights and build the document-term matrix.
X = vectorizer.fit_transform(dataset.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

# Number of documents (rows of X); written to the results file later.
numDocs = X.shape[0]

done in 0.044480s
n_samples: 1500, n_features: 1331


In [4]:
# K-means with k-means++ seeding, a single initialisation and at most
# 100 iterations; one cluster per ground-truth category.
# NOTE(review): no random_state is passed, so repeated executions may give
# different clusterings - confirm this is intended.
km = KMeans(
    n_clusters=true_k,
    init="k-means++",
    n_init=1,
    max_iter=100,
    verbose=False,
)

print("Clustering sparse data with %s" % km)

# Time only the fit itself.
t0 = time()
km.fit(X)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)

Clustering sparse data with KMeans(max_iter=100, n_clusters=3, n_init=1, verbose=False)
done in 0.162s


In [5]:
# Compare the cluster assignment against the ground-truth labels.
predicted = km.labels_

# Homogeneity, completeness and V-measure come from one combined call; the
# individual *_score functions return the matching element of this tuple.
h, c, v = metrics.homogeneity_completeness_v_measure(labels, predicted)
adjustedRand = metrics.adjusted_rand_score(labels, predicted)

# Print each score with three decimals.
score_report = (
    ("V-measure: %0.3f", v),
    ("Homogeneity: %0.3f", h),
    ("Completeness: %0.3f", c),
    ("Adjusted Rand: %0.3f", adjustedRand),
)
for template, score in score_report:
    print(template % score)

V-measure: 0.362
Homogeneity: 0.309
Completeness: 0.437
Adjusted Rand: 0.166


In [6]:
# CSV file that accumulates one row of scores per execution of this cell.
filePath = "resultsKmeans.csv"

# Append mode creates the file if it is missing and keeps earlier rows.
# The with-block closes the handle on exit, so the row is flushed to disk
# and the handle is not leaked. Without it, a re-run in the same kernel could
# still see a size of 0 and write the header a second time.
with open(filePath, "a") as resultsFile:
    # A zero-byte file has no header yet, so write the column names first.
    if os.path.getsize(filePath) == 0:
        resultsFile.write("index, v, h, c, adjustRand, numDocs \n")

    # One row: dataset tag followed by the scores and the document count.
    row_fields = ["crisis3 ", str(v), str(h), str(c), str(adjustedRand), str(numDocs)]
    resultsFile.write(", ".join(row_fields) + "\n")




97