In [59]:
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

from os import listdir
from os.path import isfile, join

In [49]:
# Reading data in a list

data_path = "E:/fyp-shizzz/nlp/data"

data_files = [data_path+"/"+f for f in listdir(data_path) if isfile(join(data_path, f))]

data = []

for file in data_files:
    with open (file,'r',encoding="utf8") as f:
        data.append(f.read())
        

In [40]:
len(data)

6866

In [50]:
# Load some categories from the training set
categories = [
    'related',
    'notrelated'
]

In [60]:

labels = [1,0]
true_k = 2


In [61]:
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

vectorizer = TfidfVectorizer(max_df=0.5,
                                 min_df=1, stop_words='english',
                                 use_idf=True)
X = vectorizer.fit_transform(data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

Extracting features from the training dataset using a sparse vectorizer
done in 5.044667s
n_samples: 6866, n_features: 56460



In [69]:
indices = np.argsort(vectorizer.idf_)[::-1]
features = vectorizer.get_feature_names()
top_n = 100
top_features = [features[i] for i in indices[:top_n]]
print (top_features)

['힘내주세요', 'oases', 'e_143083589', 'oaths', 'oatmeal', 'oatway', 'oatway81', 'oaxaca', 'e953a2337d39_i1', 'obaid', 'obaidul', '2487908', 'e82ac', 'obeisance', 'obermann', 'oberoi', 'e6frg12c', 'e6frfmyi', 'obfuscation', 'obgyn', 'e6frfkui', 'e6e635ea', '2487880', 'objectification', 'e5a13999412f95a3ec1eef19d34146a3', 'oasis', 'oarticle', 'ntb', 'eac', 'nyti', '53679', 'nytimesphoto', '2489', '53695', 'nzcatholic', 'nzd', '5377', 'nzfs', '2488794', 'earl', 'earhtquake', 'nznepalsociety', 'eans', 'nzsee', 'nzt', 'eam', 'o001', 'o2', 'eahpssrwis', 'eagan', 'oakdale', 'oakes', '2487635', 'objectively', 'e4828ebc6177a91cb13eb89a774857f3', 'e475897dce9b', 'observatories', 'dysfunction', '24806357d366b0df9c96c149dd23fc1e', '2480', 'dynamited', '538800', 'dyna', 'dyn', 'dyet', 'obsessively', 'dyed', 'dydirector', 'dybesh', 'obstetrical', 'dybach', 'dy', 'dxit', 'dws', '539c', 'obstructionist', 'dwells', '24761', '24743', '24819', 'dzogchen', 'observances', 'oblivion', 'e44280c2', 'e2gkkjvri1', 

In [54]:
if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

In [55]:
# #############################################################################
# Do the actual clustering


km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=2, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=False)
done in 24.942s



In [58]:
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# print("Adjusted Rand-Index: %.3f"
#       % metrics.adjusted_rand_score(labels, km.labels_))
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()


if not opts.use_hashing:
    print("Top terms per cluster:")

    if opts.n_components:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print()
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :50]:
            print(' %s' % terms[ind], end='')
        print()


Top terms per cluster:

Cluster 0: rescue aid india china everest team killed chinese injured toll search government helicopter capital teams nepalese buildings disaster tuesday death million camp international indian army avalanche april areas helicopters saturday collapsed including base aftershocks sunday missing survivors foreign military medical supplies rubble reported told ministry officials dead district according police

Cluster 1: team aid family nepalese rescue government disaster medical india victims home support april 2015 children efforts says just saturday israel minister safe world international food affected new red time emergency students need water supplies israeli cross community group assistance members days like local million response missing 25 humanitarian foreign families
