In [71]:
import pandas as pd
import numpy as np
import re

In [118]:
dataset = pd.read_csv("dataset.txt", delimiter="|", encoding="ISO-8859-1", index_col="AutoID")
dataset.head()

Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8/26/2015,2015,8,twitter,3 ways the internet of things will change Bank...
2,8/5/2015,2015,8,twitter,BankB BankB Name downgrades apple stock to neu...
3,8/12/2015,2015,8,twitter,BankB returns to profit on INTERNET/! board2? ...
4,8/5/2015,2015,8,twitter,BankB tells advisers to exit paulson hedge fun...
5,8/12/2015,2015,8,twitter,BankC may plead guilty over foreign exchange p...


## Sample Dataset

In [91]:
rows = np.random.choice(dataset.index.values, 10000)
sample = dataset.ix[rows]

## Clustering

In [92]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn import feature_extraction

In [105]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords += ["ADDRESS", "INTERNET", "Name", "twit_hndl", "PHONE", "twit_hndl_BankA", "twit_hndl_BankB", "twit_hndl_BankC", "twit_hndl_BankD"]
stemmer = SnowballStemmer("english")

In [94]:
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = list(filter(lambda x: not x[0].isupper(), tokens))
    filtered_tokens = []
    for token in tokens:        
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = list(filter(lambda x: not x[0].isupper(), tokens))
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [95]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for each in sample["FullText"]:
    allwords_stemmed = tokenize_and_stem(each)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_only(each)
    totalvocab_tokenized.extend(allwords_tokenized)

In [96]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
vocab_frame = vocab_frame.dropna()

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=.8, max_features=20000,
                                    min_df=.2, stop_words=stopwords,
                                  use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(sample["FullText"])

print(tfidf_matrix.shape)

CPU times: user 20.8 s, sys: 222 ms, total: 21 s
Wall time: 24.2 s
(10000, 6)
  (0, 3)	0.61341297936
  (0, 1)	0.682664841344
  (0, 4)	0.397106070395


In [110]:
from sklearn.cluster import KMeans
num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 2.71 s, sys: 24.9 ms, total: 2.74 s
Wall time: 3.03 s


In [111]:
from sklearn.externals import joblib

joblib.dump(km, 'cluster_algo1.pkl')

# km = joblib.load('cluster_algo1.pkl')

['cluster_algo1.pkl', 'cluster_algo1.pkl_01.npy', 'cluster_algo1.pkl_02.npy']

In [112]:
sample["Cluster"] = clusters
sample["Cluster"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


2    3613
3    1929
1    1551
0    1458
4    1449
dtype: int64

In [114]:
print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :20]:
        print(" %s" % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0], end=",")
    print()
    print()
    
print()
print()

Top terms per cluster:

Cluster 0 words: nan, name, internet, nan, name, nan,

Cluster 1 words: nan, name, internet, name, nan, nan,

Cluster 2 words: name, name, internet, nan, nan, nan,

Cluster 3 words: internet, name, nan, nan, nan, name,

Cluster 4 words: nan, name, internet, nan, nan, name,



