In [16]:
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [17]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [18]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [19]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token) and token not in stopwords:
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token) and token not in stopwords:
            filtered_tokens.append(token)
    return filtered_tokens

In [20]:
file = pd.read_csv('Python_Training.csv', sep=',', header=None)

In [21]:
print(file[:3])
print(file.index)

                                                 0  1
0          rego on hold can it be put back on road  1
1                                          buy now  4
2  good runner engine drivetrain x system all good  1
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            589, 590, 591, 592, 593, 594, 595, 596, 597, 598],
           dtype='int64', length=599)


In [22]:
QAs = []
categories = []
print("training data contains {}".format(len(file.index)))
for row in range(len(file.index)):
    tweet = file.iloc[row][0]
    category = file.iloc[row][1]
    QAs.append(tweet)
    categories.append(category)

training data contains 599


In [23]:
print(QAs[0:10])
print(categories[0:10])
print(type(QAs))
print(len(QAs))

['rego on hold can it be put back on road', 'buy now', 'good runner engine drivetrain x system all good', 'fair enough does it have a towbar', 'hi any chance you could post a photo of engine thanks', 'seller added photos', 'where is the rust', 'hi ther wer bouts is pik up from', 'hi there would you like to swap intex metal frame pool and mgp vx team eddition', 'hey guys l am away overseas till nov is it ok to bid an if when pay an pick first weekend home l am from taihape']
[1, 4, 1, 1, 1, 1, 1, 5, 4, 3]
<class 'list'>
599


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=5, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(QAs)

print(tfidf_matrix.shape)

CPU times: user 242 ms, sys: 1.76 ms, total: 244 ms
Wall time: 244 ms
(599, 188)


In [25]:
terms = tfidf_vectorizer.get_feature_names()


In [26]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [27]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 243 ms, sys: 1.99 ms, total: 245 ms
Wall time: 245 ms


In [29]:
print(clusters)
print(categories)

[0, 0, 0, 0, 3, 0, 0, 2, 4, 0, 4, 4, 0, 0, 0, 0, 2, 3, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 2, 3, 3, 3, 4, 1, 2, 4, 0, 3, 4, 4, 2, 0, 0, 0, 0, 3, 2, 0, 2, 2, 2, 0, 0, 0, 0, 4, 4, 1, 1, 2, 2, 0, 0, 2, 3, 2, 0, 0, 0, 1, 1, 0, 0, 4, 0, 2, 2, 1, 0, 1, 3, 0, 0, 0, 3, 3, 2, 3, 3, 2, 1, 0, 3, 0, 3, 1, 1, 1, 0, 0, 3, 1, 1, 1, 1, 1, 2, 1, 1, 2, 3, 3, 1, 3, 3, 0, 3, 3, 3, 2, 1, 3, 3, 4, 4, 2, 1, 1, 1, 2, 3, 3, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 2, 4, 0, 3, 2, 2, 0, 0, 0, 3, 2, 0, 2, 0, 3, 0, 1, 1, 1, 0, 4, 4, 0, 4, 0, 0, 0, 0, 2, 0, 3, 2, 1, 0, 3, 2, 3, 3, 2, 3, 3, 4, 0, 2, 0, 0, 1, 0, 0, 3, 0, 1, 1, 2, 3, 2, 3, 0, 0, 0, 2, 0, 0, 1, 3, 0, 0, 1, 2, 0, 0, 0, 1, 3, 0, 1, 1, 1, 1, 1, 0, 2, 0, 4, 4, 4, 0, 3, 0, 0, 0, 0, 0, 3, 3, 3, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 3, 3, 2, 3, 2, 4, 0, 0, 2, 0, 0, 0, 2, 3, 3, 1, 3, 0, 1, 1, 0, 1, 2, 4, 4, 0, 3, 0, 3, 3, 2, 0, 3, 2, 2, 0, 2, 3, 0, 2, 1, 0, 0, 0, 3, 0, 4, 4, 2, 1, 0, 0, 0, 3, 3, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 2, 3, 4, 0, 1, 3, 0, 0, 2, 1, 3, 0, 0, 

In [31]:
short_text_classification = { 'cluster': clusters, 'categories': categories }

frame = pd.DataFrame(short_text_classification, index = [clusters] , columns = ['cluster', 'categories'])

In [32]:
frame[:10]

  def _ipython_display_formatter_default(self):
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


Unnamed: 0,cluster,categories
0,0,1
0,0,4
0,0,1
0,0,1
3,3,1
0,0,1
0,0,1
2,2,5
4,4,4
0,0,3


In [33]:
frame['cluster'].value_counts()


0    267
3    108
2     91
1     77
4     56
Name: cluster, dtype: int64

In [53]:
frame['categories'].value_counts()

1    237
4    190
5     95
2     51
3     26
Name: categories, dtype: int64

In [54]:
frame[frame.categories==1]['cluster'].value_counts()


0    104
1     47
3     44
2     42
Name: cluster, dtype: int64

In [46]:
frame[frame.categories==1]['cluster'].value_counts()

0    104
1     47
3     44
2     42
Name: cluster, dtype: int64

In [47]:
frame[frame.categories==2]['cluster'].value_counts()


0    24
1    13
2     8
3     6
Name: cluster, dtype: int64

In [49]:
frame[frame.categories==3]['cluster'].value_counts()

0    11
3     6
2     6
1     3
Name: cluster, dtype: int64

In [50]:
frame[frame.categories==4]['cluster'].value_counts()

0    82
4    56
3    26
2    19
1     7
Name: cluster, dtype: int64

In [51]:
frame[frame.categories==5]['cluster'].value_counts()

0    46
3    26
2    16
1     7
Name: cluster, dtype: int64

In [52]:
frame['categories'].value_counts()

1    237
4    190
5     95
2     51
3     26
Name: categories, dtype: int64