In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [2]:
df_train = pd.read_csv('labeledTrainData.tsv',sep='\t')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [3]:
df_train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
df_train.shape

(25000, 3)

In [5]:
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
stop_words = text.ENGLISH_STOP_WORDS.union(punc)

In [6]:
desc = df_train['review'].values

In [15]:
vectorizer = TfidfVectorizer(stop_words = stop_words)
X = vectorizer.fit_transform(desc)

In [18]:
print(X.shape)

(25000, 74538)


In [7]:
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

In [14]:
vectorizer3 = TfidfVectorizer(stop_words = stop_words, tokenizer = tokenize, max_features = 10)
X3 = vectorizer3.fit_transform(desc)
words = vectorizer3.get_feature_names()
print(len(words))
print(words[:50])


  'stop_words.' % sorted(inconsistent))


10
['bad', 'best', 'better', "don't", 'good', 'great', 'love', 'realli', 'want', 'watch']


In [13]:
# cachedStopWords.update(('and','I','A','And','So','arnt','This','When','It','many','Many','so','cant','Yes','yes','No','no','These','these'))
stop_words = stop_words.union(('act', 'actor', 'ani', 'becaus', 'br', 'charact', 'come', 'did', 'doe', 'end', 'feel', 'film' , 'just', 'know', 'life', 'like', 'littl', 'look', 'make', 'man', 'mani', 'movi', 'onli', 'peopl', 'perform', 'play', 'plot', 'say', 'scene', 'seen', 'stori', 'thing', 'think', 'time', 'tri', 'veri', 'way', 'whi', 'work', 'year'))
stop_words

frozenset({'!',
           '"',
           '%',
           "'",
           '(',
           ')',
           ',',
           '.',
           ':',
           ';',
           '?',
           '[',
           ']',
           'a',
           'about',
           'above',
           'across',
           'act',
           'actor',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'ani',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           '

In [61]:
word2 = ['bad','good','great']

In [35]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 2, init="k-means++",n_init = 20, n_jobs = 2,max_iter=300) # n_init(number of iterations for clsutering) n_jobs(number of cpu cores to use)
kmeans.fit(X3)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=20, n_jobs=2, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [33]:
print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer3.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :4]:
        print(' %s' % terms[ind]),
    print

print("\n")

Top terms per cluster:
Cluster 0:
 good
 realli
 great
 bad
Cluster 1:
 watch
 good
 don't
 realli




In [21]:
# We look at 2 the clusters generated by k-means.
# common_words = kmeans.cluster_centers_.argsort()[:,-1:-26:-1]
common_words = kmeans.cluster_centers_.argsort()[:,::-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

0 : good, realli, great, bad, love, don't, best, want, better, watch
1 : watch, good, don't, realli, bad, want, love, great, better, best


In [16]:
kmeans.inertia_

15718.681981145695

In [17]:
y_pred = kmeans.labels_

In [18]:
y_true = df_train['sentiment'].values

In [19]:
from sklearn import metrics
print(metrics.accuracy_score(y_true,y_pred))
print(metrics.confusion_matrix(y_true,y_pred))
print(metrics.f1_score(y_true,y_pred))

0.48228
[[9103 3397]
 [9546 2954]]
0.31340512439658375


In [54]:
y_pred.sum()

10459

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'I love you.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names()

print(vectorizer.get_feature_names())

print(X.shape)

['document', 'first', 'is', 'love', 'second', 'the', 'this', 'you']
(4, 8)


In [27]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters = 2, init="random",n_init = 20, n_jobs = 2,max_iter=300) # n_init(number of iterations for clsutering) n_jobs(number of cpu cores to use)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=2, n_init=20, n_jobs=2, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [29]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind]),
    print

print("\n")

Top terms per cluster:
Cluster 0:
 you
 love
 this
 the
 second
Cluster 1:
 document
 this
 the
 is
 first




In [51]:
kmeans.labels_

array([0, 0, 1, 0])

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

print("\n")
print("Prediction")

Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

Top terms per cluster:
Cluster 0:
 cat
 best
 climbing
 ninja
 ve
 photo
 taken
 belly
 merley
 kitten
Cluster 1:
 google
 feedback
 map
 app
 impressed
 incredible
 translate
 key
 extension
 chrome


Prediction
[1]
[0]
