In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline
from nltk.corpus import stopwords

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv('labeledTrainData.tsv',sep='\t')
train_df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
import string
getPunctuations = str.maketrans("","",string.punctuation) 

from nltk.corpus import stopwords
stop = stopwords.words("english")
train_df['review'] = train_df['review'].apply(lambda x: x.lower())
train_df['review'] = train_df['review'].apply(lambda x: x.translate(getPunctuations))
train_df['review'] = train_df['review'].apply(lambda x: x.split())
train_df['review'] = train_df['review'].apply(lambda x: [item for item in x if item not in stop])
train_df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,"[stuff, going, moment, mj, ive, started, liste..."
1,2381_9,1,"[classic, war, worlds, timothy, hines, enterta..."
2,7759_3,0,"[film, starts, manager, nicholas, bell, giving..."
3,3630_4,0,"[must, assumed, praised, film, greatest, filme..."
4,9495_8,1,"[superbly, trashy, wondrously, unpretentious, ..."


In [4]:
X_train = train_df['review'].values
sentences = list(X_train)

In [4]:
from gensim.models import Word2Vec
model = Word2Vec(sentences, min_count=1)

Now we have model with words embedded. We can query model for similar words like below or ask to represent word as vector:

In [5]:
#to get similarity between words
print(model.similarity('good','great'))

print (model.most_similar(positive=['movie'], negative=[], topn=2))

#get the words most similar to given word in our data
ms=model.most_similar('film')
for x in ms:
    print (x[0],x[1])

0.7405692
[('film', 0.8293483257293701), ('moviebr', 0.8118131160736084)]
movie 0.8293483257293701
filmbr 0.762588381767273
films 0.7148263454437256
moviebr 0.688957691192627
documentary 0.6654255390167236
picture 0.6549851894378662
cinema 0.6480210423469543
product 0.6471505761146545
experience 0.6226150989532471
certainly 0.618864119052887


  
  after removing the cwd from sys.path.
  import sys


To get vocabulary or the number of words in vocabulary:

In [6]:
# print(list(model.wv.vocab))
print (len(list(model.wv.vocab)))

121223


Now we will feed word embeddings into clustering algorithm such as k Means which is one of the most popular unsupervised learning algorithms for finding interesting segments in the data. It can be used for separating customers into groups, combining documents into topics and for many other applications.

#### Note:
preparing data for kmeans clustering (converting each sentence into vectors, so that clustering occurs based on sentences). We are taking sum of vectors of each word and averaging over length of sentence. even though aeraging word vectors doesn't have any meaning, we are getting accuracy of 63%. We should find a way to use original word embeddings for text clustering.

In [None]:
def sent_vectorizer(sent, model):
    sent_vec =[]
    numw = 0
    for w in sent:
        try:
            if numw == 0:
                sent_vec = model[w]
            else:
                sent_vec = np.add(sent_vec, model[w])
            numw+=1
        except:
            pass
     
    return np.asarray(sent_vec) / numw
  
  
X=[]
for sentence in sentences:
    X.append(sent_vectorizer(sentence, model))   
 
print ("========================")

In [18]:
print (len(X[0]))

100


#### K Means Clustering with Scikit-learn Library

This example is based on k means from scikit-learn library.

In [8]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2,n_jobs = 2,init = "k-means++",n_init=20,random_state=42)
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=20, n_jobs=2, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

In [11]:
y_pred = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (len(y_pred))

# print("Top terms per cluster:")
# order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# terms = list(model.wv.vocab)
# for i in range(2):
#     print("Cluster %d:" % i),
#     for ind in order_centroids[i, :10]:
#         print(' %s' % terms[ind]),
#     print
    
# print("\n")

Cluster id labels for inputted data
25000


#### K Means Clustering with NLTK Library

In [19]:
from nltk.cluster import KMeansClusterer
import nltk
NUM_CLUSTERS=2
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
print (len(assigned_clusters))

25000


In [23]:
y_pred = []
for index, sentence in enumerate(sentences[:10]):    
    print (str(assigned_clusters[index]) + ":" + str(sentence))
    y_pred.append(assigned_clusters[index])
#     print(index),

0:['stuff', 'going', 'moment', 'mj', 'ive', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mjs', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'mkaybr', 'br', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'himbr', 'br', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', '20', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psy

In [46]:
y_true = train_df['sentiment'].values
from sklearn import metrics
print(metrics.accuracy_score(y_true,y_pred))
print(metrics.confusion_matrix(y_true,y_pred))
print(metrics.f1_score(y_true,y_pred))

0.50088
[[12491     9]
 [12469    31]]
0.0049441786283891545


In [14]:
y_pred[:10]

array([0, 1, 1, 1, 1, 0, 0, 1, 1, 1])

In [25]:
y_true[:10]

array([1, 1, 0, 0, 1, 1, 0, 0, 0, 1], dtype=int64)

#### doc2vec 

since averaging word vectors doesn't have any geometric meaning, I'm trying doc2vec

In [20]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 
print (common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [21]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
 
print (documents)

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]), TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]), TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]), TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]), TaggedDocument(words=['user', 'response', 'time'], tags=[4]), TaggedDocument(words=['trees'], tags=[5]), TaggedDocument(words=['graph', 'trees'], tags=[6]), TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]), TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]


In [22]:
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [23]:
#Persist a model to disk:
 
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("my_doc2vec_model")
 
print (fname)

C:\Users\coe04\AppData\Local\Temp\my_doc2vec_model


In [26]:
#load model from saved file
model.save(fname)
model = Doc2Vec.load(fname)  

In [36]:
# # you can continue training with the loaded model!
# #If you’re finished training a model (=no more updates, only querying, reduce memory usage), you can do:
 
# model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
 
# #Infer vector for a new document:
# #Here our text paragraph just 2 words
# vector = model.infer_vector(["system", "response"])
# print (vector)
 
# """
# output
 
# [-0.08390492  0.01629403 -0.08274432  0.06739668 -0.07021132]
  
#  """

In [27]:
from sklearn import metrics
 
import gensim.models as g
import codecs 

In [28]:
#inference hyper-parameters
start_alpha=0.01
infer_epoch=1000

In [38]:
#load model
# m = Doc2Vec.load(model)

print (sentences[:10])

[['stuff', 'going', 'moment', 'mj', 'ive', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mjs', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'mkaybr', 'br', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'himbr', 'br', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', '20', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psyc

In [40]:
X=[]
for d in sentences:
    X.append( model.infer_vector(d, alpha=start_alpha, steps=infer_epoch) )

In [41]:
k=2
 
from sklearn.cluster import Birch
 
brc = Birch(branching_factor=50, n_clusters=k, threshold=0.1, compute_labels=True)
brc.fit(X)

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=2,
   threshold=0.1)

In [45]:
clusters = brc.predict(X)
 
y_pred = brc.labels_
 
 
print ("Clusters: ")
print (len(clusters))
 
 
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
print ("Silhouette_score: ")
print (silhouette_score)

Clusters: 
25000
Silhouette_score: 
0.76188165


In [44]:
len(labels)

25000

In [30]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [31]:
data = ["I love machine learning. Its awesome.",
        "I love coding in python",
        "I love building chatbots",
        "they chat amagingly well"]

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [32]:
tagged_data

[TaggedDocument(words=['i', 'love', 'machine', 'learning', '.', 'its', 'awesome', '.'], tags=['0']),
 TaggedDocument(words=['i', 'love', 'coding', 'in', 'python'], tags=['1']),
 TaggedDocument(words=['i', 'love', 'building', 'chatbots'], tags=['2']),
 TaggedDocument(words=['they', 'chat', 'amagingly', 'well'], tags=['3'])]

In [33]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

  segments.append('"%s"' % self.comment)


iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [34]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)


# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

V1_infer [ 0.01357531  0.00292198 -0.01307619 -0.00687181 -0.0229191   0.00622976
 -0.02385645 -0.00078574 -0.01658066  0.00437424 -0.00253913  0.00128016
  0.01728363  0.0206915  -0.00381131  0.03159789  0.01927276  0.01060365
 -0.0030602   0.01160008]
[('0', 0.993676483631134), ('2', 0.9932615756988525), ('3', 0.9922387599945068)]
[-0.01233657  0.06075326  0.04266645  0.20951614 -0.23930328  0.13669424
  0.02943046  0.3629795   0.3319709  -0.29642266  0.12508218 -0.05567428
  0.07741094  0.17015728 -0.1505286   0.45622203  0.38881418 -0.47629637
 -0.3589217  -0.3327939 ]
