In [45]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup  
from nltk.corpus import stopwords 

# All three files are tab-separated with a header row. quoting=3 (csv.QUOTE_NONE)
# makes the parser treat quote characters inside the reviews as ordinary text.
train = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
test = pd.read_csv(
    "testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
# This set contains no labels; the extra reviews are used to better learn word context.
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)


In [46]:
def clean_review(review):
    """Turn a raw review (or a single sentence of one) into a list of lowercase words.

    Stop words are deliberately kept here (unlike in part 1), since they can
    capture semantic meaning that is useful when learning word embeddings.

    Args:
        review: Raw text, possibly containing HTML markup.

    Returns:
        list of str: The lowercase, whitespace-separated tokens that remain
        after markup is stripped and every non-letter character is replaced by
        a space.
    """
    # Name the parser explicitly: without it BeautifulSoup guesses one from
    # whatever happens to be installed and emits a warning, so the extracted
    # text could differ from one environment to another.
    text = BeautifulSoup(review, "html.parser").get_text()
    # Keep letters only; digits and punctuation become token separators.
    text = re.sub("[^a-zA-Z]", " ", text)
    return text.lower().split()

stops = stopwords.words("english")
# Frozen copy used for O(1) membership tests; `stops` itself is left as the
# original list so anything else that reads it keeps working.
_stop_set = frozenset(stops)


def clean_review2(review):
    """Tokenize a raw review like ``clean_review`` and drop English stop words.

    Args:
        review: Raw text, possibly containing HTML markup.

    Returns:
        list of str: The tokens produced by ``clean_review`` that are not in
        the stop-word list ``stops``.
    """
    # Reuse the shared cleaning step instead of repeating it line for line.
    return [word for word in clean_review(review) if word not in _stop_set]


In [17]:
import nltk.data
# Word2Vec requires its input to be a list of sentences, where every sentence is a list of words.
# So every text is represented as a list of lists.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # punkt tokenizer, used to split text into sentences

def get_clean_sentences(review):
    """Split a raw review into sentences and clean each one into a word list.

    Args:
        review: Raw review text.

    Returns:
        list of list of str: One ``clean_review`` result per non-empty
        sentence found by the module-level ``tokenizer``, in order.
    """
    return [
        clean_review(sentence)
        for sentence in tokenizer.tokenize(review)
        if len(sentence) > 0
    ]
            


In [None]:
# Training corpus for Word2Vec: the cleaned sentences of every labeled review
# followed by those of every unlabeled review.
sentences = []
for frame in (train, unlabeled_train):
    for review in frame["review"]:
        sentences.extend(get_clean_sentences(review))


In [25]:
import logging
from gensim.models import word2vec

# Send gensim's training progress messages to the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Training hyperparameters.
# Word vector dimensionality
num_features = 300
# Minimum word count: rarer words are left out of the vocabulary
min_word_count = 40
# Number of threads to run in parallel
num_workers = 4
# Context window size
context = 10
# Downsample setting for frequent words
downsampling = 1e-3

model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)  # uses cython

# NOTE(review): per the gensim 3.x docs, replace=True keeps only the
# L2-normalized vectors, so the model cannot be trained further afterwards.
model.init_sims(replace=True)

2019-11-24 09:36:27,948 : INFO : collecting all words and their counts
2019-11-24 09:36:27,949 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-24 09:36:28,067 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2019-11-24 09:36:28,148 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2019-11-24 09:36:28,218 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2019-11-24 09:36:28,292 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2019-11-24 09:36:28,366 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2019-11-24 09:36:28,439 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2019-11-24 09:36:28,522 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2019-11-24 09:36:28,613 : INFO : PROGRESS: 

2019-11-24 09:36:34,275 : INFO : PROGRESS: at sentence #720000, processed 16105489 words, keeping 118221 word types
2019-11-24 09:36:34,356 : INFO : PROGRESS: at sentence #730000, processed 16331870 words, keeping 118954 word types
2019-11-24 09:36:34,466 : INFO : PROGRESS: at sentence #740000, processed 16552903 words, keeping 119668 word types
2019-11-24 09:36:34,582 : INFO : PROGRESS: at sentence #750000, processed 16771230 words, keeping 120295 word types
2019-11-24 09:36:34,732 : INFO : PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2019-11-24 09:36:34,814 : INFO : PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2019-11-24 09:36:34,889 : INFO : PROGRESS: at sentence #780000, processed 17447905 words, keeping 122402 word types
2019-11-24 09:36:34,953 : INFO : PROGRESS: at sentence #790000, processed 17674981 words, keeping 123066 word types
2019-11-24 09:36:34,988 : INFO : collected 123504 word types from a corp

2019-11-24 09:37:24,384 : INFO : EPOCH 3 - PROGRESS: at 11.89% examples, 500248 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:25,396 : INFO : EPOCH 3 - PROGRESS: at 16.56% examples, 520264 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:26,406 : INFO : EPOCH 3 - PROGRESS: at 21.29% examples, 535066 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:27,408 : INFO : EPOCH 3 - PROGRESS: at 25.80% examples, 541013 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:28,415 : INFO : EPOCH 3 - PROGRESS: at 29.98% examples, 539724 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:29,426 : INFO : EPOCH 3 - PROGRESS: at 34.65% examples, 544708 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:30,438 : INFO : EPOCH 3 - PROGRESS: at 39.48% examples, 551737 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:31,446 : INFO : EPOCH 3 - PROGRESS: at 44.20% examples, 556923 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:37:32,447 : INFO : EPOCH 3 - PROGRESS: at 48.87% examples, 560819 words/s, in_qsiz

2019-11-24 09:38:30,745 : INFO : EPOCH 5 - PROGRESS: at 84.67% examples, 505372 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:38:31,752 : INFO : EPOCH 5 - PROGRESS: at 89.37% examples, 509473 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:38:32,757 : INFO : EPOCH 5 - PROGRESS: at 94.15% examples, 513590 words/s, in_qsize 7, out_qsize 0
2019-11-24 09:38:33,768 : INFO : EPOCH 5 - PROGRESS: at 98.77% examples, 516646 words/s, in_qsize 7, out_qsize 1
2019-11-24 09:38:33,984 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-11-24 09:38:34,015 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-11-24 09:38:34,018 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-11-24 09:38:34,019 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-11-24 09:38:34,020 : INFO : EPOCH - 5 : training on 17798082 raw words (12749586 effective words) took 24.6s, 517707 effective words/s
2019-11-24 09:38:34,023 : INFO : training on

In [26]:
# Let's examine the model: which of these words belongs least with the others?
# Query through `model.wv`; the same method on the Word2Vec object itself is
# deprecated in gensim 3.x and triggers a DeprecationWarning.
model.wv.doesnt_match("man woman children kitchen".split())

  This is separate from the ipykernel package so we can avoid doing imports until
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

In [31]:
# Query through `model.wv`; the model-level method is deprecated in gensim 3.x.
model.wv.doesnt_match("sweden norway denmark london".split())

  """Entry point for launching an IPython kernel.


'london'

In [33]:
# Nearest neighbours of "dog" in the embedding space, queried through `model.wv`
# because the model-level method is deprecated in gensim 3.x.
model.wv.most_similar("dog")

  """Entry point for launching an IPython kernel.


[('puppy', 0.6654437780380249),
 ('cat', 0.6308108568191528),
 ('pet', 0.6294919848442078),
 ('bird', 0.6013981699943542),
 ('chicken', 0.5890455842018127),
 ('horse', 0.569918692111969),
 ('monkey', 0.5429009795188904),
 ('goat', 0.5336949229240417),
 ('bike', 0.5272098183631897),
 ('rat', 0.5260271430015564)]

In [None]:
# The 300-dimensional embedding of the word "dog". Indexing the Word2Vec object
# directly is deprecated in gensim 3.x, so look the word up on `model.wv`.
model.wv["dog"]

In [52]:
# Let's represent a review by the average of its word vectors. Since each vector captures the meaning
# of a word, hopefully averaging over the whole review gives us the "meaning" of the review.
# We will remove stop words first, as they are basically just noise here.

# Set of the words the model has an embedding for, used for membership tests.
vocabulary = set(model.wv.index2word)

def get_feature_vector(review):
    """Return the average word vector of a raw review.

    The review is tokenized with ``clean_review2`` (stop words removed) and the
    embeddings of all tokens found in ``vocabulary`` are averaged.

    Args:
        review: Raw review text.

    Returns:
        numpy.ndarray: 1-D array of length ``num_features``. If none of the
        review's tokens are in the vocabulary, the zero vector is returned
        instead of dividing by zero and producing NaNs.
    """
    words = clean_review2(review)

    vector = np.zeros(num_features)
    count = 0
    for word in words:
        # Not all words are in the vocabulary: the model only keeps words that
        # occur at least min_word_count (40) times.
        if word in vocabulary:
            # Look the vector up on `model.wv`; indexing the model directly is
            # deprecated in gensim 3.x.
            vector += model.wv[word]
            count += 1

    if count == 0:
        # Nothing to average; `vector` is still all zeros.
        return vector
    return vector / count
            

In [59]:
def _review_matrix(reviews):
    """Build a matrix with one row per review, each row being its average word vector."""
    matrix = np.zeros((len(reviews), num_features))
    for row, text in enumerate(reviews):
        matrix[row] = get_feature_vector(text)
    return matrix


feature_vectors = _review_matrix(train["review"])
test_feature_vectors = _review_matrix(test["review"])
    

  from ipykernel import kernelapp as app


In [60]:
#This solution is similar to the one in part 1. When we average the words we lose
#the order of the words which surely matters.
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest, and therefore the submission file, is the same
# on every run of the notebook.
RFC = RandomForestClassifier(n_estimators=200, random_state=42)
RFC.fit(feature_vectors, train["sentiment"])

preds = RFC.predict(test_feature_vectors)

# quoting=3 (csv.QUOTE_NONE) writes the values as they are, without adding quote characters.
submit = pd.DataFrame({"id": test["id"], "sentiment": preds})
submit.to_csv("part2.csv", index=False, quoting=3)