# CS174B: Word2Vec Lesson

In [1]:
import re, nltk, gensim
import pandas as pd
from nltk.corpus import stopwords
from gensim.models import Word2Vec



Load tweets sentiment dataset

In [5]:
df = pd.read_csv('data/airline-sentiment.csv',  encoding = 'unicode_escape')

In [8]:
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


Preprocess the text to do the following:

-Normalize every word to lower case.

-Remove punctuation and retain only numbers and alphabets.

-Remove stop words

In [11]:
stop = set(stopwords.words('english'))
def preprocess(text):
    text=text.lower()
    text=re.sub('[^0-9a-z]+',' ',text)
    split = text.split()
    stopped = [i for i in split if i not in stop]
    joined=' '.join(stopped)
    return(joined)

In [12]:
df['text'] = df['text'].apply(preprocess)

Get rid of the unnecessary columns (for now)

In [14]:
df = df[["airline_sentiment", "text"]]

In [15]:
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn said
1,positive,virginamerica plus added commercials experienc...
2,neutral,virginamerica today must mean need take anothe...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing


Create a list of words similar to the TFIDF exercise

In [18]:
wordlist=[]
for i in range(len(df)):
    wordlist.append(df['text'][i].split())

In [19]:
wordlist[:3]

[['virginamerica', 'dhepburn', 'said'],
 ['virginamerica', 'plus', 'added', 'commercials', 'experience', 'tacky'],
 ['virginamerica', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip']]

### Training Time

Build the Word2Vec model. Define the vector size, context window size to look into, and the minimum count of a word for it to be eligible to have a word vector
- size represents the size (dimension) of word vectors.
- window represents the context size of words that would be considered.
- min_count specifies the minimum frequency based on which a word is considered.
- sg represents whether skip-gram used (when sg=1) or CBOW (when sg = 0) used.
- alpha is the learning rate (which we'll discuss next week on neural nets proper)


```
Other papers did not report an experiment on embedding dimension size. They are all using an arbitrary dimension on the order of hundreds (100 and 300 are used more frequently). The lack of experiments for embedding size implies that the performance is not very sensitive to this parameter and only the order of magnitude matters, and also other aspects of the model architecture are more important to investigate.
```

In [20]:
model = Word2Vec(size=100,window=5,min_count=30, sg=0, alpha = 0.025)

In [21]:
model.build_vocab(wordlist)
model.corpus_count


14640

In [23]:
model.wv.vocab.keys()

dict_keys(['virginamerica', 'said', 'plus', 'experience', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip', 'really', 'amp', 'little', 'big', 'bad', 'thing', 'seriously', 'would', 'pay', '30', 'flight', 'seats', 'flying', 'yes', 'every', 'time', 'fly', 'go', 'away', 'missed', 'without', 'https', 'co', 'well', 'amazing', 'arrived', 'hour', 'early', 'good', 'know', 'second', 'cause', '10', '24', 'lt', '3', 'pretty', 'much', 'better', 'great', 'deal', 'already', '2nd', 'even', '1st', 'yet', 'u', 'travel', 'http', 'thanks', 'sfo', 'schedule', 'still', 'mia', 'first', 'country', 'lax', 'mco', 'heard', 'nothing', 'things', 'virgin', 'flew', 'nyc', 'last', 'week', 'sit', 'seat', 'due', 'two', 'either', 'help', 'awesome', 'bos', 'fll', 'please', 'want', 'may', 'three', 'times', 'available', 'love', 'feel', 'making', 'gt', 'las', 'non', 'stop', 'soon', 'guys', 'friends', 'gave', 'free', 'status', 'weeks', 'called', 'response', 'happened', '2', 'ur', 'food', 'options', 'least', 'say', 

In [25]:
model.train(wordlist, total_examples=model.corpus_count, epochs=100)

(8555260, 15367300)

Show the wordvector for a word

In [26]:
model['month']

  """Entry point for launching an IPython kernel.


array([ 2.4645264e-01,  1.7522665e+00,  5.3755965e-02,  2.5623145e+00,
       -3.6641023e-01,  1.0455021e+00,  3.1233358e-01, -8.3864528e-01,
        1.6878428e+00, -4.5925853e-01,  1.1221740e+00,  1.5017433e+00,
       -1.1384646e+00, -1.2850329e+00, -4.8522443e-01, -1.0051377e+00,
        1.2366996e+00,  1.0283921e+00,  2.1941327e-03, -3.2236233e-01,
        1.2023031e+00,  1.9695036e+00, -5.1865625e-01, -2.9258964e+00,
        6.1163485e-01,  1.8918989e+00,  1.5186785e+00, -3.4030903e-02,
        6.9862628e-01,  1.4631043e-01,  1.4218019e+00, -3.1243043e+00,
       -6.7166120e-01, -2.3415513e+00,  4.3108252e-01,  2.4061044e-01,
        7.1815002e-01, -3.2479334e+00,  2.4947050e+00, -5.5266297e-01,
        7.9838842e-01,  1.3533766e+00,  8.3027445e-02,  8.5906875e-01,
       -2.8035412e+00,  3.1235674e-01,  1.0476404e+00, -1.8752144e-01,
       -1.3371296e+00, -1.1365536e+00, -6.5343815e-01, -1.3539853e+00,
       -1.8860098e+00, -2.0132370e+00,  2.9466374e+00, -9.1813862e-01,
      

Cosine similarity of words are baked into Word2Vec. Use this as follows.

In [28]:
model.similarity('month','year')

  """Entry point for launching an IPython kernel.


0.5180526

In [33]:
model.most_similar('month')

  """Entry point for launching an IPython kernel.


[('years', 0.9994949698448181),
 ('takes', 0.9994686841964722),
 ('little', 0.9994019865989685),
 ('suck', 0.9993892312049866),
 ('mean', 0.9993865489959717),
 ('zero', 0.9993835091590881),
 ('paying', 0.9993681907653809),
 ('entire', 0.9993562698364258),
 ('fault', 0.9993394017219543),
 ('plus', 0.9993321299552917)]

let's see the output of most similar words to the word "month", when we run the model for a few number of epochs.


This shows that low epoch count leads to bad model fit

In [32]:
model = Word2Vec(size=100,window=5,min_count=30, sg=0)
model.build_vocab(wordlist)
model.train(wordlist, total_examples=model.corpus_count, epochs=5)
model.most_similar('month')

  after removing the cwd from sys.path.


[('years', 0.9994949698448181),
 ('takes', 0.9994686841964722),
 ('little', 0.9994019865989685),
 ('suck', 0.9993892312049866),
 ('mean', 0.9993865489959717),
 ('zero', 0.9993835091590881),
 ('paying', 0.9993681907653809),
 ('entire', 0.9993562698364258),
 ('fault', 0.9993394017219543),
 ('plus', 0.9993321299552917)]