In [1]:
import pickle
from datetime import datetime
import os
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

import numpy as np 
import pandas as pd
import gensim
import nltk.data

from gensim.models import word2vec   # for model training
from gensim.models import Word2Vec   # for model loading

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
# Read in data
all_data_pkl = open("C:\\Users\\YWang\\Desktop\\plg\\3rd meeting_0908\\all_data.pkl","rb")
all_data = pickle.load(all_data_pkl)
all_data[['call','ex_tag']] = all_data[['call','ex_tag']].astype('int')

all_data[:3]

Unnamed: 0,id,call,ex_tag,txt
0,CTRL000000096,0,0,Subject: FW: Quality Moves to utilize on your ...
1,CTRL000000097,0,0,"Subject: Schedule confirmation From: guzman, M..."
2,CTRL000000099,0,0,"Subject: Emergency Meeting From: ""williams iii..."


In [3]:
data_no_extag = all_data[all_data['ex_tag']==0]

print(data_no_extag.shape)
print("Positive %: ",sum(data_no_extag['call'])/data_no_extag.shape[0])
data_no_extag[:3]

(13418, 4)
Positive %:  0.15270532121


Unnamed: 0,id,call,ex_tag,txt
0,CTRL000000096,0,0,Subject: FW: Quality Moves to utilize on your ...
1,CTRL000000097,0,0,"Subject: Schedule confirmation From: guzman, M..."
2,CTRL000000099,0,0,"Subject: Emergency Meeting From: ""williams iii..."


# Outline
### 1. High-level intuition of word2vec models with a cooked-up example
### 2. Code walk-thru

#### Preprocessing
1. Parse raw text to sentences    

2. For each sentence, separate into words -- need a context around the word to optimize word vector
    
doc 

--> [[sentence_1], [sentence_2], ..., [sentence_n]] 

--> [[word_1 of sentence_1, ..., word_m of sentence_1], ..., [word_1 of sentence_n, ..., word_s of sentence_n]]

In [4]:
# function to preprocess the text of the documents
def txt_to_words(raw_txt, remove_stopwords):
    
    # Remove HTML markup
    txt_text = BeautifulSoup(raw_txt,"html.parser").get_text() 
    
    # Remove non-alphabets and doc starter
    letters_only = re.sub("[^a-zA-Z]", " ", txt_text).replace(u'\ufeff','')
    
    # To lower case
    letters_only_lower_word_list = letters_only.lower().split()
    
    # Remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        letters_only_lower_word_list = [w for w in letters_only_lower_word_list if not w in stops]
    
    return letters_only_lower_word_list

In [5]:
# Function to split a doc into parsed sentences
# Returns a list of sentences, where each sentence is a list of words
def doc_to_sentences(txt, tokenizer, remove_stopwords=True):
    
    # 1. Initialize a tokenizer
    paragraph = tokenizer.tokenize(txt.strip())
    
    # 2. Parse each sentence into separate words
    sentences = []
    for sentence in paragraph:
        if len(sentence) > 0:            
            sentences.append(txt_to_words(sentence, remove_stopwords))
    
    return sentences

In [6]:
# Disable uselss warnings from Beautiful soup
import warnings
warnings.filterwarnings("ignore")

# Use the NLTK tokenizer to split the paragraph into sentences
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# timing
t0 = datetime.today()
sentences = []  # Initialize an empty list of sentences

for txt in data_no_extag['txt']:
    sentences += doc_to_sentences(txt, sent_tokenizer, remove_stopwords=False)

# timing
print (datetime.today() - t0)

print (len(sentences))
sentences

0:06:30.482032
1085581


[['subject',
  'fw',
  'quality',
  'moves',
  'to',
  'utilize',
  'on',
  'your',
  'next',
  'date',
  'from',
  'alport',
  'kysa',
  'date',
  'thu',
  'nov',
  'to',
  'meyers',
  'bert',
  'original',
  'message',
  'from',
  'driscoll',
  'michael',
  'm',
  'sent',
  'wednesday',
  'november',
  'am',
  'to',
  'alport',
  'kysa',
  'subject',
  'fw',
  'quality',
  'moves',
  'to',
  'utilize',
  'on',
  'your',
  'next',
  'date',
  'original',
  'message',
  'from',
  'pete',
  'mehok',
  'mailto',
  'pm',
  'navistaff',
  'com',
  'sent',
  'friday',
  'november',
  'am',
  'to',
  'walls',
  'peeps',
  'scooter',
  'babar',
  'blotter',
  'anthony',
  'coz',
  'byrnes',
  'cibs',
  'cooter',
  'doyle',
  'driscoll',
  'michael',
  'm',
  'ferne',
  'fellrath',
  'fonz',
  'gimp',
  'hoss',
  'juice',
  'kiwi',
  'knute',
  'klondike',
  'kuna',
  'moss',
  'paps',
  'rammer',
  'rosie',
  'royer',
  'russ',
  'ryb',
  'saylor',
  'scarecrow',
  'spider',
  'sweetj',
  've

In [7]:
sentences[20:25]

[['aka', 'watersports'],
 ['pearl', 'necklace', 'well', 'known'],
 ['whenever',
  'you',
  'cum',
  'on',
  'the',
  'neck',
  'cleavage',
  'area',
  'of',
  'a',
  'girl',
  'it',
  'takes',
  'on',
  'the',
  'look',
  'of',
  'beautiful',
  'jewelry'],
 [],
 ['coyote',
  'this',
  'occurs',
  'when',
  'you',
  'wake',
  'up',
  'in',
  'the',
  'room',
  'of',
  'a',
  'nasty',
  'skank',
  'and',
  'you',
  'know',
  'you',
  've',
  'got',
  'to',
  'give',
  'her',
  'the',
  'slip']]

word2vec with the gensim module

In [7]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 50   # Minimum word count                        
num_workers = 2       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
sg = 1                # Default to use CBOW; sg = 1 for skip-gram

# timing
t0 = datetime.today()

model = word2vec.Word2Vec(sentences, 
                          workers = num_workers, 
                          size=num_features, 
                          min_count = min_word_count, 
                          window = context, 
                          sample = downsampling)

# timing
print (datetime.today() - t0)
# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

model_name = "sg_300features_50minwords_10context"
model.save(model_name)

0:04:23.454000


#### The model has semantic understanding!!

In [10]:
print("Most similar to 'REALLY':")
model.most_similar("really")

Most similar to 'REALLY':


[('quite', 0.5875177383422852),
 ('pretty', 0.5772615671157837),
 ('obviously', 0.533632218837738),
 ('watching', 0.5163030028343201),
 ('maybe', 0.5152464509010315),
 ('very', 0.5129753351211548),
 ('idea', 0.5014631748199463),
 ('interesting', 0.4983963370323181),
 ('something', 0.4907233417034149),
 ('thinking', 0.48977118730545044)]

In [11]:
print("Most similar to 'FOOTBALL':")
model.most_similar("football")

Most similar to 'FOOTBALL':


[('baseball', 0.5933674573898315),
 ('basketball', 0.5336013436317444),
 ('sports', 0.5090859532356262),
 ('nba', 0.5071444511413574),
 ('championship', 0.5032073259353638),
 ('stadium', 0.49982523918151855),
 ('espn', 0.48447105288505554),
 ('longhorn', 0.462421715259552),
 ('recruiting', 0.4622121751308441),
 ('bowl', 0.4613615870475769)]

In [12]:
model.doesnt_match("win sportsline nfl note".split())

'note'

#### Represent docs with learnt word vectors

In [13]:
# Load trained_model
model = Word2Vec.load("sg_300features_50minwords_10context")

# index2word contains all the model vocabulary
print (len(model.index2word))

# syn0 contains vector representation for each word
print (model.syn0.shape)

18268
(18268, 300)


In [105]:
# Function to average all of the word vectors in a doc
def makeFeatureVec(words, model, num_features):
    
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,))
    
    # Total words counter
    nwords = 0.
     
    # Convert to set, for speed 
    index2word_set = set(model.index2word)
    
    # Count words in a doc
    # Sum up word vectors for a doc
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])           
    
    # Calculate average word vector
    # If non of the words in a doc is in the vocabulary, avoid 0 in denominator
    if nwords == 0:
        nwords = 1
        
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [106]:
# Function to loop thru all docs to get average word vector
def getAvgFeatureVecs(docs, model, num_features):
    
    # Initialize a counter
    counter = 0.
    
    # Preallocate 13418 * 300 space for speed
    docsFeatureVecs = np.zeros((len(docs),num_features))
     
    # Loop through the docs
    for doc in docs:
       
       # Print a status message every 1000th review
        if counter%1000. == 0.:
            print ("Doc %d of %d" % (counter, len(docs)))
            
        # calculate feature vector for each doc
        docsFeatureVecs[counter] = makeFeatureVec(doc, model, num_features)
       
        # Increment the counter
        counter = counter + 1.
    return docsFeatureVecs

In [107]:
wordlist_nostop = []
for txt in data_no_extag['txt']:
    wordlist_nostop.append(txt_to_words(txt, remove_stopwords=True))
wordvec = getAvgFeatureVecs(wordlist_nostop, model, num_features)
wordvec_df = pd.DataFrame(wordvec)

Doc 0 of 13418
Doc 1000 of 13418
Doc 2000 of 13418
Doc 3000 of 13418
Doc 4000 of 13418
Doc 5000 of 13418
Doc 6000 of 13418
Doc 7000 of 13418
Doc 8000 of 13418
Doc 9000 of 13418
Doc 10000 of 13418
Doc 11000 of 13418
Doc 12000 of 13418
Doc 13000 of 13418


#### Train the model

In [108]:
# Train text split
X_train, X_test, y_train, y_test = train_test_split(wordvec_df, data_no_extag['call'], test_size = 0.95, random_state = 1)

In [118]:
wordvec_df.shape

(13418, 300)

In [110]:
rf = RandomForestClassifier(n_estimators = 200)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)


In [111]:
# A helper function to get metrics
def get_metrics(y_truth, y_predicted, p_label, return_flag = False):
    confusion_matrix = metrics.confusion_matrix(y_truth, y_predicted, labels = [1,0])
    accuracy = round((metrics.accuracy_score(y_truth, y_predicted)), 2)
    precision = round((metrics.precision_score(y_truth, y_predicted, pos_label = p_label, average = 'binary')), 2)
    recall = round((metrics.recall_score(y_truth, y_predicted, pos_label = p_label, average = 'binary')), 2)
    
    if(return_flag):
        return(pd.DataFrame([accuracy,precision,recall]).transpose())
    else:
        print ("Confusion Matrix:")
        print (confusion_matrix)
        print ()
        print ("Accuracy: %f;\nPrecision: %f;\nRecall: %f;" %(accuracy, precision, recall))

In [114]:
get_metrics(y_test,y_pred,p_label = 1)

Confusion Matrix:
[[ 1041   916]
 [   25 10766]]

Accuracy: 0.930000;
Precision: 0.980000;
Recall: 0.530000;


In [116]:
y_pred = pd.DataFrame(y_pred, index = y_test.index)
y_prob = pd.DataFrame(rf.predict_proba(X_test), index = y_test.index)
result = pd.merge(y_pred, y_prob, left_index=True, right_index=True)
result.columns = ["w2v_Predicted Label", "w2v_P(Call=0)", "w2v_P(Call=1)"]
result[:3]

Unnamed: 0,w2v_Predicted Label,w2v_P(Call=0),w2v_P(Call=1)
94738,0,0.995,0.005
67935,0,1.0,0.0
91131,0,0.76,0.24


In [117]:
temp = pd.concat([data_no_extag,result], axis = 1, join = 'outer')
temp.to_csv("output_092816.txt", header = True, index = True, encoding = 'utf-8')

#### TakeAways:
#### 1. Separate process of learning word representation and model training 
#### 2. Seems to develop some semantic understanding of your corpus

#### Next steps:
#### 1. Compare word2vec with bag-of-word type of models
#### 2. Parameter tweaking and process optimization