In [15]:
import nltk
import os, sys, re, collections, string
from operator import itemgetter as at
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from ipywidgets import interact
sys.path.append("../python")
import data
%matplotlib inline

In [14]:
from sklearn.feature_extraction import text
X,y = data.preprocessed()
sample_doc = X[0]

## Building a vocabulary by distribution

In [16]:
word_count = collections.Counter()
for x in tqdm(X):
    word_count += collections.Counter(x.split())
total_word_count = sum(word_count.values())
word_count_hist = collections.Counter(word_count.values())
print ("Total word count: "+str(total_word_count))

100%|██████████| 44277/44277 [04:06<00:00, 179.64it/s]

Total word count: 147462785





In [17]:
@interact(lb = (1,10000), ub =(1000,1e7))
def vocab_coverage(lb=10,ub=10000):
    words_covered = sum([wc*n for wc, n in word_count_hist.items() if lb<wc<ub])
    corpus_percentage = words_covered/total_word_count
    vector_size = len([1 for wc, n in word_count_hist.items() if lb<wc<ub])
    return "Corpus Coverage: {c:.2f}%\n Vector Size: {v}".format(c=corpus_percentage*100,v=vector_size)

Term frequency Vector
---

In [18]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
cv = text.CountVectorizer(min_df=1e-06, max_df=0.05)
cv.fit(X[:100])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.05, max_features=None, min_df=1e-06,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [19]:
sample_vec = cv.transform([sample_doc])

Vector to bag of words
---

In [20]:
{k:v for k,v in zip(cv.get_feature_names(), sample_vec.toarray()[0]) if v>0}

{'abingdon': 1,
 'accessing': 1,
 'acknowledgements': 1,
 'administratively': 1,
 'advantage': 1,
 'allow': 1,
 'alpha': 22,
 'alphas': 4,
 'asset': 1,
 'barney': 1,
 'box': 1,
 'cdrom': 1,
 'competitors': 1,
 'comprise': 1,
 'contacts': 1,
 'contested': 1,
 'critical': 1,
 'employmentchange': 1,
 'exception': 1,
 'expedited': 1,
 'familiar': 1,
 'feasible': 1,
 'growth': 1,
 'highly': 1,
 'importance': 1,
 'inactions': 1,
 'incentives': 1,
 'internet': 2,
 'intranet': 1,
 'latest': 1,
 'lawsuits': 1,
 'levels': 1,
 'link': 1,
 'maintaining': 1,
 'measures': 1,
 'messenger': 1,
 'misuse': 1,
 'natural': 2,
 'necessarily': 2,
 'nominees': 1,
 'oppose': 1,
 'paper': 2,
 'performancebased': 1,
 'permanently': 1,
 'po': 1,
 'pricing': 1,
 'procurement': 1,
 'protective': 1,
 'ratio': 1,
 'recipients': 15,
 'recognizes': 1,
 'safeguard': 1,
 'send': 1,
 'significantly': 1,
 'site': 2,
 'sixtieth': 1,
 'smith': 1,
 'strategic': 1,
 'strategies': 1,
 'targets': 1,
 'telecopy': 1,
 'timing': 1

# Stemming

In [21]:
import nltk
from nltk.stem.porter import PorterStemmer

In [22]:
stemmer = PorterStemmer()
stemmer.stem("factorization")

'factor'

In [23]:
analyzer = text.CountVectorizer().build_analyzer()
def my_analyzer(txt):
    return [stemmer.stem(w) for w in analyzer(txt)]
cv = text.CountVectorizer(min_df=1e-06, max_df=0.05, analyzer=my_analyzer)
cv.fit(X[:100])
sample_vec = cv.transform([sample_doc])
{k:v for k,v in zip(cv.get_feature_names(), sample_vec.toarray()[0]) if v>0}

{'abingdon': 1,
 'advantag': 1,
 'alpha': 26,
 'barney': 1,
 'box': 1,
 'cdrom': 1,
 'critic': 1,
 'employmentchang': 1,
 'expedit': 1,
 'familiar': 1,
 'feasibl': 1,
 'growth': 1,
 'highli': 1,
 'import': 1,
 'internet': 2,
 'intranet': 1,
 'latest': 1,
 'lawsuit': 1,
 'link': 1,
 'messeng': 1,
 'misus': 1,
 'necessarili': 2,
 'oppos': 1,
 'performancebas': 1,
 'po': 1,
 'procur': 1,
 'ratio': 1,
 'safeguard': 1,
 'significantli': 1,
 'site': 2,
 'sixtieth': 1,
 'smith': 1,
 'strateg': 1,
 'unearn': 1,
 'va': 1,
 'vari': 1,
 'virginia': 1,
 'web': 1,
 'wide': 1,
 'wwwbenefitaccesscom': 1}

## Term frequency inverse document frequency (TFIDF)

In [24]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf = text.TfidfVectorizer(min_df=1e-06, max_df=0.05)
tfidf.fit(X[:100])
sample_vec = tfidf.transform([sample_doc])
{k:v for k,v in zip(cv.get_feature_names(), sample_vec.toarray()[0]) if v>0}

{'abiom': 0.03698348952694716,
 'actiga': 0.03698348952694716,
 'aemploye': 0.03698348952694716,
 'albanna': 0.031775212329687975,
 'allot': 0.028728565475930265,
 'attorneysinfact': 0.028728565475930265,
 'australia': 0.8136367695928376,
 'autom': 0.14793395810778864,
 'bryan': 0.028728565475930265,
 'cheryl': 0.03393684267318945,
 'constat': 0.03698348952694716,
 'depositari': 0.03698348952694716,
 'eighth': 0.028728565475930265,
 'embarrass': 0.03698348952694716,
 'exemplari': 0.03393684267318945,
 'exofficio': 0.03393684267318945,
 'forefeit': 0.03698348952694716,
 'lift': 0.03698348952694716,
 'miner': 0.03393684267318945,
 'motiv': 0.03393684267318945,
 'nonacceler': 0.030098521573929036,
 'nondisparag': 0.03698348952694716,
 'plaza': 0.031775212329687975,
 'procur': 0.030098521573929036,
 'reinsur': 0.03698348952694716,
 'rendit': 0.03698348952694716,
 'repeatedli': 0.028728565475930265,
 'shalhoub': 0.06019704314785807,
 'shortterm': 0.03698348952694716,
 'sureti': 0.0339368426

## Word2Vec - Pretrained word vectors

In [1]:
from gensim.models import Word2Vec
w2v = Word2Vec.load("../data/w2v.pickle")

In [4]:
w2v.most_similar("customer")

  """Entry point for launching an IPython kernel.


[('supplier', 0.8100154399871826),
 ('identities', 0.7488305568695068),
 ('vendor', 0.7211676836013794),
 ('customers', 0.719849705696106),
 ('client', 0.7191833853721619),
 ('wholesaler', 0.6979198455810547),
 ('retailer', 0.6949663162231445),
 ('referrer', 0.6916903853416443),
 ('subscriber', 0.6891459226608276),
 ('advertisers', 0.6872985363006592)]

In [12]:
w2v.wv["customer"]

array([-1.7971811 ,  1.7461873 , -1.7333413 , -0.9501938 ,  1.4369504 ,
       -4.573448  , -1.040323  ,  1.8452846 , -6.584528  ,  5.045666  ,
        2.3666062 , -2.4841917 , -1.2176125 , -5.4686084 , -1.5029901 ,
       -3.9456666 , -1.1921363 , -2.3597367 , -1.5567018 , -0.01446523,
        0.2594563 , -2.0609753 , -6.4029737 ,  1.6582427 , -3.0667682 ,
       -0.5999931 ,  0.2741809 ,  2.3116817 ,  1.7087582 , -5.688676  ,
       -3.125402  ,  2.4790525 , -1.5706098 ,  2.2393646 ,  1.1512741 ,
       -0.86816007,  1.7273517 ,  0.5174984 ,  3.1057346 ,  0.765412  ,
        4.303741  , -1.8025136 ,  3.2532973 ,  0.74829334,  0.51785934,
        0.21034734,  0.9509934 , -7.1567492 , -3.2851288 , -3.233999  ],
      dtype=float32)

## Averaging word vectors across the document

In [27]:
v = None
for w in nltk.word_tokenize(sample_doc):
    try:
        if v is not None:
            v+=w2v.wv[w]
            n+=1
        else:
            v=np.copy(w2v.wv[w])
            n=1
    except KeyError:
        continue
v/=n
v

array([-0.7507401 , -0.420542  , -0.10012057, -0.37326655,  0.5822965 ,
        0.77466947,  0.6441445 ,  0.3625388 ,  0.43520534, -0.54652923,
       -0.28740865, -1.3981526 , -0.412218  ,  0.32011726,  0.09712528,
        0.22358079, -0.75756776, -0.70142686, -0.21124128,  0.10119429,
       -0.8234234 ,  0.4335564 ,  0.67138153,  0.5240189 , -0.92881304,
        0.528634  ,  1.1133226 , -0.54446703, -0.12792982,  0.18315491,
       -0.5741413 ,  0.3953018 , -0.3179198 , -0.5529575 , -1.0513256 ,
        1.1475902 , -0.19089158,  0.1173111 ,  0.53352356, -1.6372716 ,
       -0.02028674, -0.2380265 ,  0.20073438,  0.55285066,  0.9066735 ,
       -0.6627023 , -0.7060955 , -0.14574422, -0.63928986, -0.58009225],
      dtype=float32)

Note that the word vectors averaging could be combined with TF/IDF