In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg')

In [25]:
from gensim.models import KeyedVectors

In [26]:
glove_embeddings = KeyedVectors.load_word2vec_format('/Users/vaibhav/MiscProjects/word-vectors/glove.6B/glove.6B.50d.w2v.txt')

In [3]:
import pandas as pd

In [64]:
import numpy as np

In [80]:
from sklearn.model_selection import train_test_split

In [91]:
import pickle

In [4]:
base_dir = '/Users/vaibhav/MiscProjects/question-classification/'

In [5]:
data_file = base_dir + 'questions_top5cat.xlsx'

In [36]:
data_df = pd.read_excel(data_file)

In [37]:
data_df.head()

Unnamed: 0,question,category,original
0,Describe the geographical exposure for your do...,Financials,1
1,What does the business look like in 5 years,Strategy,1
2,How comfortable are you with your leverage? Ba...,Financials,1
3,is the operating margin improvement all coming...,Financials,1
4,Can you elaborate on the 3-400bps of margin im...,Financials,1


In [38]:
# add row id
data_df['row_id'] = data_df.reset_index().index

In [39]:
# create binary flags
data_df = pd.concat([data_df, pd.get_dummies(data_df['category'], prefix='f')], axis=1)

In [40]:
data_df.head()

Unnamed: 0,question,category,original,row_id,f_Competition,f_Customer,f_Financials,f_Product,f_Strategy
0,Describe the geographical exposure for your do...,Financials,1,0,0,0,1,0,0
1,What does the business look like in 5 years,Strategy,1,1,0,0,0,0,1
2,How comfortable are you with your leverage? Ba...,Financials,1,2,0,0,1,0,0
3,is the operating margin improvement all coming...,Financials,1,3,0,0,1,0,0
4,Can you elaborate on the 3-400bps of margin im...,Financials,1,4,0,0,1,0,0


In [41]:
# create clean tokens
# remove named entities and numbers etc.

docs = list(nlp.pipe(data_df['question']))

In [42]:
clean_docs = []
for doc in docs:
    clean_doc = []
    for token in doc:
        if (token.is_alpha and
            token.ent_type_ == ''):
            clean_doc.append(token.text)
    clean_docs.append(clean_doc)

In [43]:
clean_docs

[['Describe',
  'the',
  'geographical',
  'exposure',
  'for',
  'your',
  'doors',
  'business'],
 ['What', 'does', 'the', 'business', 'look', 'like', 'in'],
 ['How',
  'comfortable',
  'are',
  'you',
  'with',
  'your',
  'leverage',
  'Based',
  'on',
  'quick',
  'math',
  'you',
  'will',
  'be',
  'at',
  'or',
  'so',
  'in'],
 ['is',
  'the',
  'operating',
  'margin',
  'improvement',
  'all',
  'coming',
  'from',
  'the',
  'new',
  'acquisitions'],
 ['Can',
  'you',
  'elaborate',
  'on',
  'of',
  'margin',
  'improvement',
  'in',
  'the',
  'new',
  'businesses',
  'and',
  'where',
  'it',
  'will',
  'come',
  'from'],
 ['What',
  'advantages',
  'do',
  'you',
  'have',
  'that',
  'allows',
  'you',
  'to',
  'expand',
  'operating',
  'margins',
  'in',
  'a',
  'way',
  'that',
  'legacy',
  'owners',
  'can',
  'not'],
 ['How', 'much', 'do', 'you', 'spend', 'on'],
 ['Is',
  'the',
  'defense',
  'electronics',
  'a',
  'segment',
  'you',
  'expect',
  'to',
  '

In [55]:
max_len = 0
for doc in clean_docs:
    max_len = max(max_len, len(doc))
print(max_len)

28


In [46]:
# find unique tokens
unique_tokens = set([token.lower() for doc in clean_docs for token in doc])

In [50]:
# check which unique tokens exist in glove embedding
found_tokens = []
count_not_found = 0
not_found_tokens = []
for token in unique_tokens:
    try:
        embedding_vector = glove_embeddings[token]
        found_tokens.append(token)
    except:
        count_not_found += 1
        not_found_tokens.append(token)
        
print(count_not_found)
print(not_found_tokens)

10
['iheartmedia', 'lnstagram', 'hipchat', 'zendesk', 'ringcentrai', 'defensibility', 'upwork', 'xpresswest', 'twilio', 'docusign']


In [52]:
found_tokens_dict = dict(zip(found_tokens, range(1, len(found_tokens)+1)))

In [56]:
def prepend_zeros(short_rec, min_len):
    l = len(short_rec)
    if l < min_len:
        zeros = ['0']*(min_len - l)
        zeros.extend(short_rec)
        return zeros
    else:
        return short_rec

In [58]:
# create dataframe of token ids
# write ids to file

with open(base_dir + 'token_ids.csv', 'wb') as ofb:
    for doc in clean_docs:
        rec = []
        for i, token in enumerate(doc):
            token = token.lower()
            try:
                rec.append(str(found_tokens_dict[token]))
            except:
                pass
        rec = prepend_zeros(rec, max_len)
        rec_str = ','.join(rec) + '\n'
        ofb.write(rec_str.encode('utf-8'))

In [59]:
# load token ids as a dataframe and join with data_df
token_ids = pd.read_csv(base_dir + 'token_ids.csv', header=None)

In [61]:
data_df2 = pd.concat([data_df, token_ids], axis=1)

In [63]:
data_df2.head(3)

Unnamed: 0,question,category,original,row_id,f_Competition,f_Customer,f_Financials,f_Product,f_Strategy,0,...,18,19,20,21,22,23,24,25,26,27
0,Describe the geographical exposure for your do...,Financials,1,0,0,0,1,0,0,0,...,0,0,325,51,197,863,1005,263,476,592
1,What does the business look like in 5 years,Strategy,1,1,0,0,0,0,1,0,...,0,0,0,324,701,51,592,334,557,857
2,How comfortable are you with your leverage? Ba...,Financials,1,2,0,0,1,0,0,0,...,940,845,685,852,910,193,423,561,421,857


In [66]:
# create embedding matrix for found tokens
EMBEDDING_DIM = 50
embedding_matrix = np.zeros((len(found_tokens) + 1, EMBEDDING_DIM))

for i, token in enumerate(found_tokens):
    embedding_vector = glove_embeddings[token]
    embedding_matrix[i + 1] = embedding_vector

In [None]:
# create train, test sets - use only originals in test set
# 30 records for each category in test set

In [None]:
# get glove embeddings
# try different dims
# build keras model - bilstm, bigru, w and w/o attention
# train on elmo embeddings
# train on bert embeddings
# for all runs, do cv
# add precision, recall to keras metrics

In [83]:
all_cats = data_df2['category'].unique()
for i, cat in enumerate(all_cats):
    data_df2_cat = data_df2[(data_df2['original'] == 1) & (data_df2['category'] == cat)]
    data_df2_cat_tr, data_df2_cat_ts, _, _ = train_test_split(data_df2_cat, data_df2_cat, 
                                                              test_size=25, random_state=42)
    if i == 0:
        data_df2_tr = data_df2_cat_tr
        data_df2_ts = data_df2_cat_ts
    else:
        data_df2_tr = pd.concat([data_df2_tr, data_df2_cat_tr], axis=0)
        data_df2_ts = pd.concat([data_df2_ts, data_df2_cat_ts], axis=0)

In [87]:
np.unique(data_df2_ts['category'], return_counts=True)

(array(['Competition', 'Customer', 'Financials', 'Product', 'Strategy'],
       dtype=object), array([25, 25, 25, 25, 25]))

In [89]:
# append other examples to train set
data_df2_tr = pd.concat([data_df2_tr, data_df2[data_df2['original'] == 0]], axis=0)

In [90]:
np.unique(data_df2_tr['category'], return_counts=True)

(array(['Competition', 'Customer', 'Financials', 'Product', 'Strategy'],
       dtype=object), array([80, 80, 80, 81, 80]))

In [92]:
# save train, test df to disc
with open(base_dir + 'data_df2_tr.pkl', 'wb') as ofb1, open(base_dir + 'data_df2_ts.pkl', 'wb') as ofb2:
    pickle.dump(data_df2_tr, ofb1)
    pickle.dump(data_df2_ts, ofb2)

In [93]:
# save embedding matrix
pickle.dump(embedding_matrix, open(base_dir + 'embedding_matrix.pkl', 'wb'))

In [95]:
# get elmo embeddings
max_len2 = 0
for doc in docs:
    max_len2 = max(max_len2, len(doc))
print(max_len2)

38


In [99]:
ntokens = np.sum([len(doc) for doc in docs])

In [97]:
from allennlp.commands.elmo import ElmoEmbedder

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [100]:
elmo = ElmoEmbedder(
        options_file='/Users/vaibhav/Downloads/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json', 
        weight_file='/Users/vaibhav/Downloads/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

In [103]:
i = 0
elmo_embeddings_matrix = np.zeros((ntokens + 1, 1024))
for doc in docs:
    tokens = [token.text for token in doc]
    #tokens = ["I", "ate", "an", "apple", "for", "breakfast"]
    vectors = elmo.embed_sentence(tokens)
    for j in range(0, len(tokens)):
        i += 1
        elmo_embeddings_matrix[i] = vectors[2][j]

In [107]:
# save embedding matrix
pickle.dump(elmo_embeddings_matrix, open(base_dir + 'elmo_embeddings_matrix.pkl', 'wb'))

In [108]:
# token ids for elmo
# write ids to file

i = 0
with open(base_dir + 'elmo_token_ids.csv', 'wb') as ofb:
    for doc in docs:
        tokens = [token.text for token in doc]
        rec = []
        for j, token in enumerate(tokens):
            i += 1
            rec.append(str(i))
        rec = prepend_zeros(rec, max_len2)
        rec_str = ','.join(rec) + '\n'
        ofb.write(rec_str.encode('utf-8'))
print(i)

6025


In [109]:
# load token ids as a dataframe and join with data_df
elmo_token_ids = pd.read_csv(base_dir + 'elmo_token_ids.csv', header=None)

In [110]:
elmo_token_ids['row_id'] = elmo_token_ids.reset_index().index

In [111]:
elmo_token_ids.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,row_id
0,0,0,0,0,0,0,0,0,0,0,...,0,1,2,3,4,5,6,7,8,0
1,0,0,0,0,0,0,0,0,0,0,...,9,10,11,12,13,14,15,16,17,1
2,0,0,0,0,0,0,0,0,0,0,...,34,35,36,37,38,39,40,41,42,2


In [112]:
data_df2_tr2 = pd.merge(data_df2_tr, elmo_token_ids, 
                        left_on='row_id', right_on='row_id',
                        how = 'left',
                        suffixes=('_l', '_r'))

In [114]:
data_df2_tr2.shape

(401, 75)

In [115]:
data_df2_ts2 = pd.merge(data_df2_ts, elmo_token_ids, 
                        left_on='row_id', right_on='row_id',
                        how = 'left',
                        suffixes=('_l', '_r'))
print(data_df2_ts2.shape)

(125, 75)


In [116]:
pickle.dump(data_df2_tr2, open(base_dir + 'data_df2_tr2.pkl', 'wb'))
pickle.dump(data_df2_ts2, open(base_dir + 'data_df2_ts2.pkl', 'wb'))

In [131]:
# p-means for elmo embeddings
elmo_sent_emb_mean = np.zeros((len(docs), 1024))
for i, doc in enumerate(docs):
    tokens = [token.text for token in doc]
    #tokens = ["I", "ate", "an", "apple", "for", "breakfast"]
    vectors = elmo.embed_sentence(tokens)
    elmo_sent_emb_mean[i] = np.mean(vectors[2, :, :], axis=0)
    if i%100 == 0:
        print(i)

0
100
200
300
400
500


In [128]:
np.median(vectors[2, :, :], axis=0).shape

(1024,)

In [132]:
elmo_sent_emb_mean_df = pd.DataFrame(data=elmo_sent_emb_mean)

In [133]:
elmo_sent_emb_mean_df['row_id'] = elmo_sent_emb_mean_df.reset_index().index

In [134]:
data_df2_tr3 = pd.merge(data_df2_tr, elmo_sent_emb_mean_df, 
                        left_on='row_id', right_on='row_id',
                        how = 'left',
                        suffixes=('_l', '_r'))

data_df2_ts3 = pd.merge(data_df2_ts, elmo_sent_emb_mean_df, 
                        left_on='row_id', right_on='row_id',
                        how = 'left',
                        suffixes=('_l', '_r'))

In [135]:
pickle.dump(data_df2_tr3, open(base_dir + 'data_df2_tr3.pkl', 'wb'))
pickle.dump(data_df2_ts3, open(base_dir + 'data_df2_ts3.pkl', 'wb'))