In [1]:
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

In [2]:
import numpy as np
import pandas as pd

In [20]:
from sklearn.model_selection import train_test_split
import pickle

In [14]:
# Root folder for the input spreadsheet and the pickled outputs. Later cells build
# file paths with plain string concatenation, so the trailing slash is required.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
base_dir = '/Users/vaibhav/MiscProjects/question-classification/'

In [15]:
# Read the question spreadsheet into a DataFrame.
# Later cells read its `question`, `category` and `original` columns.
data_file = base_dir + 'questions_top5cat.xlsx'
data_df = pd.read_excel(data_file)

In [16]:
# Tag every row with its 0-based position; the embedding table built later is
# joined back onto the train / test splits through this key.
data_df['row_id'] = range(len(data_df))

In [17]:
# One-hot encode `category` into binary indicator columns named f_<category>
# and attach them to the right-hand side of the frame.
category_flags = pd.get_dummies(data_df['category'], prefix='f')
data_df = pd.concat([data_df, category_flags], axis=1)

In [18]:
# Working handle used by the rest of the notebook. This is an alias, not a copy:
# both names refer to the same DataFrame object.
data_df2 = data_df

In [51]:
# Display (rows, columns) of the working frame.
data_df2.shape

(526, 9)

In [21]:
# Per-category hold-out split: within every category, 25 of the rows flagged
# original == 1 become test examples and the remaining ones become training examples.
all_cats = data_df2['category'].unique()
train_parts, test_parts = [], []
for cat in all_cats:
    data_df2_cat = data_df2[(data_df2['original'] == 1) & (data_df2['category'] == cat)]
    # An integer test_size is an absolute number of rows, not a fraction; the fixed
    # random_state makes the split reproducible from run to run.
    data_df2_cat_tr, data_df2_cat_ts = train_test_split(data_df2_cat,
                                                        test_size=25, random_state=42)
    train_parts.append(data_df2_cat_tr)
    test_parts.append(data_df2_cat_ts)

# Concatenate once at the end instead of re-copying the growing frames on every
# iteration (and without a special case for the first category).
data_df2_tr = pd.concat(train_parts, axis=0)
data_df2_ts = pd.concat(test_parts, axis=0)

In [53]:
# Rows flagged original == 0 never enter the test set; add all of them to the
# training split.
# NOTE(review): re-running this cell appends the same rows again.
non_original_rows = data_df2[data_df2['original'] == 0]
data_df2_tr = pd.concat([data_df2_tr, non_original_rows], axis=0)

In [3]:
# Load the GPT-2 tokenizer from a local directory.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
tokenizer = GPT2Tokenizer.from_pretrained('/Users/vaibhav/MiscProjects/gpt-2/')

In [4]:
# Encode a single sample question to try out the tokenizer and the model.
text_1 = "Who was Jim Henson ?"
indexed_tokens_1 = tokenizer.encode(text_1)  # list of integer token ids
tokens_tensor_1 = torch.tensor([indexed_tokens_1])  # outer list adds a leading batch dimension of size 1

In [5]:
# Display the token ids produced for the sample question.
indexed_tokens_1

[8241, 373, 5395, 367, 19069, 5633]

In [6]:
# Load the GPT-2 model from the same local directory as the tokenizer.
model = GPT2Model.from_pretrained('/Users/vaibhav/MiscProjects/gpt-2/')
# Switch the module to evaluation mode (disables training-only behaviour such as
# dropout, where present). As the last expression of the cell, this also displays
# the module structure.
model.eval()

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (h): ModuleList(
    (0): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
      )
    )
    (1): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
      )
    )
    (2): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
      )
    )
    (3): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
     

In [7]:
# Run the sample question through the model without tracking gradients.
# The call returns a pair: the hidden states and `past`; only the hidden states
# are used in the cells shown here.
# NOTE(review): the result is a single tensor rather than one entry per layer -
# presumably the final layer's states; confirm against the library docs.
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)

In [22]:
# Dimensions of the hidden-state array for the sample question.
hidden_states_1.numpy().shape

(1, 6, 768)

In [27]:
# Averaging over axis 1 collapses that axis, leaving a single vector per input.
np.mean(hidden_states_1.numpy(), axis=1).shape

(1, 768)

In [10]:
# Inspect how the tokenizer splits a longer question into sub-word pieces.
tokenizer.tokenize('Will hardware providers capture a bigger share of the $55bn TAM that you claim to have access to?')

['Will',
 'Ġhardware',
 'Ġproviders',
 'Ġcapture',
 'Ġa',
 'Ġbigger',
 'Ġshare',
 'Ġof',
 'Ġthe',
 'Ġ$',
 '55',
 'bn',
 'ĠTAM',
 'Ġthat',
 'Ġyou',
 'Ġclaim',
 'Ġto',
 'Ġhave',
 'Ġaccess',
 'Ġto',
 '?']

In [48]:
def replace_oov(text):
    """Expand the contraction "don’t" (typographic apostrophe) to "do not".

    Only the exact lowercase spelling with the curly apostrophe is rewritten;
    every other part of ``text`` is returned unchanged.

    Args:
        text: The input string.

    Returns:
        A new string with every occurrence of the contraction expanded.
    """
    contraction = "don’t"
    expansion = "do not"
    return text.replace(contraction, expansion)

In [66]:
# Mean-pooled GPT-2 sentence embeddings: one 768-dimensional vector per question,
# obtained by averaging the model's hidden states over the token axis.
gpt2_sent_emb_mean = np.zeros((data_df2.shape[0], 768))
# Rows are addressed by position rather than by index label, so the array stays
# aligned with the positional `row_id` key even if data_df2 does not carry a
# default 0..n-1 index.
for pos, row in enumerate(data_df2.itertuples()):
    indexed_tokens = tokenizer.encode(replace_oov(row.question))
    tokens_tensor = torch.tensor([indexed_tokens])  # leading batch dimension of size 1
    with torch.no_grad():
        hidden_states, past = model(tokens_tensor)
    # Average over axis 1 (tokens), then drop the size-1 batch axis explicitly.
    gpt2_sent_emb_mean[pos] = np.mean(hidden_states.numpy(), axis=1)[0]
    # Lightweight progress indicator.
    if pos % 100 == 0:
        print(pos)

0
100
200
300
400
500


In [65]:
# Shape of the pooled vector computed from the most recent `hidden_states`.
np.mean(hidden_states.numpy(), axis=1).shape

(1, 768)

In [67]:
# Wrap the embedding matrix in a DataFrame and key it by 0-based row position so
# it can be joined back onto the train / test splits through `row_id`.
gpt2_sent_emb_mean_df = pd.DataFrame(data=gpt2_sent_emb_mean)
gpt2_sent_emb_mean_df['row_id'] = gpt2_sent_emb_mean_df.reset_index().index

# Left joins keep every row of the split and attach its embedding columns.
data_df2_tr6 = pd.merge(data_df2_tr, gpt2_sent_emb_mean_df, 
                        left_on='row_id', right_on='row_id',
                        how = 'left',
                        suffixes=('_l', '_r'))

data_df2_ts6 = pd.merge(data_df2_ts, gpt2_sent_emb_mean_df, 
                        left_on='row_id', right_on='row_id',
                        how = 'left',
                        suffixes=('_l', '_r'))

# Persist both splits. The context managers guarantee the files are flushed and
# closed even if pickling fails part-way through.
with open(base_dir + 'data_df2_tr6.pkl', 'wb') as tr_file:
    pickle.dump(data_df2_tr6, tr_file)
with open(base_dir + 'data_df2_ts6.pkl', 'wb') as ts_file:
    pickle.dump(data_df2_ts6, ts_file)