# Generating Word Vectors for Each Utterance

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import torch
import regex as re
from datetime import datetime as dt
from mod.LM.RoBERTa import RoBERTa

### 1. Importing data

In [None]:
# Wall-clock start; the final summary line of the notebook reports the
# elapsed time relative to this value.
start = dt.now()

# Input and output locations, both given relative to PATH.
PATH = 'data/'
dataset = 'scrape_data/reddit-data.tsv'
output_name = 'vecs/reddit-vecs.tsv'

# Read the tab-separated scrape, using '\n' as the record terminator.
df = pd.read_table(PATH + dataset, lineterminator='\n')
print('pre-dropping of irrelevant data', len(df))

# Keep one row per (author, body) pair, discard rows whose body is missing,
# and renumber the remaining rows 0..n-1.
df = (
    df.drop_duplicates(subset=['author', 'body'])
      .dropna(subset=['body'])
      .reset_index(drop=True)
)

# Show the available columns and the number of rows that survived cleaning.
print(df.columns.tolist())
print(PATH, len(df))

### 2. Setting up model components

In [None]:
# Value forwarded as `layers`; presumably selects which hidden layers the
# wrapper reads embeddings from -- confirm against mod.LM.RoBERTa.
level = [8, -1]
mod = RoBERTa(
    layers=level,
    special_tokens=False,
    device='cuda',
)
# NOTE(review): presumably switches the wrapped model to inference mode.
mod.eval()

### 3. Vectorizing data

We're going to stream the results to disk as they are produced. To start, we create the output file and write its header row.

In [None]:
# Per-comment metadata columns that are repeated on every token row.
meta_data_cols = [
    'subreddit', 'subId', 'sub_time', 'commentId', 'likes', 'time', 'author',
]

# An empty frame that carries only the output schema. Writing it creates
# (or overwrites) the output file with just the header row.
data = pd.DataFrame(columns=['_id', *meta_data_cols, 'token', 'vec'])
data.to_csv(PATH + output_name, sep='\t', encoding='utf-8', index=False)

Then we use the model to generate an embedding for each non-punctuation token of every comment, and append the resulting rows to that output file.

In [None]:
# Punctuation tokens that are left out of the output. Stored as a column
# vector so that comparing it with a 1-D token array broadcasts to a
# (n_punct, n_tokens) boolean matrix.
punct = np.array(['.', '!', '?', ',', '...', ':', ';', '. . .']).reshape(-1, 1)

# Loop invariants. Selecting the metadata columns copies them, so do it once
# here rather than once per row.
meta_frame = df[meta_data_cols]
out_cols = list(data)

# Failures of these kinds are treated as "row cannot be vectorized": the row
# is skipped and the run continues. Anything else still propagates.
skip_errors = (ValueError, IndexError, RuntimeError, AttributeError)

ct = 0        # rows that produced at least one token vector
skipped = []  # (row index, exception class name) for every skipped row;
              # inspect after the run to see what was dropped and why

with torch.no_grad():
    for k, text in df['body'].items():
        meta_data = meta_frame.loc[k].values.tolist()

        try:
            w, tokens = mod._tokenize(str(text).replace('\n', ' '))

            # True for every token that is not one of the punctuation marks.
            sel = ~(tokens == punct).any(axis=0)
            kept = tokens[sel]
            vecs = mod.E(torch.LongTensor(w).view(-1)[sel])

            # One output row per kept token: row id, metadata, token, and the
            # flattened vector serialised as a string.
            update = [
                [k] + meta_data + [kept[i], str(vec.cpu().view(-1).tolist())]
                for i, vec in enumerate(vecs)
            ]
            update = pd.DataFrame(
                np.array(update, dtype='object').reshape(-1, len(out_cols)),
                columns=out_cols,
            )
            # Append without a header; the header row is already in the file.
            update.to_csv(PATH + output_name, index=False, encoding='utf-8',
                          header=False, mode='a', sep='\t')
        except skip_errors as err:
            skipped.append((k, type(err).__name__))
            continue

        # Rows that yield no vectors write nothing and are not counted.
        if len(vecs) > 0:
            ct += 1

print('total passable items ({}/{}) in t(s)={}'.format(ct, len(df), dt.now()-start))