# GloVe Baseline

In [21]:
import sys
import re
sys.path.append('.')
sys.path.append('..')

from subreddit_frequency import load_dataframe_from_jsonl
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
from datetime import datetime
sns.set('paper')

from ipywidgets import interact
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt

from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torch_data
import torch.optim as optim

import numpy as np
from pprint import pprint

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Read in data

In [2]:
# The pickled train/test splits live in the sibling "aita" directory of the
# current working directory's parent.
train_path = Path.cwd().parent.joinpath("aita", "aita-train.pkl")
test_path = Path.cwd().parent.joinpath("aita", "aita-test.pkl")

In [3]:
# Load both splits as DataFrames.
# NOTE(review): unpickling executes arbitrary code -- only load pickle files
# that come from a trusted source.
train_dataset_df, test_dataset_df = (
    pd.read_pickle(split_path) for split_path in (train_path, test_path)
)

### Add bag-of-words count features (no embeddings)

In [96]:
# Count how often each token occurs across the training posts.
# A token is a run of word characters/apostrophes, or one of . , ! ? ;
# NOTE: iterate the Series directly -- `Series.iteritems()` was removed in
# pandas 2.0, and only the text (not the index) is needed here.
word_counts = defaultdict(int)
for post in train_dataset_df.selftext:
    text = post.strip().lower()
    words = re.findall(r"[\w']+|[.,!?;]", text)
    for word in words:
        word_counts[word] += 1
len(word_counts)

10223

In [97]:
# Build the vocabulary in both directions. Id 0 is reserved for unknown
# words; every word counted at least 7 times gets the next free id, in the
# iteration order of `word_counts`.
word_to_token = {'<UNK>': 0}
token_to_word = {0: '<UNK>'}
i = 1
for word, count in word_counts.items():
    if count >= 7:
        word_to_token[word] = i
        token_to_word[i] = word
        i += 1
len(word_to_token)

2349

In [166]:
# Tokenize a post into a bag-of-words count vector
def tokenize_post(post, vocab=None):
    """Return a vector counting how often each vocabulary token occurs in *post*.

    The post is stripped, lower-cased and split into runs of word
    characters/apostrophes or single ``. , ! ? ;`` characters. Tokens that
    are not in the vocabulary are counted under id 0.

    Args:
        post: Raw post text.
        vocab: Mapping from token string to integer id. Defaults to the
            module-level ``word_to_token``.

    Returns:
        A 1-D tensor of the default float dtype with one count per
        vocabulary id (assumes ids are ``0 .. len(vocab) - 1``). A post
        with no tokens yields an all-zero vector.
    """
    if vocab is None:
        vocab = word_to_token
    text = post.strip().lower()
    words = re.findall(r"[\w']+|[.,!?;]", text)
    token_ids = torch.tensor(
        [vocab.get(word, 0) for word in words], dtype=torch.long
    )
    # bincount gives the same result as summing one-hot rows, without
    # materialising a len(vocab) x len(vocab) identity matrix on every call,
    # and it also handles an empty id list.
    counts = torch.bincount(token_ids, minlength=len(vocab))
    return counts.to(torch.get_default_dtype())

In [208]:
# Add the bag-of-words count vector of every post to both the train and
# test splits.
for split_df in (train_dataset_df, test_dataset_df):
    split_df['tokenized_selftext'] = split_df.selftext.apply(tokenize_post)

### Add GloVe embeddings

In [4]:
# Load the pre-trained GloVe vectors into a token -> float32 vector lookup.
# Each line of the file is a token followed by its whitespace-separated
# vector components (presumably 300 of them, going by the file name).
embeddings_dict = {}
with open("glove.6B.300d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        embeddings_dict[values[0]] = np.asarray(values[1:], "float32")

In [5]:
def selftext_to_glove(text, embedding_size=300, embeddings=None):
    """Return the mean embedding of the whitespace-separated words in *text*.

    Words are lower-cased before lookup; words without an embedding are
    skipped.

    Args:
        text: Raw post text.
        embedding_size: Length of the zero vector returned when no word of
            *text* has an embedding.
        embeddings: Mapping from token to vector. Defaults to the
            module-level ``embeddings_dict``.

    Returns:
        A 1-D numpy array: the element-wise mean of the found vectors, or a
        float32 zero vector of length *embedding_size* if none were found.
    """
    if embeddings is None:
        embeddings = embeddings_dict
    # Lower-casing the whole text first is equivalent to lower-casing each
    # word; split() already removes all surrounding whitespace.
    found = [
        embeddings[word] for word in text.lower().split() if word in embeddings
    ]
    if found:
        return np.stack(found).mean(axis=0)
    # Use float32 so the fallback has the same dtype as the loaded vectors.
    return np.zeros(embedding_size, dtype="float32")

In [6]:
# Attach the averaged GloVe vector of every post to both splits.
for split_df in (train_dataset_df, test_dataset_df):
    split_df['selftext_glove_300'] = split_df.selftext.apply(selftext_to_glove)

## Simple Feed Forward on Averaged GloVe Embeddings

In [7]:
class Feedforward(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of input features.
            hidden_size: Width of the hidden layer.
            output_size: Number of output values per example.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the un-normalised output of the second linear layer for *x*."""
        return self.fc2(self.relu(self.fc1(x)))

In [20]:
def build_features_and_ys(df, features, label_index=None):
    """Turn a DataFrame into a feature matrix and integer class labels.

    Args:
        df: DataFrame with a ``label`` column and the columns named in
            *features*; each feature cell holds an array-like.
        features: Column names whose cells are concatenated, in order, into
            one feature vector per row.
        label_index: Ordered sequence of label values; a label's position is
            its class id. Defaults to the sorted unique labels of the
            module-level ``train_dataset_df`` so that every split shares the
            same label -> id mapping.

    Returns:
        ``(xs, ys)``: a float32 tensor with one row per DataFrame row and an
        int64 tensor of class ids.

    Raises:
        ValueError: If a label in *df* is not present in *label_index*.
    """
    if label_index is None:
        label_index = sorted(train_dataset_df.label.unique())

    rows = [
        np.hstack([np.asarray(cell) for cell in row])
        for row in df[features].itertuples(index=False, name=None)
    ]
    if rows:
        # Stack into a single ndarray first: building a tensor directly from
        # a list of ndarrays is very slow (PyTorch warns about it).
        xs = torch.as_tensor(np.stack(rows), dtype=torch.float32)
    else:
        xs = torch.empty(0, dtype=torch.float32)

    # Constant-time label lookup; setdefault keeps the first position of a
    # repeated label, matching list.index semantics.
    label_to_id = {}
    for position, label in enumerate(label_index):
        label_to_id.setdefault(label, position)
    try:
        ids = [label_to_id[label] for label in df.label.to_list()]
    except KeyError as err:
        raise ValueError(
            f"label {err.args[0]!r} is not in the label index"
        ) from err
    return xs, torch.tensor(ids, dtype=torch.long)

def train_model(model, xs, ys, epochs=10, batch_size=10):
    """Train *model* on ``(xs, ys)`` with Adam and cross-entropy loss.

    The model and each batch are moved to the module-level ``device``.
    After every epoch the sum of the per-batch losses is printed.

    Args:
        model: Module mapping a batch of features to per-class scores.
        xs: Feature tensor, one row per example.
        ys: Tensor of integer class ids aligned with *xs*.
        epochs: Number of passes over the data.
        batch_size: Number of examples per optimisation step.

    Returns:
        The trained model (on ``device``).
    """
    model = model.to(device)
    # Make sure layers with train/eval-specific behaviour are in train mode.
    model.train()
    opt = optim.Adam(model.parameters(), lr=0.001)
    loader = torch_data.DataLoader(
        torch_data.TensorDataset(xs, ys),
        batch_size=batch_size,
        shuffle=True,
    )
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        epoch_loss = 0.0
        for context, label in loader:
            context = context.to(device)
            label = label.to(device)
            opt.zero_grad()
            # Get predictions
            outputs = model(context)
            # Calculate loss
            loss = loss_fn(outputs, label)
            loss.backward()
            opt.step()
            # Accumulate a plain float: summing the loss tensors themselves
            # keeps every batch's autograd graph referenced until the epoch
            # ends.
            epoch_loss += loss.item()
        print(f"EPOCH {epoch} LOSS = {epoch_loss}")
    return model

In [15]:
# Train the baseline classifier: 300 input features (the averaged GloVe
# vector), 512 hidden units, one output per distinct training label.
model = Feedforward(
    input_size=300,
    hidden_size=512,
    output_size=len(train_dataset_df.label.unique()),
)
xs, ys = build_features_and_ys(train_dataset_df, ['selftext_glove_300'])
trained_model = train_model(model, xs, ys)

EPOCH 0 LOSS = 2136.147216796875
EPOCH 1 LOSS = 2111.3388671875
EPOCH 2 LOSS = 2098.670166015625
EPOCH 3 LOSS = 2090.91650390625
EPOCH 4 LOSS = 2082.97412109375
EPOCH 5 LOSS = 2080.221923828125
EPOCH 6 LOSS = 2075.304443359375
EPOCH 7 LOSS = 2069.415771484375
EPOCH 8 LOSS = 2068.8681640625
EPOCH 9 LOSS = 2063.454833984375


## Validate

In [28]:
def get_model_accuracy(model, xs, ys):
    """Print the fraction of rows in *xs* whose predicted class equals *ys*.

    The model and *xs* are moved to the module-level ``device``; the
    predicted class is the arg-max over axis 1 of the model output.

    Args:
        model: Trained module mapping features to per-class scores.
        xs: Feature tensor, one row per example.
        ys: CPU tensor of integer class ids aligned with *xs*.
    """
    model = model.to(device)
    xs = xs.to(device)
    # Evaluate in eval mode and without building an autograd graph, then
    # restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            predictions = model(xs).to("cpu").argmax(axis=1)
    finally:
        model.train(was_training)
    print(np.mean((predictions == ys).numpy()))

In [29]:
# Build the held-out features the same way as the training features and
# report accuracy on them.
test_xs, test_ys = build_features_and_ys(
    test_dataset_df,
    ['selftext_glove_300'],
)
get_model_accuracy(trained_model, test_xs, test_ys)

0.5758957654723127
