# Experiments for CS224U Project

## Setup

### Imports

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
import utils
import sst
import scipy.stats
from sgd_classifier import BasicSGDClassifier

In [3]:
from sklearn.linear_model import LogisticRegression
import os

In [4]:
import tensorflow as tf
from tf_rnn_classifier import TfRNNClassifier

  from ._conv import register_converters as _register_converters


In [5]:
# Root directory of the VSM data, given relative to the working directory.
vsmdata_home = 'vsmdata'

# Sub-directory of `vsmdata_home` from which the GloVe file is loaded later on.
glove_home = os.path.join(vsmdata_home, 'glove.6B')

In [6]:
import numpy as np

### Dataset

In [7]:
def read_array_from_csv(inputcsv):
    """Read a whole CSV file into memory as a list of rows.

    Parameters
    ----------
    inputcsv : str
        Path to a UTF-8 encoded CSV file.

    Returns
    -------
    list of list of str
        One inner list per CSV record, holding that record's fields.

    """
    # `newline=''` is what the csv module asks for: it disables the file
    # object's newline translation so that line endings embedded in quoted
    # fields reach the caller unchanged.
    with open(inputcsv, encoding='utf-8', newline='') as f:
        return list(csv.reader(f))

In [8]:
# Rows of the standard training CSV, each row a list of string fields.
new_train = read_array_from_csv('data/train_data.csv')

In [9]:
# Rows of the anonymised training CSV, each row a list of string fields.
anon_new_train = read_array_from_csv('data/anon_train_data.csv')

In [10]:
# Rows of the anonymised test CSV, each row a list of string fields.
anon_new_test = read_array_from_csv('data/anon_test_data.csv')

### SST Machinery

In [11]:
def hansard_reader(
        src_filename,
        class_func=None):
    """Iterate over the (text, label) examples stored in a CSV file.

    Parameters
    ----------
    src_filename : str
        Full path to the UTF-8 encoded CSV file to be read. The first
        column of each row is taken as the example text and the second
        column as its label.
    class_func : None, or function mapping labels to labels
        Applied to every raw label before it is yielded. If this is None,
        then the labels are returned exactly as they appear in the file.
        `cas_to_gov` is one option (or you could write your own).

    Yields
    ------
    (text, label)
        str, and whatever `class_func` returns for the raw label string.

    """
    if class_func is None:
        class_func = lambda x: x
    # `newline=''` is what the csv module asks for: without it, line endings
    # embedded in quoted fields are rewritten by the file object.
    with open(src_filename, encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            yield (row[0], class_func(row[1]))

We need a reader for each dataset, both for train and for test.

First, the standard data:

In [12]:
def train_reader(**kwargs):
    """Return a `hansard_reader` over the standard training CSV.

    Keyword arguments (e.g. `class_func`) are passed straight through.
    """
    return hansard_reader('data/train_data.csv', **kwargs)

In [13]:
def test_reader(**kwargs):
    """Return a `hansard_reader` over the standard test CSV.

    Keyword arguments (e.g. `class_func`) are passed straight through.
    """
    return hansard_reader('data/test_data.csv', **kwargs)

Next, the anonymised data:

In [14]:
def anon_train_reader(**kwargs):
    """Return a `hansard_reader` over the anonymised training CSV.

    Keyword arguments (e.g. `class_func`) are passed straight through.
    """
    return hansard_reader('data/anon_train_data.csv', **kwargs)

In [15]:
def anon_test_reader(**kwargs):
    """Return a `hansard_reader` over the anonymised test CSV.

    Keyword arguments (e.g. `class_func`) are passed straight through.
    """
    return hansard_reader('data/anon_test_data.csv', **kwargs)

The test readers won't be used until the *very* end.

### Class Functions

In [16]:
def cas_to_gov(label):
    """Map the 'cas' label onto 'gov'; return any other label unchanged."""
    return 'gov' if label == 'cas' else label

## Recurrent Neural Network

This section uses the TensorFlow RNN set-up (`TfRNNClassifier`).

### Build the Input Vectors

In [27]:
# Whitespace-tokenise every anonymised training question; `cas_to_gov`
# merges the 'cas' label into 'gov' while the file is being read.
train_data = [(question.split(), label) for question, label in anon_train_reader(class_func=cas_to_gov)]
# Unzip the (tokens, label) pairs into two parallel sequences, then convert
# the resulting tuples to lists.
X, y = zip(*train_data)
X_rnn_train = list(X)
y_rnn_train = list(y)

In [28]:
# Same preparation as for the training data, but read from the anonymised
# test file. NOTE(review): this is the set the models below are assessed on.
test_data = [(question.split(), label) for question, label in anon_test_reader(class_func=cas_to_gov)]
# `X` and `y` are reused here and overwrite the training-set values.
X, y = zip(*test_data)
X_rnn_assess = list(X)
y_rnn_assess = list(y)

### Analyse the Inputs

The TensorFlow implementation requires that we specify a maximum sequence length up front.

In [29]:
# Print length statistics for the tokenised training questions, including
# how many of them are longer than the candidate maximum of 150.
utils.sequence_length_report(X_rnn_train, potential_max_length=150)

Max sequence length: 1,123
Min sequence length: 0
Mean sequence length: 70.29
Median sequence length: 68.00
Sequences longer than 150: 293 of 10,165


Only 293 of the 10,165 training sequences are longer than 150 tokens, so we take 150 as the maximum length.

#### Get the vocab

In [30]:
# Vocabulary built from the training questions with no size cap passed.
hansard_full_train_vocab = sst.get_vocab(X_rnn_train)

In [31]:
# Report the size of the uncapped vocabulary, with thousands separators.
print("hansard_full_train_vocab has {:,} items".format(len(hansard_full_train_vocab)))

hansard_full_train_vocab has 43,131 items


In [32]:
# Capped vocabulary used for the experiments below.
# NOTE(review): presumably `n_words=5000` keeps the 5,000 most frequent
# words -- confirm against `sst.get_vocab`.
hansard_train_vocab = sst.get_vocab(X_rnn_train, n_words=5000)

In [34]:
# Number of training examples.
len(X_rnn_train)

10165

### Experiments

#### GloVe Embeddings

In [27]:
# Load the GloVe file `glove.6B.50d.txt` into a lookup keyed by word.
glove_lookup = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.50d.txt'))
# Keep only the words of the capped training vocabulary that also have a
# GloVe entry, in sorted order.
sst_glove_vocab = sorted(set(glove_lookup) & set(hansard_train_vocab))

In [28]:
# Stack the GloVe vectors in vocabulary order, so that row i holds the
# vector looked up for sst_glove_vocab[i].
glove_embedding = np.array([glove_lookup[w] for w in sst_glove_vocab])

In [29]:
# Add $UNK and its random representation.
#
# The guard keeps this cell idempotent: without it, re-running the cell
# appends a second "$UNK" entry to the vocabulary and stacks a second
# random row onto the embedding.
if "$UNK" not in sst_glove_vocab:
    sst_glove_vocab.append("$UNK")

    # One extra row, with the same width as the existing embedding rows.
    glove_embedding = np.vstack(
        (glove_embedding, utils.randvec(glove_embedding.shape[1])))

In [30]:
# RNN classifier with an LSTM cell, initialised from the GloVe embedding.
# `max_length=150` is the cut-off chosen from the sequence-length report.
glove_tf_rnn = TfRNNClassifier(
    sst_glove_vocab,
    embedding=glove_embedding,
    train_embedding=True,
    cell_class=tf.nn.rnn_cell.LSTMCell,
    hidden_activation=tf.nn.relu,
    hidden_dim=80,
    max_length=150,
    eta=0.05,
    max_iter=500)

In [31]:
# Train on the anonymised training set. The return value is bound to `_`,
# so the cell has no echoed result.
_ = glove_tf_rnn.fit(X_rnn_train, y_rnn_train)

Iteration 500: loss: 3.5648308098316193

In [32]:
# Predict labels for the assessment set built from the anonymised test file.
tf_rnn_dev_predictions = glove_tf_rnn.predict(X_rnn_assess)

In [33]:
# Per-class precision / recall / F1 of the predictions against the gold labels.
print(classification_report(y_rnn_assess, tf_rnn_dev_predictions))

             precision    recall  f1-score   support

        gov       0.69      0.41      0.52      1068
        opp       0.74      0.90      0.81      1982

avg / total       0.72      0.73      0.71      3050



Now with a different RNN cell: `BasicRNNCell` instead of `LSTMCell`.

In [34]:
# Same configuration as the LSTM model, but with a BasicRNNCell.
# This rebinds `glove_tf_rnn`, replacing the previously trained model.
glove_tf_rnn = TfRNNClassifier(
    sst_glove_vocab,
    embedding=glove_embedding,
    train_embedding=True,
    cell_class=tf.nn.rnn_cell.BasicRNNCell,
    hidden_activation=tf.nn.relu,
    hidden_dim=80,
    max_length=150,
    eta=0.05,
    max_iter=500)

In [35]:
# Train the BasicRNNCell model on the same anonymised training set.
_ = glove_tf_rnn.fit(X_rnn_train, y_rnn_train)

Iteration 500: loss: 2.6886065602302557

In [36]:
# Predict on the same assessment set; this overwrites the predictions
# produced by the LSTM model.
tf_rnn_dev_predictions = glove_tf_rnn.predict(X_rnn_assess)

In [37]:
# Per-class precision / recall / F1 for the BasicRNNCell model.
print(classification_report(y_rnn_assess, tf_rnn_dev_predictions))

             precision    recall  f1-score   support

        gov       0.50      0.40      0.44      1068
        opp       0.71      0.78      0.74      1982

avg / total       0.63      0.65      0.64      3050

