# Experiments for CS224U Project

## Setup

### Imports

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
import utils
import sst
import scipy.stats
from sgd_classifier import BasicSGDClassifier

In [3]:
from sklearn.linear_model import LogisticRegression
import os

In [4]:
import tensorflow as tf
from tf_rnn_classifier import TfRNNClassifier

  from ._conv import register_converters as _register_converters


In [5]:
vsmdata_home = 'vsmdata'

glove_home = os.path.join(vsmdata_home, 'glove.6B')

In [6]:
import numpy as np

### Dataset

In [7]:
def read_array_from_csv(inputcsv):
    out = []
    with open(inputcsv, encoding='utf-8') as f:
        reader = csv.reader(f)
        for q in reader:
            out.append(q)
    return out

In [8]:
new_train = read_array_from_csv('data/train_data.csv')

In [9]:
anon_new_train = read_array_from_csv('data/anon_train_data.csv')

In [10]:
anon_new_test = read_array_from_csv('data/anon_test_data.csv')

### SST Machinery

In [11]:
def hansard_reader(
        src_filename,
        class_func=None):
    """Overview

    Parameters
    ----------
    src_filename : str
        Full path to the file to be read.
    class_func : None, or function mapping labels to labels or None
        If this is None, then the original 5-way labels are returned.
        Other options: `binary_class_func` and `ternary_class_func`
        (or you could write your own).


    Yields
    ------
    (tree, label)
        nltk.Tree, str in {'0','1','2','3','4'}

    """
    if class_func is None:
        class_func = lambda x: x
    with open(src_filename, encoding='utf-8') as f:
        reader = csv.reader(f)
        for q in reader:
            yield (q[0], class_func(q[1]))

We need a reader for each dataset, both for train and for test.

First, the standard data:

In [12]:
def train_reader(**kwargs):
    """Convenience function for reading the train file, full-trees only."""
    src = 'data/train_data.csv'
    return hansard_reader(src,**kwargs)

Next, the anonymised data:

In [13]:
def anon_train_reader(**kwargs):
    src = 'data/anon_train_data.csv'
    return hansard_reader(src,**kwargs)

The test readers won't be used until the *very* end.

In [14]:
def cas_to_gov(label):
    if label == 'cas':
        return 'gov'
    else:
        return label

In [33]:
train_data = [(question, label) for question, label in anon_train_reader(class_func=cas_to_gov)]
X, y = zip(*train_data)
X = list(X)
y = list(y)

## MLP Classifier

First, load up some stuff from sklearn

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.neural_network import MLPClassifier

In [18]:
# Specify the vectorising function
vectorizer = CountVectorizer()

In [46]:
X_vec = vectorizer.fit_transform(X)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y)

In [86]:
mlp = MLPClassifier(hidden_layer_sizes=(4,4,4),max_iter=2000)

In [87]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(4, 4, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [88]:
predictions = mlp.predict(X_test)

In [91]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        gov       0.63      0.57      0.60       899
        opp       0.78      0.81      0.80      1643

avg / total       0.72      0.73      0.73      2542

