# Experiments for CS224U Project

## Setup

### Imports

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
import utils
import sst
import scipy.stats
from sgd_classifier import BasicSGDClassifier

In [3]:
from sklearn.linear_model import LogisticRegression
import os

In [4]:
import tensorflow as tf
from tf_rnn_classifier import TfRNNClassifier

  from ._conv import register_converters as _register_converters


In [5]:
vsmdata_home = 'vsmdata'

glove_home = os.path.join(vsmdata_home, 'glove.6B')

In [18]:
import numpy as np

### Dataset

In [6]:
def read_array_from_csv(inputcsv):
    out = []
    with open(inputcsv, encoding='utf-8') as f:
        reader = csv.reader(f)
        for q in reader:
            out.append(q)
    return out

In [7]:
new_train = read_array_from_csv('data/train_data.csv')

In [8]:
anon_new_train = read_array_from_csv('data/anon_train_data.csv')

In [9]:
anon_new_test = read_array_from_csv('data/anon_test_data.csv')

### SST Machinery

In [10]:
def hansard_reader(
        src_filename,
        class_func=None):
    """Overview

    Parameters
    ----------
    src_filename : str
        Full path to the file to be read.
    class_func : None, or function mapping labels to labels or None
        If this is None, then the original 5-way labels are returned.
        Other options: `binary_class_func` and `ternary_class_func`
        (or you could write your own).


    Yields
    ------
    (tree, label)
        nltk.Tree, str in {'0','1','2','3','4'}

    """
    if class_func is None:
        class_func = lambda x: x
    with open(src_filename, encoding='utf-8') as f:
        reader = csv.reader(f)
        for q in reader:
            yield (q[0], class_func(q[1]))

We need a reader for each dataset, both for train and for test.

First, the standard data:

In [11]:
def train_reader(**kwargs):
    """Convenience function for reading the train file, full-trees only."""
    src = 'data/train_data.csv'
    return hansard_reader(src,**kwargs)

In [12]:
def test_reader(**kwargs):
    """Convenience function for reading the train file, full-trees only."""
    src = 'data/test_data.csv'
    return hansard_reader(src,**kwargs)

Next, the anonymised data:

In [13]:
def anon_train_reader(**kwargs):
    src = 'data/anon_train_data.csv'
    return hansard_reader(src,**kwargs)

In [14]:
def anon_test_reader(**kwargs):
    src = 'data/anon_test_data.csv'
    return hansard_reader(src,**kwargs)

The test readers won't be used until the *very* end.

### Class Functions

In [15]:
def cas_to_gov(label):
    if label == 'cas':
        return 'gov'
    else:
        return label

## Baselines

### Bag-of-words Feature Functions

A unigrams feature function

In [28]:
def unigrams_phi(question):
    """The basis for a unigrams feature function.
    Parameters
    ----------
    question : string
        The question to represent.
    
    Returns
    -------    
    defaultdict
        A map from strings to their counts in the question. (Counter maps a 
        list to a dict of counts of the elements in that list.)
    """
    unigrams = {}
    for word in question.split() :
        unigrams[word] = unigrams.get(word, 0) + 1
    return unigrams

A bigrams feature function

In [9]:
def bigrams_phi(question):
    """The basis for a unigrams feature function.
    
    Parameters
    ----------
    tree : nltk.tree
        The tree to represent.
    
    Returns
    -------    
    defaultdict
        A map from strings to their counts in `tree`. (Counter maps a 
        list to a dict of counts of the elements in that list.)
    
    """
    bigrams = {}
    qarray = question.split()
    for i in range(0, len(qarray)-1) :
        big = qarray[i] + '_' + qarray[i+1]
        bigrams[big] = bigrams.get(big, 0) + 1
    return bigrams

A basic bag-of-words unigrams and bigrams feature function

In [11]:
def uni_bigrams_phi(question):
    grams = unigrams_phi(question)
    grams.update(bigrams_phi(question))
    return grams

We found that *friend* seems to be a good indicator. What happens if we only give the classifier that feature? Or unigrams without it?

In [19]:
def only_friend_phi(question):
    if 'friend' in question.lower().split():
        return {'friend':1}
    else:
        return {'friend':0}

In [20]:
def no_friends_phi(question):
    unigrams = {}
    for word in question.split() :
        if word.lower() is not 'friend':
            unigrams[word.lower()] = unigrams.get(word.lower(), 0) + 1
    return unigrams

### SGD Classifier Baseline
Stochastic Gradient Descent

In [23]:
def fit_basic_sgd_classifier(X, y):    
    """Wrapper for `BasicSGDClassifier`.
    
    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.        
    y : list
        The list of labels for rows in `X`.
    
    Returns
    -------
    BasicSGDClassifier
        A trained `BasicSGDClassifier` instance.
    
    """    
    mod = BasicSGDClassifier()
    mod.fit(X, y)
    return mod

#### Experiments

First, an SGD classifier trained on unigrams for the unmodified dataset.

In [29]:
_ = sst.experiment(
    unigrams_phi,
    fit_basic_sgd_classifier,
    train_reader=train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.775
             precision    recall  f1-score   support

        gov      0.707     0.587     0.642      1046
        opp      0.802     0.873     0.836      2004

avg / total      0.770     0.775     0.769      3050



Next, an SGD classifier trained on unigrams for the modified dataset.

In [29]:
_ = sst.experiment(
    unigrams_phi,
    fit_basic_sgd_classifier,
    train_reader=anon_train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.775
             precision    recall  f1-score   support

        gov      0.707     0.587     0.642      1046
        opp      0.802     0.873     0.836      2004

avg / total      0.770     0.775     0.769      3050



Finally, an SGD classifier trained on bigrams from the modified dataset.

In [25]:
_ = sst.experiment(
    bigrams_phi,
    fit_basic_sgd_classifier,
    train_reader=anon_train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.754
             precision    recall  f1-score   support

        gov      0.649     0.618     0.633      1047
        opp      0.805     0.825     0.815      2003

avg / total      0.752     0.754     0.753      3050



### Logistic Regression Baseline

In [27]:
def fit_maxent_classifier(X, y):   
    mod = LogisticRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

#### Experiments

Again, we start with unigrams for the basic dataset.

In [35]:
_ = sst.experiment(
    unigrams_phi,
    fit_maxent_classifier,
    train_reader=train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.782
             precision    recall  f1-score   support

        gov      0.750     0.577     0.652      1079
        opp      0.795     0.894     0.842      1971

avg / total      0.779     0.782     0.775      3050



Now unigrams for the modified dataset.

In [31]:
_ = sst.experiment(
    unigrams_phi,
    fit_maxent_classifier,
    train_reader=anon_train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.769
             precision    recall  f1-score   support

        gov      0.703     0.595     0.644      1072
        opp      0.797     0.863     0.829      1978

avg / total      0.764     0.769     0.764      3050



Now bigrams on the modified dataset.

In [31]:
_ = sst.experiment(
    bigrams_phi,
    fit_maxent_classifier,
    train_reader=anon_train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.769
             precision    recall  f1-score   support

        gov      0.703     0.595     0.644      1072
        opp      0.797     0.863     0.829      1978

avg / total      0.764     0.769     0.764      3050



### Logistic Regression without using sst.experiment

#### Setting up parameters

First, choose the feature function to use.

In [29]:
phi = unigrams_phi

Next, choose the reader used for testing. (None gives a random split.)

In [36]:
assess_reader = None

If we're doing a split, what size should we train on?

In [38]:
train_size = 0.7

Next, choose a function for the classes. (We probably want cas_to_gov.)

In [30]:
class_func = cas_to_gov

Do we want to vectorise?

In [32]:
vectorize = True

#### Building the Classifier

Which classifier are we to use?

In [61]:
classifier = LogisticRegression(fit_intercept=True)

Make it into a training function.

In [62]:
def train_func(X, y):   
    mod = classifier
    mod.fit(X, y)
    return mod

Build the dataset.

In [33]:
train = sst.build_dataset(train_reader, phi, class_func, vectorize=vectorize)

#### Setting up the experiment

First, get the data into standardised variables.

In [34]:
X_train = train['X']
y_train = train['y']
X_assess = None
y_assess = None

If we're not using an assess_reader, do a split on the training data. Otherwise, read in the assessment dataset.

In [39]:
if assess_reader == None:
     X_train, X_assess, y_train, y_assess = train_test_split(
            X_train, y_train, train_size=train_size, test_size=None)
else:
    # Assessment dataset using the training vectorizer:
    assess = sst.build_dataset(
        assess_reader,
        phi,
        class_func,
        vectorizer=train['vectorizer'],
        vectorize=vectorize)
    X_assess, y_assess = assess['X'], assess['y']

#### Experiment
Train the model.

In [None]:
mod = train_func(X_train, y_train)

Test the model.

In [None]:
predictions = mod.predict(X_assess)

Print a report.

In [46]:
print('Accuracy: %0.03f' % sst.accuracy_score(y_assess, predictions))
print(classification_report(y_assess, predictions, digits=3))

Accuracy: 0.768
             precision    recall  f1-score   support

        gov      0.677     0.603     0.638      1020
        opp      0.807     0.853     0.829      1987

avg / total      0.763     0.768     0.764      3007



## Logistic Regression with GloVe

This should be fairly straightforward. We do Logistic Regression again, but we use GloVe embeddings, rather than the bag of words embeddings from above.

### GloVe Setup

First, we need the GloVe lookup function.

In [16]:
glove_lookup = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.50d.txt'))

Now build a feature function based on the GloVe embeddings. The important parameter here is how we combine vectors for different words.

In [19]:
def vsm_words_phi(sentence, lookup, np_func=np.sum):
    """Represent `sentence` as a combination of the vector of its words.
    
    Parameters
    ----------
    tree : A string   
    lookup : dict
        From words to vectors.
    np_func : function (default: np.sum)
        A numpy matrix operation that can be applied columnwise, 
        like `np.mean`, `np.sum`, or `np.prod`. The requirement is that 
        the function take `axis=0` as one of its arguments (to ensure
        columnwise combination) and that it return a vector of a 
        fixed length, no matter what the size of the tree is.
    
    Returns
    -------
    np.array, dimension `X.shape[1]`
            
    """      
    allvecs = np.array([lookup[w] for w in sentence.split() if w in lookup])    
    if len(allvecs) == 0:
        dim = len(next(iter(lookup.values())))
        feats = np.zeros(dim)
    else:       
        feats = np_func(allvecs, axis=0)      
    return feats

In [20]:
def glove_words_phi(sentence, np_func=np.sum):
    return vsm_words_phi(sentence, glove_lookup, np_func=np_func)

#### Getting the Training/Test Data

Build the dataset.

In [47]:
train = sst.build_dataset(train_reader, glove_words_phi, class_func, vectorize=False)

In [48]:
X_train = train['X']
y_train = train['y']
X_assess = None
y_assess = None

If we're not using an assess_reader, do a split on the training data. Otherwise, read in the assessment dataset.

In [49]:
if assess_reader == None:
     X_train, X_assess, y_train, y_assess = train_test_split(
            X_train, y_train, train_size=train_size, test_size=None)
else:
    # Assessment dataset using the training vectorizer:
    assess = sst.build_dataset(
        assess_reader,
        phi,
        class_func,
        vectorizer=train['vectorizer'],
        vectorize=vectorize)
    X_assess, y_assess = assess['X'], assess['y']

### Experiments

For this, we'll do some grid searching. In order to show how to make this work in general, here we'll step through the process.

#### Setting Parameters

We need a base model to do a grid search on.

In [50]:
basemod = LogisticRegression()

We need to specify a parameter grid to search over.

In [51]:
param_grid = {'fit_intercept': [True, False], 
                  'C': [0.4, 0.6, 0.8, 1.0, 2.0, 3.0],
                  'penalty': ['l1','l2']}

How many fold cross-validation? (Default is None.)

In [52]:
cv = 5

What score metric should be used? (Some function is required here, unless the basemod provides its own.). Options include 'f1_macro', 'f1_micro' and 'accuracy'.

In [53]:
scoring = 'f1_macro'

#### Build the GridSearch

In [54]:
grid_classifier = GridSearchCV(basemod, param_grid, cv=cv, scoring=scoring,verbose=10)

In [56]:
grid_classifier.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=0.4, fit_intercept=True, penalty=l1 ...........................
[CV]  C=0.4, fit_intercept=True, penalty=l1, score=0.644543794940576, total=   4.2s
[CV] C=0.4, fit_intercept=True, penalty=l1 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l1, score=0.665227082752856, total=   9.9s
[CV] C=0.4, fit_intercept=True, penalty=l1 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.2s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l1, score=0.6361557844808419, total=   3.5s
[CV] C=0.4, fit_intercept=True, penalty=l1 ...........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.7s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l1, score=0.6396433754924321, total=   3.0s
[CV] C=0.4, fit_intercept=True, penalty=l1 ...........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   20.7s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l1, score=0.6061868716340213, total=   5.6s
[CV] C=0.4, fit_intercept=True, penalty=l2 ...........................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   26.4s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l2, score=0.6427753561094345, total=   0.4s
[CV] C=0.4, fit_intercept=True, penalty=l2 ...........................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   26.8s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l2, score=0.6635259277731884, total=   0.5s
[CV] C=0.4, fit_intercept=True, penalty=l2 ...........................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   27.3s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l2, score=0.6396409552099758, total=   0.4s
[CV] C=0.4, fit_intercept=True, penalty=l2 ...........................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   27.7s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l2, score=0.6396243291592129, total=   0.4s
[CV] C=0.4, fit_intercept=True, penalty=l2 ...........................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   28.1s remaining:    0.0s


[CV]  C=0.4, fit_intercept=True, penalty=l2, score=0.6152210028191704, total=   0.3s
[CV] C=0.4, fit_intercept=False, penalty=l1 ..........................
[CV]  C=0.4, fit_intercept=False, penalty=l1, score=0.651792916481798, total=   3.4s
[CV] C=0.4, fit_intercept=False, penalty=l1 ..........................
[CV]  C=0.4, fit_intercept=False, penalty=l1, score=0.6587110805860805, total=   4.2s
[CV] C=0.4, fit_intercept=False, penalty=l1 ..........................
[CV]  C=0.4, fit_intercept=False, penalty=l1, score=0.6476042467510847, total=   3.6s
[CV] C=0.4, fit_intercept=False, penalty=l1 ..........................
[CV]  C=0.4, fit_intercept=False, penalty=l1, score=0.6540214239214063, total=   3.0s
[CV] C=0.4, fit_intercept=False, penalty=l1 ..........................
[CV]  C=0.4, fit_intercept=False, penalty=l1, score=0.6332325888017853, total=   3.2s
[CV] C=0.4, fit_intercept=False, penalty=l2 ..........................
[CV]  C=0.4, fit_intercept=False, penalty=l2, score=0.649037

[CV]  C=1.0, fit_intercept=True, penalty=l1, score=0.6372937496921637, total=   3.7s
[CV] C=1.0, fit_intercept=True, penalty=l1 ...........................
[CV]  C=1.0, fit_intercept=True, penalty=l1, score=0.6384336216669436, total=   3.2s
[CV] C=1.0, fit_intercept=True, penalty=l1 ...........................
[CV]  C=1.0, fit_intercept=True, penalty=l1, score=0.6104961338101258, total=   5.5s
[CV] C=1.0, fit_intercept=True, penalty=l2 ...........................
[CV]  C=1.0, fit_intercept=True, penalty=l2, score=0.6422003741591371, total=   0.2s
[CV] C=1.0, fit_intercept=True, penalty=l2 ...........................
[CV]  C=1.0, fit_intercept=True, penalty=l2, score=0.6622281832519901, total=   0.2s
[CV] C=1.0, fit_intercept=True, penalty=l2 ...........................
[CV]  C=1.0, fit_intercept=True, penalty=l2, score=0.6396409552099758, total=   0.2s
[CV] C=1.0, fit_intercept=True, penalty=l2 ...........................
[CV]  C=1.0, fit_intercept=True, penalty=l2, score=0.63962432915

[CV]  C=3.0, fit_intercept=False, penalty=l2, score=0.6501148100552043, total=   0.3s
[CV] C=3.0, fit_intercept=False, penalty=l2 ..........................
[CV]  C=3.0, fit_intercept=False, penalty=l2, score=0.6562824012860635, total=   0.3s
[CV] C=3.0, fit_intercept=False, penalty=l2 ..........................
[CV]  C=3.0, fit_intercept=False, penalty=l2, score=0.6508159958671864, total=   0.2s
[CV] C=3.0, fit_intercept=False, penalty=l2 ..........................
[CV]  C=3.0, fit_intercept=False, penalty=l2, score=0.6571676551424852, total=   0.3s
[CV] C=3.0, fit_intercept=False, penalty=l2 ..........................
[CV]  C=3.0, fit_intercept=False, penalty=l2, score=0.6308084373242723, total=   0.3s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  7.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'fit_intercept': [True, False], 'C': [0.4, 0.6, 0.8, 1.0, 2.0, 3.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=10)

#### Get some reports

In [58]:
print("Best params", grid_classifier.best_params_)
print("Best score: %0.03f" % grid_classifier.best_score_)

Best params {'C': 0.8, 'fit_intercept': False, 'penalty': 'l1'}
Best score: 0.651


#### Final Model

In [59]:
final_mod = grid_classifier.best_estimator_

Test the model.

In [60]:
predictions = final_mod.predict(X_assess)

Print a report.

In [61]:
print('Accuracy: %0.03f' % sst.accuracy_score(y_assess, predictions))
print(classification_report(y_assess, predictions, digits=3))

Accuracy: 0.708
             precision    recall  f1-score   support

        gov      0.613     0.424     0.501      1055
        opp      0.738     0.859     0.794      1995

avg / total      0.695     0.708     0.693      3050



#### Combined Function

In [62]:
def run_logistic_grid(np_func=np.sum):
    def glove_words_phi(sentence, np_func=np.sum):
        return vsm_words_phi(sentence, glove_lookup, np_func=np_func)
    train = sst.build_dataset(train_reader, glove_words_phi, class_func, vectorize=False)
    X_train = train['X']
    y_train = train['y']
    X_assess = None
    y_assess = None
    if assess_reader == None:
         X_train, X_assess, y_train, y_assess = train_test_split(
                X_train, y_train, train_size=train_size, test_size=None)
    else:
        # Assessment dataset using the training vectorizer:
        assess = sst.build_dataset(
            assess_reader,
            phi,
            class_func,
            vectorizer=train['vectorizer'],
            vectorize=vectorize)
        X_assess, y_assess = assess['X'], assess['y']
    
    grid_classifier = GridSearchCV(LogisticRegression(), {'fit_intercept': [True, False], 
                  'C': [0.4, 0.6, 0.8, 1.0, 2.0, 3.0],
                  'penalty': ['l1','l2']}, cv=5, scoring='f1_macro',verbose=1)
    grid_classifier.fit(X_train, y_train)
    print("Best params", grid_classifier.best_params_)
    print("Best score: %0.03f" % grid_classifier.best_score_)
    final_mod = grid_classifier.best_estimator_
    predictions = final_mod.predict(X_assess)
    print('Accuracy: %0.03f' % sst.accuracy_score(y_assess, predictions))
    print(classification_report(y_assess, predictions, digits=3))

In [63]:
run_logistic_grid(np.add)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  4.9min finished


Best params {'C': 0.4, 'fit_intercept': False, 'penalty': 'l1'}
Best score: 0.649
Accuracy: 0.711
             precision    recall  f1-score   support

        gov      0.624     0.400     0.487      1047
        opp      0.736     0.874     0.799      2003

avg / total      0.697     0.711     0.692      3050



In [64]:
run_logistic_grid(np.prod)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  5.3min finished


Best params {'C': 1.0, 'fit_intercept': False, 'penalty': 'l1'}
Best score: 0.641
Accuracy: 0.711
             precision    recall  f1-score   support

        gov      0.643     0.411     0.502      1079
        opp      0.731     0.875     0.796      1971

avg / total      0.700     0.711     0.692      3050



In [65]:
run_logistic_grid(np.multiply)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  5.5min finished


Best params {'C': 2.0, 'fit_intercept': False, 'penalty': 'l2'}
Best score: 0.643
Accuracy: 0.696
             precision    recall  f1-score   support

        gov      0.623     0.405     0.491      1105
        opp      0.718     0.861     0.783      1945

avg / total      0.684     0.696     0.677      3050



In [66]:
run_logistic_grid(np.mod)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 10.3min finished


Best params {'C': 0.6, 'fit_intercept': False, 'penalty': 'l1'}
Best score: 0.640
Accuracy: 0.712
             precision    recall  f1-score   support

        gov      0.629     0.439     0.517      1071
        opp      0.739     0.860     0.795      1979

avg / total      0.700     0.712     0.697      3050



## Recurrent Neural Network

This should use the tensorflow RNN set-up.

### Build the Input Vectors

In [46]:
train_data = [(question.split(), label) for question, label in train_reader(class_func=cas_to_gov)]
X, y = zip(*train_data)
X_rnn_train = list(X)
y_rnn_train = list(y)

Since we don't have a devoted dev set, make a 70-30 split in the training data

In [47]:
X_rnn_train, X_rnn_assess, y_rnn_train, y_rnn_assess = train_test_split(X_rnn_train, y_rnn_train, train_size=0.7, test_size=None)

### Analyse the Inputs

The tensorflow implementation requires that we specify a maximum length up front.

In [48]:
utils.sequence_length_report(X_rnn_train, potential_max_length=150)

Max sequence length: 1,123
Min sequence length: 0
Mean sequence length: 71.78
Median sequence length: 69.00
Sequences longer than 150: 225 of 7,115


Based on this, we might take this maximum length to be 150.

#### Get the vocab

In [49]:
hansard_full_train_vocab = sst.get_vocab(X_rnn_train)

In [52]:
print("hansard_full_train_vocab has {:,} items".format(len(hansard_full_train_vocab)))

hansard_full_train_vocab has 35,337 items


In [53]:
hansard_train_vocab = sst.get_vocab(X_rnn_train, n_words=5000)

### Experiments

#### Random Embeddings

In [83]:
tf_rnn = TfRNNClassifier(
    hansard_train_vocab,
    embed_dim=50,
    hidden_dim=50,
    max_length=150,
    hidden_activation=tf.nn.tanh,
    cell_class=tf.nn.rnn_cell.LSTMCell,
    train_embedding=True,
    max_iter=10,
    eta=0.05) 

In [84]:
_ = tf_rnn.fit(X_rnn_train, y_rnn_train)

Iteration 10: loss: 4.548222303390503

In [85]:
tf_rnn_dev_predictions = tf_rnn.predict(X_rnn_assess)

In [88]:
print(classification_report(y_rnn_assess, tf_rnn_dev_predictions))

             precision    recall  f1-score   support

        gov       0.00      0.00      0.00      1039
        opp       0.66      1.00      0.79      2011

avg / total       0.43      0.66      0.52      3050



## Convolutional Neural Network

## Long/Short Term Memory Neural Network