# Experiments for CS224U Project

## Setup

### Imports

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
import utils
import sst
import scipy.stats
from sgd_classifier import BasicSGDClassifier

In [3]:
from sklearn.linear_model import LogisticRegression
import os

In [4]:
import tensorflow as tf
from tf_rnn_classifier import TfRNNClassifier

  from ._conv import register_converters as _register_converters


In [5]:
# Base data directory, and the 'glove.6B' subdirectory located inside it.
vsmdata_home = 'vsmdata'
glove_home = os.path.join(vsmdata_home, 'glove.6B')

In [6]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

### Dataset

In [7]:
def read_array_from_csv(inputcsv):
    """Read a whole CSV file into memory.

    Parameters
    ----------
    inputcsv : str
        Path to a UTF-8 encoded CSV file.

    Returns
    -------
    list of list of str
        One inner list per CSV row, in file order, holding that row's fields.
    """
    # newline='' hands line-ending handling to the csv module, so that line
    # breaks embedded in quoted fields are returned exactly as stored.
    with open(inputcsv, encoding='utf-8', newline='') as f:
        return list(csv.reader(f))

In [8]:
# Rows of the standard training CSV, as a list of lists of field strings.
new_train = read_array_from_csv('data/train_data.csv')

In [9]:
# Rows of the anonymised training CSV, as a list of lists of field strings.
anon_new_train = read_array_from_csv('data/anon_train_data.csv')

In [10]:
# Rows of the anonymised test CSV, as a list of lists of field strings.
anon_new_test = read_array_from_csv('data/anon_test_data.csv')

### SST Machinery

In [11]:
def hansard_reader(
        src_filename,
        class_func=None):
    """Iterate over the (text, label) pairs stored in a CSV file.

    Parameters
    ----------
    src_filename : str
        Full path to the UTF-8 encoded CSV file to be read. The first
        column of each row is used as the example text and the second
        column as its label; any further columns are ignored.
    class_func : None, or function mapping labels to labels
        Applied to every raw label before it is yielded. If this is
        None, the labels are returned exactly as they appear in the file.

    Yields
    ------
    (text, label)
        str, and whatever `class_func` returns for the raw label string.

    """
    if class_func is None:
        class_func = lambda x: x
    # newline='' hands line-ending handling to the csv module, so that line
    # breaks embedded in quoted text fields are returned exactly as stored.
    with open(src_filename, encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            # csv.reader produces an empty list for a blank line (e.g. a
            # trailing one); there is no example on such a line, so skip it.
            if not row:
                continue
            yield (row[0], class_func(row[1]))

We need a reader for each dataset, both for train and for test.

First, the standard data:

In [12]:
def train_reader(**kwargs):
    """Return a `hansard_reader` generator over the standard training CSV.

    Keyword arguments are forwarded to `hansard_reader` unchanged.
    """
    return hansard_reader('data/train_data.csv', **kwargs)

In [13]:
def test_reader(**kwargs):
    """Return a `hansard_reader` generator over the standard test CSV.

    Keyword arguments are forwarded to `hansard_reader` unchanged.
    """
    return hansard_reader('data/test_data.csv', **kwargs)

Next, the anonymised data:

In [14]:
def anon_train_reader(**kwargs):
    """Return a `hansard_reader` generator over the anonymised training CSV.

    Keyword arguments are forwarded to `hansard_reader` unchanged.
    """
    return hansard_reader('data/anon_train_data.csv', **kwargs)

In [15]:
def anon_test_reader(**kwargs):
    """Return a `hansard_reader` generator over the anonymised test CSV.

    Keyword arguments are forwarded to `hansard_reader` unchanged.
    """
    return hansard_reader('data/anon_test_data.csv', **kwargs)

The test readers won't be used until the *very* end.

### Class Functions

In [16]:
def cas_to_gov(label):
    """Map the 'cas' label to 'gov'; return every other label unchanged."""
    return 'gov' if label == 'cas' else label

## Random Forest

Set up a grid search for the random forest classifier.

### Feature Functions

A unigrams feature function

In [17]:
def unigrams_phi(question):
    """Unigram count features for a question.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        A map from each whitespace-delimited token of `question` (case
        preserved) to the number of times it occurs.
    """
    counts = {}
    for token in question.split():
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

A bigrams feature function

In [19]:
def bigrams_phi(question):
    """Bigram count features for a question.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        A map from each pair of adjacent whitespace-delimited tokens,
        joined as 'first_second', to the number of times that pair occurs.
        Empty when `question` has fewer than two tokens.

    """
    tokens = question.split()
    counts = {}
    # Pair every token with its right-hand neighbour.
    for left, right in zip(tokens, tokens[1:]):
        key = left + '_' + right
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

A basic bag-of-words unigrams and bigrams feature function

In [20]:
def uni_bigrams_phi(question):
    """Combined unigram and bigram count features for a question.

    Returns a single dict holding the entries of `unigrams_phi(question)`
    followed by those of `bigrams_phi(question)`; if a key occurs in both,
    the bigram count is the one kept.
    """
    return {**unigrams_phi(question), **bigrams_phi(question)}

We found that *friend* seems to be a good indicator. What happens if we only give the classifier that feature? Or unigrams without it?

In [21]:
def only_friend_phi(question):
    """Single binary feature: does the question contain the token 'friend'?

    The match is case-insensitive and applies to whole whitespace-delimited
    tokens only. Returns {'friend': 1} when found, {'friend': 0} otherwise.
    """
    has_friend = 'friend' in question.lower().split()
    return {'friend': int(has_friend)}

In [22]:
def no_friends_phi(question):
    """Lower-cased unigram count features with the token 'friend' removed.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        A map from each lower-cased, whitespace-delimited token of
        `question`, other than 'friend', to the number of times it occurs.
    """
    unigrams = {}
    for word in question.lower().split():
        # Compare by value: an identity test (`is not`) against a string
        # literal does not check equality, so it would let 'friend' through.
        if word != 'friend':
            unigrams[word] = unigrams.get(word, 0) + 1
    return unigrams

### Build Data

In [18]:
# Build the training dataset from the anonymised training reader, using
# unigram count features and folding the 'cas' label into 'gov'.
train = sst.build_dataset(
    anon_train_reader,
    unigrams_phi,
    cas_to_gov,
    vectorize=True)

In [19]:
# Hold out 30% of the training examples as an assessment set.
# The split is taken directly from `train` rather than from a previous
# X_train, so re-running this cell never splits already-split data, and the
# fixed random_state makes the partition the same on every run.
X_train, X_assess, y_train, y_assess = train_test_split(
    train['X'], train['y'], train_size=0.7, test_size=None, random_state=42)

### Set up Grid Search

In [20]:
# Hyperparameter grid for the random forest: 4 * 1 * 1 * 2 * 3 = 24 candidates.
parameters = dict(
    n_estimators=[20, 25, 35, 50],
    max_features=[None],
    n_jobs=[-1],
    bootstrap=[True, False],
    min_samples_leaf=[1, 5, 10],
)

In [21]:
# Search over `parameters` for a random forest, scoring each candidate by
# macro-averaged F1 and logging every fit (verbose=3).
grid_classifier = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    scoring='f1_macro',
    verbose=3,
)

### Run Experiment

In [None]:
# Run the grid search on the training split. This is slow: the logged output
# below shows individual fits taking several minutes each.
grid_classifier.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=20, n_jobs=-1 
[CV]  bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=20, n_jobs=-1, score=0.6929118055201214, total= 3.2min
[CV] bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=20, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.2min remaining:    0.0s


[CV]  bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=20, n_jobs=-1, score=0.6820652451577033, total= 3.0min
[CV] bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=20, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  6.3min remaining:    0.0s


[CV]  bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=20, n_jobs=-1, score=0.6874472329911365, total= 2.9min
[CV] bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1 
[CV]  bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1, score=0.6868122774228942, total= 4.0min
[CV] bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1 
[CV]  bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1, score=0.6873138957816377, total= 3.9min
[CV] bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1 
[CV]  bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1, score=0.6913390357571356, total= 3.7min
[CV] bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1 
[CV]  bootstrap=True, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1, score=0.6969936996676203, total= 5.6min
[CV

[CV]  bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1, score=0.6535535201721939, total= 5.6min
[CV] bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1 
[CV]  bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=25, n_jobs=-1, score=0.6412627948604273, total= 5.2min
[CV] bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1 
[CV]  bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1, score=0.6544231583864826, total= 7.3min
[CV] bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1 
[CV]  bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1, score=0.6597118696181545, total= 7.8min
[CV] bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1 
[CV]  bootstrap=False, max_features=None, min_samples_leaf=1, n_estimators=35, n_jobs=-1, score=0.6436830191514742, total= 7

### Get Predictions

In [28]:
# Report the winning hyperparameters and their cross-validated macro-F1,
# then evaluate the best estimator on the held-out assessment split.
print("Best params", grid_classifier.best_params_)
print("Best score: %0.03f" % grid_classifier.best_score_)
final_mod = grid_classifier.best_estimator_
predictions = final_mod.predict(X_assess)
# Use the accuracy_score imported from sklearn.metrics at the top of the
# notebook instead of reaching for it through the sst module's namespace.
print('Accuracy: %0.03f' % accuracy_score(y_assess, predictions))
print(classification_report(y_assess, predictions, digits=3))

Best params {'bootstrap': True, 'max_features': None, 'n_estimators': 20, 'n_jobs': -1}
Best score: 0.699
Accuracy: 0.742
             precision    recall  f1-score   support

        gov      0.619     0.573     0.595      1009
        opp      0.796     0.826     0.811      2041

avg / total      0.738     0.742     0.739      3050

