# Experiments for CS224U Project

## Setup

### Imports

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [9]:
import utils
import sst
import scipy.stats
from sgd_classifier import BasicSGDClassifier

In [10]:
from sklearn.linear_model import LogisticRegression
import os

In [11]:
import numpy as np

### Dataset

In [12]:
def read_array_from_csv(inputcsv):
    """Read every row of a CSV file into a list.

    Parameters
    ----------
    inputcsv : str
        Path to a UTF-8 encoded CSV file.

    Returns
    -------
    list of list of str
        One inner list per CSV row, in file order.
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded in quoted fields come back exactly as they were written.
    with open(inputcsv, encoding='utf-8', newline='') as f:
        return list(csv.reader(f))

In [13]:
# Rows of the standard training CSV, each row a list of column strings.
new_train = read_array_from_csv('data/train_data.csv')

In [14]:
# Rows of the anonymised training CSV, each row a list of column strings.
anon_new_train = read_array_from_csv('data/anon_train_data.csv')

In [15]:
# Rows of the anonymised test CSV, each row a list of column strings.
anon_new_test = read_array_from_csv('data/anon_test_data.csv')

### SST Machinery

In [16]:
def hansard_reader(
        src_filename,
        class_func=None):
    """Iterate over the (question, label) pairs stored in a CSV file.

    Parameters
    ----------
    src_filename : str
        Full path to a UTF-8 CSV file. The first column of each row is
        used as the question text and the second column as its label.
    class_func : None, or function mapping labels to labels
        Applied to every raw label before it is yielded. If this is
        None, the labels are yielded exactly as they appear in the file.

    Yields
    ------
    (question, label)
        The question string and the (possibly remapped) label.

    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded in quoted questions are preserved.
    with open(src_filename, encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; indexing one would
            # raise IndexError, so skip them.
            if not row:
                continue
            label = row[1] if class_func is None else class_func(row[1])
            yield (row[0], label)

We need a reader for each dataset, both for train and for test.

First, the standard data:

In [17]:
def train_reader(**kwargs):
    """Read the standard training CSV.

    Keyword arguments are forwarded unchanged to `hansard_reader`.
    """
    return hansard_reader('data/train_data.csv', **kwargs)

In [18]:
def test_reader(**kwargs):
    """Read the standard test CSV.

    Keyword arguments are forwarded unchanged to `hansard_reader`.
    """
    return hansard_reader('data/test_data.csv', **kwargs)

Next, the anonymised data:

In [19]:
def anon_train_reader(**kwargs):
    """Read the anonymised training CSV.

    Keyword arguments are forwarded unchanged to `hansard_reader`.
    """
    return hansard_reader('data/anon_train_data.csv', **kwargs)

In [20]:
def anon_test_reader(**kwargs):
    """Read the anonymised test CSV.

    Keyword arguments are forwarded unchanged to `hansard_reader`.
    """
    return hansard_reader('data/anon_test_data.csv', **kwargs)

The test readers won't be used until the *very* end.

### Class Functions

In [21]:
def cas_to_gov(label):
    """Map the label 'cas' to 'gov'; return every other label unchanged."""
    return 'gov' if label == 'cas' else label

## Baselines

### Bag-of-words Feature Functions

A unigrams feature function

In [22]:
def unigrams_phi(question):
    """Count the whitespace-separated tokens of a question.

    Parameters
    ----------
    question : str
        The question to represent.

    Returns
    -------
    dict
        Maps each token (case preserved) to the number of times it
        occurs in `question`.
    """
    counts = {}
    for token in question.split():
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

A bigrams feature function

In [23]:
def bigrams_phi(question):
    """Count the adjacent token pairs of a question.

    Parameters
    ----------
    question : str
        The question to represent.

    Returns
    -------
    dict
        Maps each bigram, written as the two tokens joined by '_', to
        the number of times it occurs in `question`.

    """
    tokens = question.split()
    counts = {}
    # Pair every token with its right-hand neighbour.
    for left, right in zip(tokens, tokens[1:]):
        key = left + '_' + right
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

A basic bag-of-words unigrams and bigrams feature function

In [24]:
def uni_bigrams_phi(question):
    """Combine the unigram and bigram counts of a question in one dict.

    Unigram features come first. A bigram key that equals a unigram key
    takes the bigram's count.
    """
    return {**unigrams_phi(question), **bigrams_phi(question)}

We found that *friend* seems to be a good indicator. What happens if we only give the classifier that feature? Or unigrams without it?

In [25]:
def only_friend_phi(question):
    """Return a single binary feature: does the token 'friend' occur?

    Matching is case-insensitive and applies to whole whitespace-separated
    tokens only.
    """
    tokens = question.lower().split()
    return {'friend': 1 if 'friend' in tokens else 0}

In [26]:
def no_friends_phi(question):
    """Lower-cased unigram counts with the token 'friend' left out.

    Parameters
    ----------
    question : str
        The question to represent.

    Returns
    -------
    dict
        Maps each lower-cased token other than 'friend' to its count.
    """
    unigrams = {}
    for word in question.split():
        token = word.lower()
        # Compare by value: `is not` tests object identity, which never
        # matches a freshly built string, so nothing was being filtered.
        if token != 'friend':
            unigrams[token] = unigrams.get(token, 0) + 1
    return unigrams

### SGD Classifier Baseline
Stochastic Gradient Descent

### Logistic Regression without using sst.experiment

#### Setting up parameters

First, choose the feature function to use.

In [27]:
phi = unigrams_phi

Next, choose the reader used for testing. (None gives a random split.)

In [28]:
assess_reader = None

If we're doing a split, what size should we train on?

In [29]:
train_size = 0.7

Next, choose a function for the classes. (We probably want cas_to_gov.)

In [30]:
class_func = cas_to_gov

Do we want to vectorise?

In [31]:
vectorize = True

#### Building the Classifier

Which classifier are we to use?

In [32]:
# classifier = LogisticRegression(fit_intercept=True)

Make it into a training function.

In [33]:
def train_func(X, y):
    """Fit the notebook-level `classifier` on (X, y) and return it.

    Relies on the global name `classifier` being defined before the call.
    The global object itself is fitted, not a copy.
    """
    classifier.fit(X, y)
    return classifier

Build the dataset.

In [34]:
# Featurise the training data with `phi` and `class_func`; later cells read
# the 'X', 'y' and 'vectorizer' entries of the result.
train = sst.build_dataset(train_reader, phi, class_func, vectorize=vectorize)

#### Setting up the experiment

First, get the data into standardised variables.

In [34]:
# Start from the full training set; the next cell may replace these with
# a train/assess split.
X_train = train['X']
y_train = train['y']
# Assessment data is filled in by the next cell.
X_assess = None
y_assess = None

If we're not using an assess_reader, do a split on the training data. Otherwise, read in the assessment dataset.

In [39]:
if assess_reader is None:
    # No held-out reader: carve the assessment set out of the training data.
    # Split from train['X'] / train['y'] rather than from X_train / y_train
    # so that re-running this cell does not split an already-split set.
    X_train, X_assess, y_train, y_assess = train_test_split(
        train['X'], train['y'], train_size=train_size, test_size=None)
else:
    # Assessment dataset using the training vectorizer:
    assess = sst.build_dataset(
        assess_reader,
        phi,
        class_func,
        vectorizer=train['vectorizer'],
        vectorize=vectorize)
    X_assess, y_assess = assess['X'], assess['y']

#### Experiment
Train the model.

In [None]:
# mod = train_func(X_train, y_train)

Test the model.

In [None]:
# predictions = mod.predict(X_assess)

Print a report.

In [46]:
# print('Accuracy: %0.03f' % sst.accuracy_score(y_assess, predictions))
# print(classification_report(y_assess, predictions, digits=3))

Accuracy: 0.768
             precision    recall  f1-score   support

        gov      0.677     0.603     0.638      1020
        opp      0.807     0.853     0.829      1987

avg / total      0.763     0.768     0.764      3007



In [49]:
# hansard_full_train_vocab = sst.get_vocab(X_rnn_train)

In [52]:
# print("hansard_full_train_vocab has {:,} items".format(len(hansard_full_train_vocab)))

hansard_full_train_vocab has 35,337 items


In [53]:
# hansard_train_vocab = sst.get_vocab(X_rnn_train, n_words=5000)

### Experiments

## TF-IDF

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
vectorizer = CountVectorizer()

In [48]:
# Keep only the first column (the question text) of each anonymised row.
anon_vec_ready = [item[0] for item in anon_new_train]

In [49]:
anon_vec_ready

[' I am sure that the whole House will join $pronoun in expressing our condolences to Neil and Jennifer Burdett, the parents of two-year-old Faye, who died on Valentine’s day of meningitis B. Since Faye’s death, 815,000 people have signed the petition calling on the Government to vaccinate more children against meningitis B. I am proud that the UK is the first country to have a vaccination programme for meningitis B, but could my $person ensure  that the Government look at what more could be done to prevent more children like Faye dying from this horrid disease?',
 ' The Chancellor of the Exchequer would not answer this question yesterday, so let $pronoun give the $person a try. How many of the so-called new private sector jobs that $pronoun crows about are people on zero-hours contracts?',
 "Following yesterday's debate and the Divisions at the end of it, there is disquiet in all sections of the House about many aspects of the legislation, as the $person properly recognises. Will $pro

In [55]:
# Fit the vectorizer on the anonymised questions and transform them in one
# step; the result feeds the SVD below.
X_train_vec = vectorizer.fit_transform(anon_vec_ready)

In [56]:
import numpy as np
from sklearn.decomposition import TruncatedSVD

In [57]:
# 5-component truncated SVD; random_state is fixed so reruns are repeatable.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

In [59]:
# NOTE(review): as the displayed output shows, `yy` is the TruncatedSVD
# estimator itself, not a reduced matrix — use fit_transform if the
# projection was intended.
yy =svd.fit(X_train_vec)

In [60]:
yy

TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7,
       random_state=42, tol=0.0)

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
# Fit TF-IDF on the anonymised questions and map every vocabulary term to
# its inverse document frequency.
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(anon_vec_ready)
idf = vectorizer.idf_
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
out = dict(zip(feature_names, idf))

In [67]:
import operator

In [68]:
# (term, idf) pairs in ascending idf order, so the lowest-idf terms come first.
sorted_out = sorted(out.items(), key=operator.itemgetter(1))

In [69]:
sorted_out

[('the', 1.0719764359609238),
 ('person', 1.152398003271498),
 ('to', 1.217112199517504),
 ('that', 1.2492788970416058),
 ('of', 1.2592999106906109),
 ('and', 1.302147795819773),
 ('in', 1.3359805220024645),
 ('pronoun', 1.4386681581431275),
 ('will', 1.4980542245283208),
 ('is', 1.5636076124707674),
 ('for', 1.6413916676134666),
 ('on', 1.8730714557436463),
 ('it', 2.0552046176613845),
 ('have', 2.080095045803529),
 ('not', 2.0876554191227825),
 ('my', 2.091457149100141),
 ('are', 2.095273387402595),
 ('this', 2.10472972264463),
 ('be', 2.127857349063508),
 ('with', 2.1944439500823476),
 ('has', 2.209496590318267),
 ('does', 2.2379218446976203),
 ('government', 2.243705157295955),
 ('we', 2.3213624489465614),
 ('by', 2.3486487615035236),
 ('as', 2.39360014936579),
 ('people', 2.437349531920175),
 ('at', 2.5401827630622282),
 ('can', 2.5587103889244425),
 ('who', 2.562457465908231),
 ('from', 2.568104539738549),
 ('what', 2.613973067599492),
 ('our', 2.693645290551285),
 ('their', 2.71