# Initial Notebook for CS224U Project

## Creating Data

In [1]:
from sklearn.model_selection import train_test_split
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
# Build one [feature_string, label] pair per row of the question bank.
# The feature string is "<memPart>@<date>"; the label is the `memAffi` column.
questions = []
# newline='' is what the csv module documentation asks for: line endings are
# handed to the parser untranslated, so quoted fields that contain newlines
# are interpreted correctly.
with open('../data/combinedQBank.csv', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for q in reader:
        questions.append([q['memPart'] + '@' + q['date'], q['memAffi']])

In [3]:
# Number of [feature_string, label] rows loaded from the question bank.
len(questions)

12707

In [4]:
# Fixed seed so that Restart & Run All reproduces the same 80/20 split.
SPLIT_SEED = 42

# Unzip the [feature_string, label] pairs into two parallel tuples.
X, y = zip(*questions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED)

# Re-pair features with labels so each split can be handled as a list of rows.
train_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))

In [5]:
# Size of the training split (test_size=0.2 holds out the rest).
len(train_data)

10165

In [6]:
# The two blocks below write the train/test splits to CSV. They are commented
# out; presumably they were run once to create the files that later cells read
# (TODO confirm). Re-running them would overwrite those files.
# NOTE(review): the target path here is 'data/...' while the question bank is
# read from '../data/...' - confirm the intended directory before re-enabling.
# NOTE(review): the csv module documentation recommends opening files passed to
# csv.writer with newline=''; add that if this code is re-enabled.

# with open('data/train_data_partydate.csv', 'w', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerows(train_data)

# with open('data/test_data_partydate.csv', 'w', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerows(test_data)

In [7]:
def read_array_from_csv(inputcsv):
    """Read a CSV file into a list of rows.

    Parameters
    ----------
    inputcsv : str
        Path to a UTF-8 encoded CSV file.

    Returns
    -------
    list of list of str
        One inner list per CSV row, in file order, with every field kept
        as a string. An empty file gives an empty list.

    """
    # newline='' follows the csv module documentation: line endings reach the
    # parser untranslated, so quoted fields containing newlines are read back
    # exactly as they were written.
    with open(inputcsv, encoding='utf-8', newline='') as f:
        return list(csv.reader(f))

In [8]:
# Load the training CSV back from disk as a list of rows (each row a list of strings).
new_train = read_array_from_csv('data/train_data_partydate.csv')

In [9]:
# Row count of the reloaded training file.
len(new_train)

10165

## Simple Model

### First, some feature functions

A feature function that combines the party field with the first four characters (the year) of the date

In [10]:
def names_phi(question):
    """Feature function producing a single combined party-and-year feature.

    Parameters
    ----------
    question : str
        Example string of the form ``'<party>@<date>'``.

    Returns
    -------
    dict
        A one-entry dict: ``'partydate'`` maps to the party followed by the
        first four characters of the date.

    """
    fields = question.split('@')
    # Separate 'party' and 'date' features existed as a disabled alternative;
    # only the combined feature is emitted.
    return {'partydate': fields[0] + fields[1][0:4]}

### Now, let's bring in all the machinery from SST

In [11]:
import utils
import sst
import scipy.stats
from sgd_classifier import BasicSGDClassifier

To build a dataset, we need a reader

In [12]:
def hansard_reader(
        src_filename,
        class_func=None):
    """Iterate over the (example, label) pairs stored in a two-column CSV file.

    Parameters
    ----------
    src_filename : str
        Full path to the file to be read.
    class_func : None, or function mapping labels to labels
        Applied to the second column of every row. If this is None,
        the labels are yielded exactly as they appear in the file.

    Yields
    ------
    (example, label)
        The first column as a str, and the result of `class_func` applied
        to the second column.

    """
    if class_func is None:
        class_func = lambda x: x
    # newline='' follows the csv module documentation so that newlines inside
    # quoted fields are parsed correctly.
    with open(src_filename, encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields an empty list for a blank line; skip it
                # rather than failing on row[0].
                continue
            yield (row[0], class_func(row[1]))
            
def train_reader(**kwargs):
    """Convenience reader for the training CSV; keyword arguments are
    forwarded unchanged to `hansard_reader`."""
    return hansard_reader('data/train_data_partydate.csv', **kwargs)

In [13]:
# Featurize the training file with `names_phi`. class_func=None leaves the
# labels exactly as they are in the file; vectorizer=None presumably makes
# `sst.build_dataset` fit a new vectorizer (TODO confirm against sst.py).
train_dataset = sst.build_dataset(
    reader=train_reader,
    phi=names_phi,
    class_func=None,
    vectorizer=None)

In [14]:
# Report the dimensions of the feature matrix built above.
shape = train_dataset['X'].shape
message = "Train dataset with unigram features has {:,} examples and {:,} features"
print(message.format(*shape))

Train dataset with unigram features has 10,165 examples and 157 features


In [15]:
# Peek at the second training row: show its party and the four-character
# year prefix of its date, i.e. the two pieces `names_phi` joins together.
fields = new_train[1][0].split('@')
print(fields[0])
print(fields[1][0:4])

Con
2004


### Wrapper for SGD Classifier

In [16]:
def fit_basic_sgd_classifier(X, y):
    """Train a `BasicSGDClassifier` with its default settings.

    Parameters
    ----------
    X : 2d np.array
        Feature matrix with one example per row.
    y : list
        Labels aligned with the rows of `X`.

    Returns
    -------
    BasicSGDClassifier
        The classifier after `fit(X, y)` has been called on it.

    """
    model = BasicSGDClassifier()
    model.fit(X, y)
    return model

### Class Functions

In [17]:
def cas_to_gov(label):
    """Fold the 'cas' label into 'gov'; every other label passes through unchanged."""
    return 'gov' if label == 'cas' else label

### Experiments

In [18]:
# Baseline run: `names_phi` features with the basic SGD classifier.
# No assess_reader is given, so evaluation presumably uses a random split of
# the training file with train_size=0.7 (TODO confirm in sst.experiment).
# `cas_to_gov` folds the 'cas' label into 'gov' before training.
_ = sst.experiment(
    names_phi,
    fit_basic_sgd_classifier,
    train_reader=train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.992
             precision    recall  f1-score   support

        gov      0.983     0.994     0.989      1044
        opp      0.997     0.991     0.994      2006

avg / total      0.992     0.992     0.992      3050



In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
def fit_maxent_classifier(X, y):
    """Train a `LogisticRegression` model (with an intercept term) on (X, y).

    Parameters
    ----------
    X : feature matrix, one example per row.
    y : labels aligned with the rows of `X`.

    Returns
    -------
    LogisticRegression
        The fitted model.

    """
    # scikit-learn estimators return themselves from fit(), so this hands
    # back the same fitted instance.
    return LogisticRegression(fit_intercept=True).fit(X, y)

In [21]:
# Same experiment as the SGD baseline, but with the logistic-regression
# (maxent) wrapper as the training function.
_ = sst.experiment(
    names_phi,
    fit_maxent_classifier,
    train_reader=train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.994
             precision    recall  f1-score   support

        gov      0.988     0.993     0.991      1037
        opp      0.997     0.994     0.995      2013

avg / total      0.994     0.994     0.994      3050



In [22]:
# Which feature functions?
phi = names_phi

# What reader do we use for testing? (None gives us a random split)
assess_reader = None

# What classifier function?
class_func = cas_to_gov

# Vectorise?
vectorize = True

#Train size
train_size = 0.7

Which model should we use?

In [23]:
# A single LogisticRegression instance; `our_new_classifier` fits and returns
# this same object.
classifier = LogisticRegression(fit_intercept=True)

In [24]:
def our_new_classifier(X, y):
    """Fit the module-level `classifier` on (X, y) and return that same object."""
    classifier.fit(X, y)
    return classifier

# Training function used by the assessment cell.
train_func = our_new_classifier

In [25]:
# Build the dataset from the training file using the `phi`, `class_func` and
# `vectorize` settings chosen above; later cells read its 'X', 'y' and
# 'vectorizer' entries.
train = sst.build_dataset(train_reader, phi, class_func, vectorize=vectorize)

In [26]:
# Manage the assessment set-up:
X_train = train['X']
y_train = train['y']
X_assess = None
y_assess = None
if assess_reader == None:
     X_train, X_assess, y_train, y_assess = train_test_split(
            X_train, y_train, train_size=train_size, test_size=None)
else:
    # Assessment dataset using the training vectorizer:
    assess = sst.build_dataset(
        assess_reader,
        phi,
        class_func,
        vectorizer=train['vectorizer'],
        vectorize=vectorize)
    X_assess, y_assess = assess['X'], assess['y']
# Train:
mod = train_func(X_train, y_train)
# Predictions:
predictions = mod.predict(X_assess)
# Report:
print('Accuracy: %0.03f' % sst.accuracy_score(y_assess, predictions))
print(classification_report(y_assess, predictions, digits=3))

Accuracy: 0.995
             precision    recall  f1-score   support

        gov      0.992     0.994     0.993      1078
        opp      0.996     0.995     0.996      1972

avg / total      0.995     0.995     0.995      3050

