# Initial Notebook for CS224U Project

## Creating Data

In [1]:
from sklearn.model_selection import train_test_split
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
# Collect [question text, member affiliation] pairs from the question bank.
# The list is created before the file is opened, so it exists even if the
# open call fails.
questions = []
with open('../declan/qbank.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    questions.extend([q['Question'], q['memAffi']] for q in reader)

FileNotFoundError: [Errno 2] No such file or directory: '../declan/qbank.csv'

In [None]:
# Separate the [question, label] pairs into parallel tuples of texts and labels.
X, y = zip(*questions)
# A fixed random_state makes the 70/30 split reproducible across kernel
# restarts; without it every run produces different train/test files.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Building sets drops exact duplicate (question, label) pairs within each split.
train_data = set(zip(X_train, y_train))
test_data = set(zip(X_test, y_test))

In [None]:
# Persist both splits as two-column CSV files (question, label).
# newline='' is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate '\n' on write can emit blank rows.
with open('data/train_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(train_data)

# The test file must receive the held-out pairs, not a second copy of the
# training pairs.
with open('data/test_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(test_data)

In [3]:
def read_array_from_csv(inputcsv):
    """Load every row of a UTF-8 encoded CSV file.

    Parameters
    ----------
    inputcsv : str
        Path of the CSV file to read.

    Returns
    -------
    list
        One list of column strings per row, in file order.
    """
    with open(inputcsv, encoding='utf-8') as handle:
        return list(csv.reader(handle))

In [4]:
# Reload the saved training split as a list of rows; a sample row is
# inspected further down the notebook.
new_train = read_array_from_csv('data/train_data.csv')

## Simple Model

### First, some feature functions

A unigrams feature function

In [5]:
def unigrams_phi(question):
    """Case-sensitive unigram count features.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        Maps each whitespace-delimited token of `question` to the number
        of times it occurs.
    """
    counts = {}
    for token in question.split():
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

In [6]:
def unigrams_lower_phi(question):
    """Case-insensitive unigram count features.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        Maps each lower-cased, whitespace-delimited token of `question`
        to the number of times it occurs.
    """
    counts = {}
    for token in map(str.lower, question.split()):
        counts[token] = 1 + counts.get(token, 0)
    return counts

A bigrams feature function

In [7]:
def bigrams_phi(question):
    """Bigram count features.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        Maps each pair of adjacent whitespace-delimited tokens, joined
        as 'first_second', to the number of times it occurs. Empty when
        the question has fewer than two tokens.

    """
    tokens = question.split()
    counts = {}
    for left, right in zip(tokens, tokens[1:]):
        key = left + '_' + right
        counts[key] = counts.get(key, 0) + 1
    return counts

In [8]:
def trigrams_phi(question):
    """Trigram count features.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        Maps each run of three adjacent whitespace-delimited tokens,
        joined as 'first_second_third', to the number of times it
        occurs. Empty when the question has fewer than three tokens.

    """
    tokens = question.split()
    counts = {}
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
        key = '_'.join((first, second, third))
        counts[key] = counts.get(key, 0) + 1
    return counts

A basic bag-of-words unigrams and bigrams feature function

In [9]:
def uni_bigrams_phi(question):
    """Combined unigram and bigram count features.

    Returns the `unigrams_phi` counts with the `bigrams_phi` counts merged
    in; if a key occurs in both, the bigram count is the one kept.
    """
    return {**unigrams_phi(question), **bigrams_phi(question)}

In [10]:
def first_word_phi(question):
    """Indicator feature for the first token of the question.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        `{first_token: 1}`, or an empty dict when the question contains
        no tokens (empty or whitespace-only input).
    """
    qarray = question.split()
    if not qarray:
        # No tokens at all: emit no features rather than raising IndexError.
        return {}
    return {qarray[0]: 1}

In [11]:
def second_word_phi(question):
    """Indicator feature for the second token of the question.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        `{second_token: 1}`, or an empty dict when the question has
        fewer than two tokens.
    """
    qarray = question.split()
    if len(qarray) < 2:
        # Too short to have a second token: emit no features rather than
        # raising IndexError.
        return {}
    return {qarray[1]: 1}

In [12]:
def constant_phi(question):
    """Baseline feature function: the same single feature for every question."""
    return dict(val=1)

In [13]:
def length_phi(question):
    """Single feature holding the length of the question in characters."""
    n_chars = len(question)
    return {'length_': n_chars}

In [14]:
from random import randint

In [15]:
def random_word_phi(question):
    """Indicator feature for one randomly chosen token of the question.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        `{token: 1}` for a token picked uniformly at random, or an empty
        dict when the question contains no tokens.
    """
    qarray = question.split()
    if not qarray:
        # randint(0, -1) would raise ValueError; emit no features instead.
        return {}
    word = qarray[randint(0, len(qarray)-1)]
    return {word: 1}

In [16]:
def numwords_phi(question):
    """Single feature holding the number of whitespace-delimited tokens."""
    tokens = question.split()
    return {'numwords_': len(tokens)}

We found that *friend* seems to be a good indicator. What happens if we only give the classifier that feature? Or unigrams without it?

In [17]:
def only_friend_phi(question):
    """Single 0/1 feature: does the token 'friend' occur (case-insensitively)?

    Only whole whitespace-delimited tokens count, so 'friends' or
    'friend,' do not set the feature.
    """
    tokens = question.lower().split()
    return {'friend': 1 if 'friend' in tokens else 0}

In [18]:
def no_friends_phi(question):
    """Lower-cased unigram counts with the token 'friend' left out.

    Parameters
    ----------
    question : string
        The question to represent.

    Returns
    -------
    dict
        Maps each lower-cased, whitespace-delimited token other than
        'friend' to the number of times it occurs.
    """
    unigrams = {}
    for word in question.split():
        token = word.lower()
        # Compare by value: an identity test (`is not`) against a string
        # literal does not reliably detect equal strings, so 'friend'
        # would never be filtered out.
        if token != 'friend':
            unigrams[token] = unigrams.get(token, 0) + 1
    return unigrams

### Now, let's bring in all the machinery from SST

In [19]:
import utils
import sst
import scipy.stats
from sgd_classifier import BasicSGDClassifier

To build a dataset, we need a reader

In [20]:
def hansard_reader(
        src_filename,
        class_func=None):
    """Iterator over the (question, label) pairs stored in a CSV file.

    Every non-blank row is expected to hold the question text in its
    first column and the label in its second column, which is the layout
    of the `data/train_data.csv` file read by `train_reader`. Rows are
    read lazily, one at a time.

    Parameters
    ----------
    src_filename : str
        Path to the CSV file to be read.
    class_func : None, or function mapping labels to labels
        Applied to the label of every row before it is yielded. If this
        is None, the labels are returned exactly as stored in the file.

    Yields
    ------
    (question, label)
        The first column of the row as a str, and the result of
        `class_func` applied to the second column.

    """
    if class_func is None:
        class_func = lambda x: x
    # newline='' leaves newline handling to the csv module, as its
    # documentation asks for files passed to csv.reader.
    with open(src_filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        for q in reader:
            if not q:
                # csv.reader returns [] for a blank line; indexing it
                # would raise IndexError, so skip it.
                continue
            yield (q[0], class_func(q[1]))
            
def train_reader(**kwargs):
    """Convenience reader for the training split.

    Forwards all keyword arguments to `hansard_reader`, reading from
    'data/train_data.csv'.
    """
    return hansard_reader('data/train_data.csv', **kwargs)

In [21]:
# Featurize the training split with the combined unigram + bigram counts,
# keeping the labels exactly as stored in the file (class_func=None).
# NOTE(review): vectorizer=None presumably makes build_dataset fit a new
# vectorizer on this data — confirm against sst.build_dataset.
train_dataset = sst.build_dataset(
    reader=train_reader,
    phi=uni_bigrams_phi,
    class_func=None,
    vectorizer=None)

In [22]:
# Report the size of the feature matrix as (examples, features).
# NOTE(review): the message says "unigram features", but train_dataset was
# built with uni_bigrams_phi (unigrams and bigrams) — consider rewording.
print("Train dataset with unigram features has {:,} examples and {:,} features".format(
        *train_dataset['X'].shape))

Train dataset with unigram features has 5,364 examples and 182,932 features


In [23]:
# Look at one raw training row ([question text, label]); index 80 is an
# arbitrary example.
new_train[80]

[' I have a question from Abby, who wants to train to be a midwife, and $pronoun says:  “I am 28 years old. This year I left my successful career to go back into university to re-train as a Midwife. I already have a debt of £25,000 from my first degree.  Well over half of my cohort have studied a first degree in another subject and many of my fellow colleagues have children and partners and elderly parents and mortgages.  Many people will be put off by the lack of financial support and massive debts.”  In the spirit of Christmas, will the $person have a word with $pronoun friend the Chancellor, who is sitting next to him—it can be done very quickly—to reverse  the cuts in the nurse bursary scheme, so that we do get people like Abby training to be midwives, which will help all of us in the future?',
 'opp']

### Wrapper for SGD Classifier

In [24]:
def fit_basic_sgd_classifier(X, y):
    """Train a `BasicSGDClassifier` with its default settings.

    Parameters
    ----------
    X : 2d np.array
        Feature matrix with one example per row.
    y : list
        Labels, one per row of `X`.

    Returns
    -------
    BasicSGDClassifier
        The classifier after `fit(X, y)` has been called on it.

    """
    model = BasicSGDClassifier()
    model.fit(X, y)
    return model

### Class Functions

In [25]:
def cas_to_gov(label):
    """Merge the 'cas' label into 'gov'; every other label is returned unchanged."""
    return 'gov' if label == 'cas' else label

### Experiments

In [32]:
# Experiment 1: unigram + bigram counts with the basic SGD classifier, with
# 'cas' merged into 'gov'. No assess_reader is given, so the evaluation uses
# a 70% train split of the training file (train_size=0.7).
# The return value is bound to `_` because only the printed report is used.
_ = sst.experiment(
    uni_bigrams_phi,
    fit_basic_sgd_classifier,
    train_reader=train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=cas_to_gov,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.801
             precision    recall  f1-score   support

        gov      0.761     0.784     0.773       695
        opp      0.832     0.813     0.823       915

avg / total      0.802     0.801     0.801      1610



In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
def fit_maxent_classifier(X, y):
    """Train a logistic-regression (maxent) classifier with an intercept term.

    Parameters
    ----------
    X : feature matrix, one example per row
    y : labels, one per row of `X`

    Returns
    -------
    LogisticRegression
        The estimator after `fit(X, y)` has been called on it.
    """
    model = LogisticRegression(fit_intercept=True)
    model.fit(X, y)
    return model

In [35]:
# Experiment 2: bigram counts with logistic regression. class_func=None keeps
# the original labels, so the rare 'cas' class appears separately in the report.
# The return value is bound to `_` because only the printed report is used.
_ = sst.experiment(
    bigrams_phi,
    fit_maxent_classifier,
    train_reader=train_reader, 
    assess_reader=None, 
    train_size=0.7,
    class_func=None,
    score_func=utils.safe_macro_f1,
    verbose=True)

Accuracy: 0.795
             precision    recall  f1-score   support

        cas      0.000     0.000     0.000         2
        gov      0.818     0.699     0.754       718
        opp      0.781     0.874     0.825       890

avg / total      0.796     0.795     0.792      1610



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [87]:
# Which feature function?
phi = unigrams_phi

# What reader do we use for testing? (None gives us a random split)
assess_reader = None

# Which label-mapping function? (cas_to_gov relabels 'cas' as 'gov')
class_func = cas_to_gov

# Vectorize the feature dicts? (passed to sst.build_dataset as `vectorize`)
vectorize = True

# Fraction of the data used for training when a random split is made
train_size = 0.7

Which model (classifier) should we use?

In [88]:
# Notebook-level estimator instance; our_new_classifier fits this same
# object, so it can be inspected after training.
classifier = LogisticRegression(fit_intercept=True)

In [89]:
def our_new_classifier(X, y):
    """Fit the notebook-level `classifier` on (X, y) and return that same object."""
    classifier.fit(X, y)
    return classifier

# The training function used by the manual experiment below.
train_func = our_new_classifier

In [90]:
# Build the training dataset with the configured feature function and label
# mapping; the cells below read its 'X', 'y' and 'vectorizer' entries.
train = sst.build_dataset(train_reader, phi, class_func, vectorize=vectorize)

In [91]:
# Manage the assessment set-up:
X_train = train['X']
y_train = train['y']
X_assess = None
y_assess = None
if assess_reader is None:
    # No separate assessment reader: hold out a random part of the training
    # data, keeping a `train_size` fraction for fitting.
    X_train, X_assess, y_train, y_assess = train_test_split(
        X_train, y_train, train_size=train_size, test_size=None)
else:
    # Assessment dataset using the training vectorizer:
    assess = sst.build_dataset(
        assess_reader,
        phi,
        class_func,
        vectorizer=train['vectorizer'],
        vectorize=vectorize)
    X_assess, y_assess = assess['X'], assess['y']
# Train:
mod = train_func(X_train, y_train)
# Predictions:
predictions = mod.predict(X_assess)
# Report (accuracy_score is the function imported from sklearn.metrics at the
# top of the notebook, used directly rather than through the sst module):
print('Accuracy: %0.03f' % accuracy_score(y_assess, predictions))
print(classification_report(y_assess, predictions, digits=3))

Accuracy: 0.802
             precision    recall  f1-score   support

        gov      0.802     0.736     0.768       716
        opp      0.802     0.855     0.827       894

avg / total      0.802     0.802     0.801      1610



In [92]:
# Show the fitted model's coefficient array (a single row in the output below).
print(mod.coef_)

[[ 0.         -0.10689693  0.2663828  ...  0.         -0.00073232
   0.        ]]


In [94]:
# Print the vectorizer's feature names.
# NOTE(review): this prints the entire vocabulary, which can be very long;
# consider showing only a slice.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
print(train['vectorizer'].get_feature_names())



In [104]:
# Pair every coefficient with its feature name. The pairs are materialized
# into a list because a bare zip object is a one-shot iterator: once a later
# cell consumed it, re-running that cell would silently produce an empty list.
attempt = list(zip(mod.coef_[0], train['vectorizer'].get_feature_names()))

In [105]:
# Materialize the (coefficient, feature name) pairs as a list so they can be sorted.
list_att = list(attempt)

In [107]:
# Order the (coefficient, feature name) pairs by coefficient, most negative first.
sort_attempt = sorted(list_att, key=lambda pair: pair[0])

In [110]:
# The 20 features with the largest coefficients.
# NOTE(review): presumably these push predictions toward 'opp' — confirm
# against mod.classes_.
print(sort_attempt[-20:])

[(0.7986549445862445, 'failure'), (0.7996828580185086, 'Tory'), (0.8134631771076168, 'steel'), (0.8181241776900138, 'guarantee'), (0.8236511479695717, 'Member'), (0.8238846900659598, 'believe'), (0.8352042874409962, 'Gentleman’s'), (0.8365854777310051, 'meet'), (0.8883599556814755, 'explain'), (0.8975418848508137, 'Northern'), (0.9021877961464828, 'Minister,'), (0.9027913979243735, 'cut'), (0.913022383609288, 'question'), (0.92070934532461, 'cross-party'), (0.9804569590352616, 'Let'), (1.102323465770581, 'How'), (1.126159094044342, 'food'), (1.1292605384437484, 'cuts'), (1.5154121904010176, 'Why'), (1.7717824813098184, 'Gentleman')]


In [112]:
# The 20 features with the smallest (most negative) coefficients.
# NOTE(review): presumably these push predictions toward 'gov' — confirm
# against mod.classes_.
print(sort_attempt[:20])

[(-2.1147189734967706, 'Friend'), (-1.254584062842669, '27'), (-1.0813618595802457, 'hon.'), (-1.0765678295518877, 'June.'), (-0.9436728115754969, 'good'), (-0.9191593843265392, 'personal'), (-0.8853075388543251, 'Labour'), (-0.8801493148075545, 'agree'), (-0.8531177773296328, 'rightly'), (-0.8507901823229739, 'Will'), (-0.8332120372425409, 'Opposition'), (-0.8284918266701731, 'reduce'), (-0.8269652945084, 'work'), (-0.8237307346412344, 'join'), (-0.7978809384912877, 'drugs'), (-0.7881744938529317, 'As'), (-0.7844577534527231, 'deliver'), (-0.7809430334565846, 'congratulate'), (-0.777993276339937, 'day'), (-0.767124557599734, 'business')]
