In [5]:
from collections import Counter
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
import torch.nn as nn

from torch_rnn_classifier import TorchRNNClassifier
from torch_tree_nn import TorchTreeNN
import sst
import utils

from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.casual import TweetTokenizer
# Directory holding the SST data files, relative to the notebook's
# working directory. Passed to the `sst` readers below.
SST_HOME = os.path.join('data', 'sentiment')

In [6]:
# Module-level tokenizer instances, created once so that feature
# functions can reuse them instead of rebuilding one per call.
# `tokenizerTB` is not referenced by any cell visible in this notebook.
tokenizerTB = TreebankWordTokenizer()
# `tokenizerTwtr` is the tokenizer used by `unigrams_phi_tokenize`.
tokenizerTwtr = TweetTokenizer()

In [7]:
def unigrams_phi(text):
    """Represent `text` as a bag of lowercased, whitespace-split unigrams.

    Parameters
    ----------
    text : str
        The raw example text.

    Returns
    -------
    collections.Counter
        Maps each token to the number of times it occurs in `text`.
    """
    tokens = text.lower().split()
    return Counter(tokens)

In [8]:
def unigrams_phi_tokenize(text):
    """Represent `text` as a bag of the tokens produced by `tokenizerTwtr`.

    Unlike `unigrams_phi`, this function applies no explicit `.lower()`;
    the tokens are counted exactly as the tokenizer returns them.

    Parameters
    ----------
    text : str
        The raw example text.

    Returns
    -------
    collections.Counter
        Maps each token to the number of times it occurs.
    """
    tokens = tokenizerTwtr.tokenize(text)
    return Counter(tokens)

In [2]:
# BERT
from transformers import BertModel, BertTokenizer
import vsm

# Instantiate a Bert model and tokenizer based on `bert_weights_name`:
bert_weights_name = 'bert-base-uncased'
# Both objects are loaded with `from_pretrained` from the same weights
# name. They are read as module-level globals by `hf_cls_phi` and
# `hf_mean_phi`, so this cell must run before either is called.
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)

def hf_cls_phi(text):
    """Featurize `text` as the BERT representation above the [CLS] token.

    The text is encoded with `bert_tokenizer` (special tokens added) via
    `vsm.hf_encode`, run through `bert_model` via `vsm.hf_represent`,
    and the vector at position 0 of the first (only) example is returned.

    Parameters
    ----------
    text : str
        The raw example text.

    Returns
    -------
    np.ndarray
        The selected vector, moved to CPU and converted to numpy.
    """
    ids = vsm.hf_encode(text, bert_tokenizer, add_special_tokens=True)
    reps = vsm.hf_represent(ids, bert_model)
    # `reps` should have shape (1, n, 768), where n is the number of
    # tokens: take the single example, then its 0th token ([CLS]).
    first_example = reps[0]
    cls_vector = first_example[0]
    # Convert to numpy so downstream code can use the vector flexibly.
    return cls_vector.cpu().numpy()

def hf_mean_phi(text):
    """Featurize `text` as the mean of its BERT token representations.

    The text is encoded with `bert_tokenizer` (special tokens added) via
    `vsm.hf_encode`, run through `bert_model` via `vsm.hf_represent`,
    and the representations of the first (only) example are averaged
    over axis 0.

    Parameters
    ----------
    text : str
        The raw example text.

    Returns
    -------
    np.ndarray
        The averaged vector, moved to CPU and converted to numpy.
    """
    ids = vsm.hf_encode(text, bert_tokenizer, add_special_tokens=True)
    reps = vsm.hf_represent(ids, bert_model)
    # `reps` should have shape (1, n, 768), where n is the number of
    # tokens: take the single example and average across its tokens.
    token_reps = reps[0]
    mean_vector = token_reps.mean(axis=0)
    # Convert to numpy so downstream code can use the vector flexibly.
    return mean_vector.cpu().numpy()

In [9]:
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier

def fit_softmax_classifier(X, y):
    """Fit a scikit-learn `LogisticRegression` on `(X, y)`.

    Parameters
    ----------
    X : feature matrix accepted by `LogisticRegression.fit`
    y : labels aligned with the rows of `X`

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    settings = {
        'fit_intercept': True,
        'solver': 'liblinear',
        'multi_class': 'auto'}
    classifier = LogisticRegression(**settings)
    classifier.fit(X, y)
    return classifier

class TorchSoftmaxClassifier(TorchShallowNeuralClassifier):
    """`TorchShallowNeuralClassifier` whose graph is a single linear layer."""

    def build_graph(self):
        """Return an `nn.Linear` from `self.input_dim` to `self.n_classes_`."""
        linear_layer = nn.Linear(self.input_dim, self.n_classes_)
        return linear_layer

def fit_torch_softmax(X, y):
    """Fit a `TorchSoftmaxClassifier` with `l2_strength=0.0001` on `(X, y)`.

    Parameters
    ----------
    X : training features, passed straight to `fit`
    y : training labels, passed straight to `fit`

    Returns
    -------
    TorchSoftmaxClassifier
        The fitted model.
    """
    classifier = TorchSoftmaxClassifier(l2_strength=1e-4)
    classifier.fit(X, y)
    return classifier

def fit_nn_classifier(X, y):
    """Fit a `TorchShallowNeuralClassifier` with basic early stopping.

    The model uses `hidden_dim=100`. Early stopping holds out 10% of the
    training data for validation; if no improvement (beyond `tol`) is
    seen on it within `n_iter_no_change` epochs, training stops.

    Parameters
    ----------
    X : training features, passed straight to `fit`
    y : training labels, passed straight to `fit`

    Returns
    -------
    TorchShallowNeuralClassifier
        The fitted model.
    """
    settings = {
        'hidden_dim': 100,
        'early_stopping': True,
        'validation_fraction': 0.1,
        'tol': 1e-5,
        'n_iter_no_change': 10}
    classifier = TorchShallowNeuralClassifier(**settings)
    classifier.fit(X, y)
    return classifier



In [4]:
# Experiment: mean-pooled BERT features (`hf_mean_phi`) with the
# scikit-learn softmax classifier (`fit_softmax_classifier`), assessed
# on the data from `sst.dev_reader` and scored with macro-F1.
# `hf_mean_phi` returns a numpy vector rather than a feature dict, which
# is presumably why `vectorize=False` is passed -- confirm against
# `sst.experiment`.
_ = sst.experiment(
    sst.train_reader(SST_HOME, dedup=True),
    hf_mean_phi,
    fit_softmax_classifier,
    assess_dataframes=sst.dev_reader(SST_HOME),
    train_size=0.7,
    score_func=utils.safe_macro_f1,
    verbose=True,
    vectorize=False)

              precision    recall  f1-score   support

    negative      0.698     0.799     0.745       428
     neutral      0.360     0.135     0.197       229
    positive      0.710     0.840     0.770       444

    accuracy                          0.678      1101
   macro avg      0.590     0.592     0.571      1101
weighted avg      0.633     0.678     0.641      1101



In [10]:
%time
_ = sst.experiment(
    sst.train_reader(SST_HOME, include_subtrees=False, dedup=True),
    unigrams_phi_tokenize,
    fit_torch_softmax,
    assess_dataframes=sst.dev_reader(SST_HOME),
    train_size=0.7,
    score_func=utils.safe_macro_f1,
    verbose=True)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs


Stopping after epoch 351. Training loss did not improve more than tol=1e-05. Final error is 2.241021901369095.

              precision    recall  f1-score   support

    negative      0.631     0.694     0.661       428
     neutral      0.331     0.179     0.232       229
    positive      0.644     0.734     0.686       444

    accuracy                          0.603      1101
   macro avg      0.535     0.536     0.526      1101
weighted avg      0.574     0.603     0.582      1101



In [11]:
%time
_ = sst.experiment(
    sst.train_reader(SST_HOME, include_subtrees=False, dedup=True),
    unigrams_phi_tokenize,
    fit_nn_classifier,
    assess_dataframes=sst.dev_reader(SST_HOME),
    train_size=0.7,
    score_func=utils.safe_macro_f1,
    verbose=True)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 19.8 µs


Stopping after epoch 34. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 0.33816414326429367

              precision    recall  f1-score   support

    negative      0.625     0.666     0.645       428
     neutral      0.307     0.183     0.230       229
    positive      0.648     0.741     0.691       444

    accuracy                          0.596      1101
   macro avg      0.526     0.530     0.522      1101
weighted avg      0.568     0.596     0.577      1101



In [12]:
%time
_ = sst.experiment(
    sst.train_reader(SST_HOME, include_subtrees=True, dedup=True),
    hf_cls_phi,
    fit_torch_softmax,
    assess_dataframes=sst.dev_reader(SST_HOME),
    train_size=0.8,
    score_func=utils.safe_macro_f1,
    verbose=True,
    vectorize=True)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs


KeyboardInterrupt: 

In [13]:
%time
_ = sst.experiment(
    sst.train_reader(SST_HOME, include_subtrees=False, dedup=True),
    hf_cls_phi,
    fit_nn_classifier,
    assess_dataframes=sst.dev_reader(SST_HOME),
    train_size=0.8,
    score_func=utils.safe_macro_f1,
    verbose=True,
    vectorize=False
    )

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 6.68 µs


Stopping after epoch 21. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 5.096348285675049

              precision    recall  f1-score   support

    negative      0.693     0.827     0.754       428
     neutral      0.395     0.131     0.197       229
    positive      0.718     0.831     0.770       444

    accuracy                          0.684      1101
   macro avg      0.602     0.596     0.574      1101
weighted avg      0.641     0.684     0.645      1101



In [None]:
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier

def fit_shallow_neural_classifier_with_hyperparameter_search(X, y):
    """Fit a `TorchShallowNeuralClassifier` via hyperparameter search.

    Delegates to `utils.fit_classifier_with_hyperparameter_search` with
    a base model that has early stopping enabled, 3 as the `cv`
    argument, and a grid over `hidden_dim` (50, 100, 200) and
    `hidden_activation` (`nn.Tanh()`, `nn.ReLU()`).

    Parameters
    ----------
    X : training features, forwarded to the search helper
    y : training labels, forwarded to the search helper

    Returns
    -------
    Whatever `utils.fit_classifier_with_hyperparameter_search` returns
    (presumably the best fitted model -- confirm against `utils`).
    """
    estimator = TorchShallowNeuralClassifier(early_stopping=True)
    n_folds = 3
    search_space = {
        'hidden_dim': [50, 100, 200],
        'hidden_activation': [nn.Tanh(), nn.ReLU()]}
    return utils.fit_classifier_with_hyperparameter_search(
        X, y, estimator, n_folds, search_space)