# Use BERT Representations with LogisticRegression Softmax Classifier

In [1]:
from collections import Counter
import os
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, BertForSequenceClassification


import dataset
import vsm
import sst

In [2]:
# Integer identifiers for the available tweet corpora. The id is passed to
# `dataset.dataset_reader` and `dataset.prune_columns` to select a corpus.
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4  # the only id used by the cells visible in this notebook

In [3]:
# Read the three Apple-tweet splits (train / validate / test) and pass each one
# through `dataset.prune_columns` before it is bound to its notebook-level name.
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(TWITTER_APPLE, split)
    for split in dataset.dataset_reader(TWITTER_APPLE)
]

In [4]:
# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
bert_weights_name = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
# Base encoder only; `hf_cls_phi` uses it to produce feature vectors.
bert_model = BertModel.from_pretrained(bert_weights_name)
# model = BertForSequenceClassification.from_pretrained(bert_weights_name)
# Distinct sentiment labels present in the training split
# (not referenced again in the cells visible here).
twitter_sentiment_labels = twitter_train['sentiment'].unique()

In [5]:
def fit_softmax_classifier(X, y):
    """Fit a logistic-regression classifier on `X` / `y` and return it.

    Despite the function name, the estimator is configured as one-vs-rest
    (`multi_class='ovr'`) with the `liblinear` solver and an intercept term.

    Parameters
    ----------
    X : feature matrix accepted by `LogisticRegression.fit`
    y : labels aligned with the rows of `X`

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    classifier = LogisticRegression(
        multi_class='ovr',
        solver='liblinear',
        fit_intercept=True)
    # `fit` returns the estimator itself, so it can be returned directly.
    return classifier.fit(X, y)

In [6]:
def hf_cls_phi(text):
    """Featurize `text` as the final-layer BERT representation above [CLS].

    The text is encoded with `bert_tokenizer` (special tokens included, so the
    first position is [CLS]) and run through `bert_model`; the vector at the
    first position of the last layer is returned.

    Parameters
    ----------
    text : str
        Raw text of one example.

    Returns
    -------
    np.ndarray
        The [CLS] vector, moved to CPU and converted to numpy. Expected to be
        768-dimensional for this checkpoint.
    """
    token_ids = vsm.hf_encode(text, bert_tokenizer, add_special_tokens=True)
    hidden_states = vsm.hf_represent(token_ids, bert_model, layer=-1)
    # `hidden_states` is expected to have shape (1, n, 768): take the single
    # batch entry, then position 0, which is the [CLS] token.
    cls_vector = hidden_states[0][0]
    return cls_vector.cpu().numpy()

In [7]:
# NOTE: `DataFrame.size` is the number of cells (rows x columns), not the
# number of rows; use `len(frame)` when the row count is what matters.
twitter_train.size, twitter_validate.size, twitter_test.size

(9327, 1164, 1167)

In [8]:
%%time
# Quick run: featurize with `hf_cls_phi`, fit `fit_softmax_classifier` on a
# 1500-row slice of the training split, and assess on the validation split.
# The report below covers 388 examples, so `[:1000]` does not truncate it.
bert_experiment1500 = sst.experiment(
    twitter_train[:1500], # first 1500 training rows only
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1000]],
    # presumably tells `sst` that phi already returns dense vectors -- TODO confirm
    vectorize=False)

              precision    recall  f1-score   support

           1      0.730     0.654     0.690       136
           3      0.716     0.869     0.785       206
           5      0.688     0.297     0.415        37
not_relevant      0.000     0.000     0.000         9

    accuracy                          0.719       388
   macro avg      0.533     0.455     0.473       388
weighted avg      0.701     0.719     0.698       388

CPU times: user 23min 31s, sys: 14 s, total: 23min 45s
Wall time: 3min 59s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
%%time
# Full run: same feature function and classifier as the 1500-row experiment,
# fit on the entire training split and assessed on the entire validation split.
bert_experiment_full = sst.experiment(
    twitter_train, # full training split
    hf_cls_phi,
    fit_softmax_classifier,
    assess_dataframes=[twitter_validate],
    # presumably tells `sst` that phi already returns dense vectors -- TODO confirm
    vectorize=False)

              precision    recall  f1-score   support

           1      0.761     0.654     0.704       136
           3      0.723     0.888     0.797       206
           5      0.722     0.351     0.473        37
not_relevant      0.000     0.000     0.000         9

    accuracy                          0.735       388
   macro avg      0.552     0.474     0.493       388
weighted avg      0.720     0.735     0.715       388

CPU times: user 40min 56s, sys: 23.8 s, total: 41min 20s
Wall time: 6min 57s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# List the entries in the result returned by `sst.experiment`.
bert_experiment_full.keys()

dict_keys(['model', 'phi', 'train_dataset', 'assess_datasets', 'predictions', 'metric', 'scores'])

In [11]:
# Score list for the experiment; the single value matches the macro-avg F1
# reported for the validation split.
bert_experiment_full['scores']

[0.4934175514738175]

In [12]:
# Name of the metric behind `bert_experiment_full['scores']`.
bert_experiment_full['metric']

'safe_macro_f1'

In [13]:
# The classifier stored in the experiment result (built by `fit_softmax_classifier`).
bert_experiment_full['model']

LogisticRegression(multi_class='ovr', solver='liblinear')

# Evaluate the classifier trained on BERT tweet representations on the test set

In [14]:
def predict_one_bert(text):
    """Predict the sentiment label of one raw text with the full-run classifier.

    The text is featurized with the feature function stored in
    `bert_experiment_full['phi']`, wrapped in a one-element batch, and passed
    to the model stored in `bert_experiment_full['model']`.

    Parameters
    ----------
    text : str
        Raw text of one example.

    Returns
    -------
    The single predicted label (not a one-element array).
    """
    features = bert_experiment_full['phi'](text)
    # `predict` expects a batch, so wrap the lone example and unwrap the result.
    return bert_experiment_full['model'].predict([features])[0]

In [15]:
# %% time
# twitter_test['prediction'] = twitter_test['text'].apply(predict_one_bert)

In [16]:
# Reload the local `sst` module so edits made to sst.py since the first import
# take effect without restarting the kernel.
# NOTE(review): results computed before this cell came from the previously
# loaded code; on a fresh Restart & Run All this reload is redundant, and the
# import belongs in the imports cell at the top.
import importlib
importlib.reload(sst)

<module 'sst' from '/mnt/c/Users/echya/Documents/XCS224U - 007 Natural Language Understanding/CS224-final-project/sst.py'>

In [17]:
%%time
bert_test = sst.evaluate(
    bert_experiment_full['model'],
    bert_experiment_full['phi'],
    assess_dataframes=[twitter_test],
    vectorizer=bert_experiment_full['assess_datasets'][0]['vectorizer'],
    vectorize=False
)

              precision    recall  f1-score   support

           1      0.739     0.675     0.705       126
           3      0.721     0.894     0.798       208
           5      0.800     0.267     0.400        45
not_relevant      0.000     0.000     0.000        10

    accuracy                          0.728       389
   macro avg      0.565     0.459     0.476       389
weighted avg      0.717     0.728     0.702       389

CPU times: user 5min 8s, sys: 2.89 s, total: 5min 11s
Wall time: 51.9 s


In [18]:
# Check the container type of the predictions for the (only) test dataset.
type(bert_test['predictions'][0])

numpy.ndarray

In [20]:
# Persist the raw test-set predictions to CSV.
predictions_fname ='results/BERT_predictions_on_twitter_test_apple.csv'
# `DataFrame.to_csv` does not create missing directories, so make sure the
# output folder exists (a fresh checkout may not have `results/` yet).
os.makedirs(os.path.dirname(predictions_fname), exist_ok=True)
# NOTE: despite its name, `df` is a numpy array of predicted labels; the name
# is kept because later cells refer to it.
df = bert_test['predictions'][0]
pd.DataFrame(df).to_csv(predictions_fname)

In [21]:
# Persist the BERT feature vectors computed for the test set to CSV.
encoding_fname ='results/BERT_encodings_on_twitter_test_apple.csv'
# `DataFrame.to_csv` does not create missing directories.
os.makedirs(os.path.dirname(encoding_fname), exist_ok=True)
encoded_test = bert_test['assess_datasets'][0]
# Fix: this cell used to write the predictions array (`df`) to the encodings
# file, leaving `encoded_test` unused. Write the encoded features instead.
# Assumes the dataset dict stores its feature matrix under 'X', alongside the
# 'vectorizer' entry used earlier -- TODO confirm against the local sst.py.
pd.DataFrame(encoded_test['X']).to_csv(encoding_fname)

In [22]:
# Wrap the predictions in a frame keyed by the test split's index, so each
# prediction can be matched back to its tweet.
predictions_df = pd.DataFrame(df).set_index(twitter_test.index)
predictions_df

Unnamed: 0,0
3,5
8,3
14,1
16,1
26,1
...,...
3778,3
3817,3
3838,3
3840,5


In [23]:
# Add the predictions to the test frame as a new column. The single column of
# `predictions_df` is taken as a Series; assignment aligns on the shared index.
twitter_test['BERT_sentiment'] = predictions_df.iloc[:, 0]

In [24]:
# Display the test frame, now carrying the `BERT_sentiment` column.
twitter_test

Unnamed: 0,tweet_id,text,sentiment,BERT_sentiment
3,623495516,I agree with @jimcramer that the #IndividualIn...,3,5
8,623495521,Apple Inc. Flash Crash: What You Need to Know ...,3,3
14,623495527,@apple Contact sync between Yosemite and iOS8 ...,1,1
16,623495529,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,1,1
26,623495539,@ me RT @101Baemations: Can't stand those ppl ...,1,1
...,...,...,...,...
3778,623499308,#AAPL:Apple products may have Bah! Humbug! hol...,1,3
3817,623499347,Why Apple Watch May Be a Luxury Fashion Hit $...,3,3
3838,623499368,"Apple:Dec 9 Trading Daily Profit $3.48/sh, Wee...",3,3
3840,623499370,Lets go shopping!!! @apple http://t.co/uTFUHuoJIi,5,5


In [25]:
# Persist the test split together with its `BERT_sentiment` predictions.
test_predictions_fname ='results/BERT_predictions_added_to_twitter_test_apple.csv'
# `DataFrame.to_csv` does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(test_predictions_fname), exist_ok=True)
twitter_test.to_csv(test_predictions_fname)

In [26]:
# Test rows whose predicted label equals the gold `sentiment` label.
correct = twitter_test.loc[
    twitter_test['sentiment'].eq(twitter_test['BERT_sentiment'])
]

In [27]:
# Display the correctly classified test rows.
correct

Unnamed: 0,tweet_id,text,sentiment,BERT_sentiment
8,623495521,Apple Inc. Flash Crash: What You Need to Know ...,3,3
14,623495527,@apple Contact sync between Yosemite and iOS8 ...,1,1
16,623495529,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,1,1
26,623495539,@ me RT @101Baemations: Can't stand those ppl ...,1,1
45,623495558,RT @thehill: Justice Department cites 18th cen...,3,3
...,...,...,...,...
3764,623499294,"@Apple, you made a mistake. Now itå«s time to ...",1,1
3817,623499347,Why Apple Watch May Be a Luxury Fashion Hit $...,3,3
3838,623499368,"Apple:Dec 9 Trading Daily Profit $3.48/sh, Wee...",3,3
3840,623499370,Lets go shopping!!! @apple http://t.co/uTFUHuoJIi,5,5


In [28]:
# Test rows whose predicted label differs from the gold `sentiment` label.
incorrect = twitter_test.loc[
    twitter_test['sentiment'].ne(twitter_test['BERT_sentiment'])
]

In [29]:
# Display the misclassified test rows for error analysis.
incorrect

Unnamed: 0,tweet_id,text,sentiment,BERT_sentiment
3,623495516,I agree with @jimcramer that the #IndividualIn...,3,5
28,623495541,Latest Apple Products Leading in Efficiency ht...,5,3
95,623495608,#whoknew Why isn't @Apple on Twitter? http://t...,3,1
144,623495657,@apple and @facebook I speak for all of humani...,1,3
145,623495658,@Apple I h8 everything about u,1,3
...,...,...,...,...
3480,623499010,@panic @jpetersen good call I thought @Apple w...,3,1
3535,623499065,RT @TeamCavuto: Protesters stage #DieIn protes...,1,3
3664,623499194,Have been brave and taken out an #AAPL CFD as ...,not_relevant,3
3749,623499279,#Apple wins patent for #3D iPhone UI w/ motion...,5,3
