# Use BERT Representations with LogisticRegression Softmax Classifier

In [1]:
from collections import Counter
import os
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, BertForSequenceClassification


import dataset
import vsm
import sst

In [2]:
# Integer dataset selectors. Only TWITTER_AIRLINES is used in the cells shown
# here, where it is passed to `dataset.dataset_reader` and `dataset.prune_columns`.
# NOTE(review): the numeric values are presumably defined by the `dataset`
# module's conventions — confirm there before changing them.
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

In [3]:
# Read the airline-tweet splits, then run every split through the same
# column-pruning helper before binding the train/validate/test names.
raw_splits = dataset.dataset_reader(TWITTER_AIRLINES)
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(TWITTER_AIRLINES, split) for split in raw_splits
]

In [4]:
# Direct Hugging Face loading is left disabled; none of the cells shown in this
# notebook reference these names.
# NOTE(review): presumably `dataset.hf_cls_phi` loads BERT on its own — confirm
# in dataset.py before deleting these lines.
# bert_weights_name = 'bert-base-cased'
# bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
# bert_model = BertModel.from_pretrained(bert_weights_name)
# model = BertForSequenceClassification.from_pretrained(bert_weights_name)
# Unique values of sentiment
twitter_sentiment_labels = twitter_train['sentiment'].unique()

In [5]:
# NOTE: DataFrame.size counts cells (rows x columns), not rows. The 5,856 shown
# for the validation split corresponds to the 1,464 tweets reported as support
# in the validation reports below (4 columns each). Use len(...) or .shape[0]
# when the number of tweets is what you need.
twitter_train.size, twitter_validate.size, twitter_test.size

(46848, 5856, 5856)

In [6]:
%%time
# Learning-curve point: fit on the first 1,500 training rows and report on the
# first 1,000 validation rows.
# NOTE(review): `vectorize=False` presumably because the feature function
# already returns dense vectors — confirm in sst.experiment.
bert_experiment1500 = sst.experiment(
    twitter_train[:1500], # training subset: first 1,500 rows
    dataset.hf_cls_phi,
    dataset.fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1000]],
    vectorize=False)

              precision    recall  f1-score   support

    negative      0.798     0.914     0.852       606
     neutral      0.648     0.486     0.556       216
    positive      0.785     0.635     0.702       178

    accuracy                          0.772      1000
   macro avg      0.744     0.678     0.703      1000
weighted avg      0.763     0.772     0.761      1000

CPU times: user 28min 20s, sys: 28.3 s, total: 28min 48s
Wall time: 7min 7s


In [7]:
%%time
# Learning-curve point: fit on the first 3,000 training rows and report on the
# same first 1,000 validation rows used for the 1,500-row run.
bert_experiment3000 = sst.experiment(
    twitter_train[:3000], # training subset: first 3,000 rows
    dataset.hf_cls_phi,
    dataset.fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1000]],
    vectorize=False)

              precision    recall  f1-score   support

    negative      0.826     0.916     0.869       606
     neutral      0.681     0.574     0.623       216
    positive      0.795     0.652     0.716       178

    accuracy                          0.795      1000
   macro avg      0.767     0.714     0.736      1000
weighted avg      0.789     0.795     0.788      1000

CPU times: user 49min 46s, sys: 49.9 s, total: 50min 36s
Wall time: 12min 36s


In [8]:
%%time
# Learning-curve point: fit on the first 6,000 training rows.
# NOTE: the validation slice [:1500] is longer than the validation split (the
# report below shows a support of 1,464), so this run is scored on the whole
# validation set and is not directly comparable with the 1,000-row runs above.
bert_experiment6000 = sst.experiment(
    twitter_train[:6000], # training subset: first 6,000 rows
    dataset.hf_cls_phi,
    dataset.fit_softmax_classifier,
    assess_dataframes=[twitter_validate[:1500]],
    vectorize=False)

              precision    recall  f1-score   support

    negative      0.838     0.920     0.877       910
     neutral      0.693     0.546     0.611       306
    positive      0.763     0.690     0.725       248

    accuracy                          0.803      1464
   macro avg      0.765     0.718     0.737      1464
weighted avg      0.795     0.803     0.795      1464

CPU times: user 1h 31min 42s, sys: 58.1 s, total: 1h 32min 40s
Wall time: 15min 37s


In [9]:
%%time
# Final run: fit on the complete training split and report on the complete
# validation split. Later cells reuse this result's 'model' and 'phi' entries.
bert_experiment_full = sst.experiment(
    twitter_train, # full training split
    dataset.hf_cls_phi,
    dataset.fit_softmax_classifier,
    assess_dataframes=[twitter_validate],
    vectorize=False)

              precision    recall  f1-score   support

    negative      0.844     0.920     0.880       910
     neutral      0.717     0.588     0.646       306
    positive      0.783     0.698     0.738       248

    accuracy                          0.813      1464
   macro avg      0.781     0.735     0.755      1464
weighted avg      0.807     0.813     0.807      1464

CPU times: user 2h 18min 26s, sys: 1min 23s, total: 2h 19min 50s
Wall time: 23min 34s


In [10]:
# List the entries stored in the experiment result dict.
bert_experiment_full.keys()

dict_keys(['model', 'phi', 'train_dataset', 'assess_datasets', 'predictions', 'metric', 'scores'])

In [11]:
# Recorded score(s); the single value here matches the macro-avg F1 in the
# full-run validation report above.
bert_experiment_full['scores']

[0.7547285413846058]

In [12]:
# Name of the metric that produced the values stored under 'scores'.
bert_experiment_full['metric']

'safe_macro_f1'

In [13]:
# Show the fitted estimator and its non-default hyperparameters.
bert_experiment_full['model']

LogisticRegression(multi_class='ovr', solver='liblinear')

# Evaluate the classifier trained on BERT tweet representations on the test set

In [14]:
def predict_one_bert(text, experiment=None):
    """Predict the label for a single example.

    Parameters
    ----------
    text
        The example to classify (e.g. the text of one tweet). It is passed
        unchanged to the experiment's feature function.
    experiment : dict, optional
        Mapping with a 'phi' entry (callable turning one example into a
        feature representation) and a 'model' entry (fitted estimator with a
        `predict` method). When omitted, the notebook-level
        `bert_experiment_full` is used, which preserves the original
        one-argument behavior.

    Returns
    -------
    The single prediction for `text`, i.e. the only element of what
    `model.predict` returns for a one-item batch.
    """
    if experiment is None:
        # Fall back to the notebook global so existing one-argument calls
        # keep working.
        experiment = bert_experiment_full
    # `predict` expects a batch, so wrap the single feature representation
    # in a one-element list.
    features = [experiment['phi'](text)]
    preds = experiment['model'].predict(features)
    # Unwrap the singleton batch of predictions.
    return preds[0]

In [15]:
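# Disabled: row-by-row prediction with `predict_one_bert`. The test set is
# scored in one call by `sst.evaluate` in a later cell instead.
# (If re-enabled, the cell magic must be written `%%time`, without a space.)
# %%time
# twitter_test['prediction'] = twitter_test['text'].apply(predict_one_bert)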

In [16]:
# Re-execute the `sst` module so that edits made to sst.py after the first
# import are picked up without restarting the kernel (and without losing the
# long-running experiment results above).
import importlib
importlib.reload(sst)

<module 'sst' from '/mnt/c/Users/echya/Documents/XCS224U - 007 Natural Language Understanding/CS224-final-project/sst.py'>

In [17]:
%%time
# Score the held-out test split with the classifier and feature function from
# the full-data experiment; nothing is refit here.
# NOTE(review): the vectorizer is taken from the experiment's first assess
# dataset; with vectorize=False it is presumably unused — confirm in
# sst.evaluate.
bert_test = sst.evaluate(
    bert_experiment_full['model'],
    bert_experiment_full['phi'],
    assess_dataframes=[twitter_test],
    vectorizer=bert_experiment_full['assess_datasets'][0]['vectorizer'],
    vectorize=False
)

              precision    recall  f1-score   support

    negative      0.864     0.927     0.894       918
     neutral      0.724     0.601     0.657       323
    positive      0.744     0.704     0.724       223

    accuracy                          0.821      1464
   macro avg      0.777     0.744     0.758      1464
weighted avg      0.815     0.821     0.816      1464

CPU times: user 13min 24s, sys: 7.66 s, total: 13min 31s
Wall time: 2min 15s


In [18]:
# Check the container type of the first prediction set before saving it.
type(bert_test['predictions'][0])

numpy.ndarray

In [19]:
# Write the raw test-set predictions to CSV.
predictions_fname ='results/BERT_predictions_on_twitter_test_airline.csv'
# NOTE: despite its name, `df` is a numpy array of predicted labels (see the
# type check above), not a DataFrame. The name is kept because later cells
# read it.
df = bert_test['predictions'][0]
# Wrapped in a DataFrame only for CSV export; the rows get a default 0..n-1
# index here, not the tweet index of `twitter_test`.
pd.DataFrame(df).to_csv(predictions_fname)

In [20]:
# Write the BERT feature encodings of the test set to CSV.
# Fix: the original wrote `df` (the predicted labels) to this file, so the
# "encodings" CSV was a duplicate of the predictions CSV and `encoded_test`
# was never used.
encoding_fname ='results/BERT_encodings_on_twitter_test_airline.csv'
encoded_test = bert_test['assess_datasets'][0]
# NOTE(review): assumes the dataset dict stores its feature matrix under 'X'
# (as in the upstream cs224u `sst.build_dataset`) — confirm against the local
# sst.py.
pd.DataFrame(encoded_test['X']).to_csv(encoding_fname)

In [21]:
# Wrap the predicted labels in a one-column frame and re-key its rows with the
# test split's index so the two can be joined row-for-row.
predictions_df = pd.DataFrame(df).set_index(twitter_test.index)
predictions_df

Unnamed: 0,0
3,negative
14,positive
19,neutral
21,positive
32,negative
...,...
14581,negative
14582,negative
14587,negative
14626,negative


In [22]:
# Attach the predictions as a new column. `predictions_df` has exactly one
# column, so pass it as a Series; rows are matched on the shared index.
twitter_test['BERT_sentiment'] = predictions_df.iloc[:, 0]

In [23]:
# Preview the test split with the BERT_sentiment column next to the gold label.
twitter_test

Unnamed: 0,tweet_id,text,sentiment,airline,BERT_sentiment
3,570301031407624196,@VirginAmerica it's really aggressive to blast...,negative,Virgin America,negative
14,570285904809598977,@VirginAmerica Thanks!,positive,Virgin America,positive
19,570267956648792064,@VirginAmerica you know what would be amazingl...,positive,Virgin America,neutral
21,570264145116819457,@VirginAmerica I love this graphic. http://t.c...,positive,Virgin America,positive
32,570088404156698625,"@VirginAmerica help, left expensive headphones...",negative,Virgin America,negative
...,...,...,...,...,...
14581,569596156927303681,@AmericanAir 30 minutes flight from OKC and th...,negative,American,negative
14582,569595899204255745,@AmericanAir seems like queue times are very h...,negative,American,negative
14587,569595333899997185,@AmericanAir I’ll play it by ear. I know that ...,negative,American,negative
14626,569589460226183168,@AmericanAir Flight 953 NYC-Buenos Aires has b...,negative,American,negative


In [24]:
# Persist the test split together with its BERT_sentiment column; the frame's
# index is written as the first CSV column.
test_predictions_fname ='results/BERT_predictions_added_to_twitter_test_airline.csv'
twitter_test.to_csv(test_predictions_fname)

In [25]:
# Keep the rows where the predicted label equals the gold label.
correct = twitter_test.loc[twitter_test['sentiment'].eq(twitter_test['BERT_sentiment'])]

In [26]:
# Preview the correctly classified test tweets.
correct

Unnamed: 0,tweet_id,text,sentiment,airline,BERT_sentiment
3,570301031407624196,@VirginAmerica it's really aggressive to blast...,negative,Virgin America,negative
14,570285904809598977,@VirginAmerica Thanks!,positive,Virgin America,positive
21,570264145116819457,@VirginAmerica I love this graphic. http://t.c...,positive,Virgin America,positive
32,570088404156698625,"@VirginAmerica help, left expensive headphones...",negative,Virgin America,negative
41,570025482344898560,"@VirginAmerica Hey, first time flyer next week...",negative,Virgin America,negative
...,...,...,...,...,...
14573,569597220871282690,@AmericanAir You didn't respond to my DM. You ...,negative,American,negative
14581,569596156927303681,@AmericanAir 30 minutes flight from OKC and th...,negative,American,negative
14582,569595899204255745,@AmericanAir seems like queue times are very h...,negative,American,negative
14587,569595333899997185,@AmericanAir I’ll play it by ear. I know that ...,negative,American,negative


In [27]:
# Keep the rows where the predicted label differs from the gold label.
incorrect = twitter_test.loc[twitter_test['sentiment'].ne(twitter_test['BERT_sentiment'])]

In [28]:
# Preview the misclassified test tweets for error analysis.
incorrect

Unnamed: 0,tweet_id,text,sentiment,airline,BERT_sentiment
19,570267956648792064,@VirginAmerica you know what would be amazingl...,positive,Virgin America,neutral
47,570010571707256832,@VirginAmerica wow this just blew my mind,positive,Virgin America,negative
63,569986782567071744,@VirginAmerica @LadyGaga @CarrieUnderwood Sorr...,neutral,Virgin America,positive
67,569973821396152323,"@VirginAmerica Hi, Virgin! I'm on hold for 40-...",negative,Virgin America,neutral
86,569923394990419968,@VirginAmerica Can't bring up my reservation o...,neutral,Virgin America,negative
...,...,...,...,...,...
14358,569627839374467072,@AmericanAir @NY_NJairports Do you guys teach ...,negative,American,neutral
14405,569620512139145216,@AmericanAir please call us back to rebook!!!...,negative,American,neutral
14454,569613611510960128,@AmericanAir Hi guys checking in US/AA 639 JFK...,neutral,American,positive
14469,569611259357863936,@AmericanAir hung up on now many times trying...,neutral,American,negative


# Save Model

In [29]:
import pickle

# Serialize the whole experiment result dict (not just the classifier) so it
# can be reloaded later without repeating the long feature-extraction run.
# Only unpickle this file from a trusted location: pickle.load executes
# arbitrary code embedded in the file.
model_fname = 'models/BERT_twitter_airline.sav'
# Fix: the original passed a bare open(...) to pickle.dump and never closed it.
# The context manager flushes and closes the file even if pickling raises.
with open(model_fname, 'wb') as model_file:
    pickle.dump(bert_experiment_full, model_file)