<a href="https://colab.research.google.com/github/wid0gast/truefoundry-task/blob/master/classifier_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm, trange
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.neural_network import MLPClassifier
import spacy

In [2]:
from google.colab import files
uploaded = files.upload()

Saving airline_sentiment_analysis.csv to airline_sentiment_analysis.csv


In [3]:
data = pd.read_csv('airline_sentiment_analysis.csv').drop('Unnamed: 0', axis=1)

In [4]:
data.airline_sentiment.loc[data.airline_sentiment == 'positive'] = 1
data.airline_sentiment.loc[data.airline_sentiment == 'negative'] = 0

In [5]:
data['encoded'] = [[] for _ in range(len(data))]

In [6]:
data.groupby(by='airline_sentiment').count()

Unnamed: 0_level_0,text,encoded
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9178,9178
1,2363,2363


In [None]:
!python -m spacy download en_core_web_md

In [None]:
# !python -m spacy download en_core_web_lg

In [8]:
nlp = spacy.load('en_core_web_md')

In [9]:
for i in trange(len(data)):
    data.iloc[i].encoded = nlp(data.text[i]).vector

  0%|          | 0/11541 [00:00<?, ?it/s]

In [10]:
data.to_csv('data_encoded1.csv')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data.encoded.values, data.airline_sentiment.values, test_size=0.15, random_state=42)

In [12]:
X_train = np.array([arr for arr in X_train])
X_test = np.array([arr for arr in X_test])

In [13]:
X_test.shape

(1732, 300)

In [14]:
y_train = np.array([arr for arr in y_train])
y_test = np.array([arr for arr in y_test])

In [15]:
y_train.shape

(9809,)

In [24]:
def train_test(clf):
    # print(clf)
    clf.fit(X_train, y_train)
    print(np.sum(clf.predict(X_test) == y_test) / len(y_test))
def test(clf):
    print(np.sum(clf.predict(X_test) == y_test) / len(y_test))

In [40]:
data.iloc[0].text

"@VirginAmerica plus you've added commercials to the experience... tacky."

In [35]:
print(pickle.format_version)

4.0


## SVM

In [26]:
clf = svm.SVC()
train_test(clf)

0.913972286374134


In [27]:
import pickle

In [28]:
clf1 = make_pipeline(StandardScaler(), SVC())
train_test(clf1)

0.913972286374134


In [29]:
pickle.dump(clf1, open('svm_rbf_1_scaler.pkl', 'wb'))

In [30]:
try_clf = pickle.load(open('svm_rbf_1_scaler.pkl', 'rb'))

In [31]:
test(try_clf)

0.913972286374134


In [38]:
try_clf.predict(nlp("@VirginAmerica plus you've added commercials to the experience... tacky.").vector.reshape(1,-1))[0]

0

In [25]:
for k in ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']:
    for c in [0.5, 0.75, 1, 1.25, 1.5]:
        print(k, c)
        train_test(make_pipeline(StandardScaler(), SVC(C=c, kernel=k)))

linear 0.5
0.9093533487297921
linear 0.75
0.9110854503464203
linear 1
0.9116628175519631
linear 1.25
0.9116628175519631
linear 1.5
0.9099307159353349
poly 0.5
0.8851039260969977
poly 0.75
0.8920323325635104
poly 1
0.8966512702078522
poly 1.25
0.894919168591224
poly 1.5
0.8966512702078522
rbf 0.5
0.9093533487297921
rbf 0.75
0.9110854503464203
rbf 1
0.913972286374134
rbf 1.25
0.9151270207852193
rbf 1.5
0.9151270207852193
sigmoid 0.5
0.8469976905311778
sigmoid 0.75
0.8441108545034642
sigmoid 1
0.8418013856812933
sigmoid 1.25
0.8423787528868361
sigmoid 1.5
0.8498845265588915
precomputed 0.5


ValueError: ignored

## SGD

In [19]:
from sklearn.linear_model import SGDClassifier

In [20]:
sgd_clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=200)
train_test(sgd_clf)

SGDClassifier(loss='modified_huber', max_iter=200)
0.8845265588914549


## Decision Trees

In [21]:
tree_clf = tree.DecisionTreeClassifier()
train_test(tree_clf)

DecisionTreeClassifier()
0.7979214780600462


## Neural Networks

In [22]:
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(16, 4), random_state=1, warm_start=True)
train_test(nn_clf)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(16, 4), random_state=1,
              solver='lbfgs', warm_start=True)
0.9116628175519631


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


## MISC

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier(n_estimators=20)
train_test(rf_clf)

RandomForestClassifier(n_estimators=20)
0.8724018475750578


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ab_clf = AdaBoostClassifier(n_estimators=200)
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

0.9145677094785629

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

0.8856812933025404

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X_scaled, y_train, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

## BERT

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 54.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [None]:
# data = pd.read_csv('data_encoded.csv')

In [None]:
X_train, X_val, y_train, y_val = train_test_split(data.text.values,
                                                  data.airline_sentiment.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=data.airline_sentiment.values)

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

encoded_data_train = tokenizer.batch_encode_plus(
    X_train,
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    X_val,
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512,
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
data

Unnamed: 0,airline_sentiment,text
0,positive,@VirginAmerica plus you've added commercials t...
1,negative,@VirginAmerica it's really aggressive to blast...
2,negative,@VirginAmerica and it's a really big bad thing...
3,negative,@VirginAmerica seriously would pay $30 a fligh...
4,positive,"@VirginAmerica yes, nearly every time I fly VX..."
...,...,...
11536,negative,@AmericanAir my flight was Cancelled Flightled...
11537,negative,@AmericanAir right on cue with the delays👌
11538,positive,@AmericanAir thank you we got on a different f...
11539,negative,@AmericanAir leaving over 20 minutes Late Flig...


In [None]:
y_train

array([1, 0, 0, ..., 0, 0, 0], dtype=object)

In [None]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train.astype('long'))

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_val.astype('long'))

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 4

train_data = TensorDataset(input_ids_train, attention_masks_train, labels_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(input_ids_val, attention_masks_val, labels_val)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
print(len(train_dataloader))

2453


In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, freeze_bert=False):
        super(BertClassifier, self).__init__()
        D_in, H, D_out = 768, 50, 2

        self.bert = BertModel.from_pretrained('nlpaueb/legal-bert-base-uncased')

        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)

        return logits, last_hidden_state_cls

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=20):

    bert_classifier = BertClassifier(freeze_bert=False)
    bert_classifier.to(device)

    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,
                      eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:
initialize_model()

Downloading config.json:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(BertClassifier(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0): BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [None]:
import random
import time
import numpy as np

loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=2, evaluation=False):
    print('Training Started......\n')
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in tqdm(enumerate(train_dataloader)):
            batch_counts += 1
            b_input_ids, b_attn_masks, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()
            logits, cls_tmp = model(b_input_ids, b_attn_masks)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 100 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
            
        avg_train_loss = total_loss / len(train_dataloader)
        print("-"*70)
        
        if evaluation == True:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")

def evaluate(model, val_dataloader):
    model.eval()

    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:
        b_input_ids, b_attn_masks, b_labels = tuple(t.to(device) for t in batch)
        
        with torch.no_grad():
            logits, cls_tmp = model(b_input_ids, b_attn_masks)

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [None]:
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=1)
train(bert_classifier, train_dataloader, val_dataloader, epochs=1, evaluation=True)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Started......

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------


0it [00:00, ?it/s]

   1    |   100   |   0.615439   |     -      |     -     |   39.48  
