## Experiments

Let's start playing around with our data

In [27]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification,AutoModel,AutoConfig
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from imblearn.under_sampling import RandomUnderSampler
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
import joblib
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read in training data
This dataset contains 100 annotated terms of service contracts, each row represents a sentence, which carries on it a label. The label corresponds to a different type of potential unfairness, as defined by the authors of CLAUDETTE, the previous paper from which this dataset came from. 

In [28]:
df = pd.read_csv('../data/dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,A,CH,CR,J,LAW,LTD,PINC,TER,USE,document,document_ID,label,text,TER_targets,LTD_targets,A_targets,CH_targets,CR_targets
0,0,0,0,0,0,0,0,0,0,0,Mozilla,0,0,websites & communications terms of use,,,,,
1,1,0,0,0,0,0,0,0,0,0,Mozilla,0,0,please read the terms of this entire document ...,,,,,
2,2,0,0,0,0,0,0,0,0,1,Mozilla,0,1,by accessing or signing up to receive communic...,,,,,
3,3,0,0,0,0,0,0,0,0,0,Mozilla,0,0,our websites include multiple domains such as ...,,,,,
4,4,0,0,0,0,0,0,0,0,0,Mozilla,0,0,you may also recognize our websites by nicknam...,,,,,


Before we were trying binary classification, which wasn't producing great results, let's see if we can get better results using the individual types of labels as classified by the text. The logic here is that because each type of potential unfairness likely has some semantic differences, conglomerating them all into one made it difficult to pick them all out. 

# Multi-Label Preprocessing

In [29]:
df['labels'] = df.apply(lambda row: (1 if row['A'] == 1 else 2 if row['CH'] == 1 else 3 if row['CR'] == 1 else 4 if row['J'] == 1 else 5 if row['LAW'] == 1 else 6 if row['LTD'] == 1 else 7 if row['PINC'] == 1 else 8 if row['TER'] == 1 else 9 if row['USE'] == 1 else 0),axis=1)
x_multi = df['text']
y_multi = df['labels']
df['labels'].value_counts(normalize=True)

labels
0    0.893324
6    0.029681
2    0.016849
8    0.015085
9    0.011853
3    0.010286
4    0.006661
5    0.006122
1    0.005192
7    0.004947
Name: proportion, dtype: float64

In [30]:
label2id = {
    'FAIR':0,
    'A':1,
    'CH':2,
    'CR':3,
    'J':4,
    'LAW':5,
    'LTD':6,
    'PINC':7,
    'TER':8,
    'USE':9
}
id2label = {v:k for k,v in label2id.items()}
id2label

{0: 'FAIR',
 1: 'A',
 2: 'CH',
 3: 'CR',
 4: 'J',
 5: 'LAW',
 6: 'LTD',
 7: 'PINC',
 8: 'TER',
 9: 'USE'}

# Binary Label Pre-Processing

#### BOW preprocessing

In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from nltk.stem import PorterStemmer
from nltk.util import ngrams
nltk.download('stopwords')
nltk.download('punkt')

for column in df.columns:
    df[column] = df[column].fillna('')
    df[column] = df[column].astype(str)

texts = df['text'].astype(str).values
labels = df['label'].values

texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# using porterstemmer reduce words to their roots
stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))

def preprocess(text, n=2):
    # text -> tokens
    tokens = word_tokenize(text)
    # stem and removing stopwords
    tokens = [stemmer.stem(word) for word in tokens if word.lower() not in stop_words and word.isalpha()]
    # n-gram generation
    n_grams = list(ngrams(tokens, n))
    # flattening list of n-grams
    n_grams = ['_'.join(gram) for gram in n_grams]
    return tokens + n_grams
# include n-grams in data processing
texts_train = [preprocess(text, n=2) for text in texts_train]
texts_test = [preprocess(text, n=2) for text in texts_test]
from sklearn.feature_extraction.text import CountVectorizer

# init CountVectorizer with 1-2 n-grams (should suffice for our purposes)
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), token_pattern=r'\b\w+\b')

# preparing train data by joining tokens into strings (countvectorizor takes in strings)
texts_train_joined = [' '.join(text) for text in texts_train]
texts_test_joined = [' '.join(text) for text in texts_test]

# fitting vectorizer on the train data
X_train = vectorizer.fit_transform(texts_train_joined)

# transforming test data based on fitted vocab
X_test = vectorizer.transform(texts_test_joined)

[nltk_data] Downloading package stopwords to /home/jonat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jonat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:

df["preprocessed_text"] = df["text"].apply(lambda x: preprocess(x, n=2))
df["preprocessed_text"] = df["preprocessed_text"].apply(lambda x: ' '.join(x))
df.head()

Unnamed: 0.1,Unnamed: 0,A,CH,CR,J,LAW,LTD,PINC,TER,USE,...,document_ID,label,text,TER_targets,LTD_targets,A_targets,CH_targets,CR_targets,labels,preprocessed_text
0,0,0,0,0,0,0,0,0,0,0,...,0,0,websites & communications terms of use,,,,,,0,websit commun term use websit_commun commun_te...
1,1,0,0,0,0,0,0,0,0,0,...,0,0,please read the terms of this entire document ...,,,,,,0,pleas read term entir document term care expla...
2,2,0,0,0,0,0,0,0,0,1,...,0,1,by accessing or signing up to receive communic...,,,,,,9,access sign receiv commun agre bound term acce...
3,3,0,0,0,0,0,0,0,0,0,...,0,0,our websites include multiple domains such as ...,,,,,,0,websit includ multipl domain websit_includ inc...
4,4,0,0,0,0,0,0,0,0,0,...,0,0,you may also recognize our websites by nicknam...,,,,,,0,may also recogn websit nicknam bugzilla mozill...


In [33]:
texts_test_joined[0]

'ubisoft advis includ surnam user name ubisoft_advis advis_includ includ_surnam surnam_user user_name'

In [34]:
df["text"][0]

'websites & communications terms of use'

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# init/train classifier
bow = LogisticRegression()
bow.fit(X_train, labels_train)

# predicting on test set
labels_pred = bow.predict(X_test)


In [36]:
labels_pred

array(['0', '0', '0', ..., '1', '0', '0'], dtype=object)

In [37]:
bow.predict(vectorizer.transform([df["preprocessed_text"][0]]))

array(['0'], dtype=object)

Binary Label Preprocessing

In [38]:
x, y = df[['text','preprocessed_text']], df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)
rus = RandomUnderSampler(sampling_strategy=1)
x_train_res, y_train_res = rus.fit_resample(pd.DataFrame(x_train), pd.DataFrame(y_train))

In [39]:
x_train.head()

Unnamed: 0,text,preprocessed_text
14764,these terms supersede any prior agreements or ...,term supersed prior agreement earlier version ...
7537,"you may not upload , publish , post , distribu...",may upload publish post distribut dissemin una...
16178,world of warcraft requires the creation and re...,world warcraft requir creation retent electron...
6161,the minimum connection speed for sd quality is...,minimum connect speed sd qualiti mbp howev rec...
1090,regardless of the manner in which the arbitrat...,regardless manner arbitr conduct arbitr shall ...


In [40]:
train_df = pd.merge(x_train_res,y_train_res,left_index=True,right_index=True)
test_df = pd.merge(x_test,y_test,left_index=True,right_index=True)
val_df = pd.merge(x_val,y_val,left_index=True,right_index=True)
train_df = train_df.sample(frac=1)
subset_df = train_df.sample(300)
test_subset_df = test_df.sample(100)

In [41]:
dataset_dict = DatasetDict(
    {
    "train":Dataset.from_pandas(train_df),
    "test":Dataset.from_pandas(test_df),
    "val":Dataset.from_pandas(val_df),
    "train_subset":Dataset.from_pandas(subset_df),
    "test_subset":Dataset.from_pandas(test_subset_df)
    }
)

Split dataset into train, test and validation

In [42]:
tokenizer = BertTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
def tokenize(batch):
    return tokenizer(batch['text'], max_length=512,padding='max_length')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [43]:
tokenized_dict = dataset_dict.map(tokenize, batched=True, batch_size=len(dataset_dict))
tokenized_dict.set_format('torch', columns=['input_ids', 'attention_mask',"token_type_ids",'label','text','preprocessed_text'])

Map:   0%|          | 0/3460 [00:00<?, ? examples/s]

Map:   0%|          | 0/2042 [00:00<?, ? examples/s]

Map:   0%|          | 0/2042 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Now let's construct a custom classifier classifier to classify sentences as potentially unfair or not 

In [44]:
class Classifier_v1(nn.Module):
    def __init__(self, model_name):
        super(Classifier_v1,self).__init__()
        self.model = AutoModel.from_pretrained(model_name,config = AutoConfig.from_pretrained(model_name,
                                                                                              output_attention = True,
                                                                                              output_hidden_state = True))
        # New Layer
        self.dropout = nn.Dropout(0.1)
        self.hidden = nn.Linear(self.model.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
    
    def forward(self, input = None, attention_mask = None, token_type_ids = None,input_ids = None, labels = None):
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        pooler_output = outputs.pooler_output[0]
        output = self.hidden(pooler_output)
        logit = self.sigmoid(output)
        return logit

In [45]:
from torch.utils.data import DataLoader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_dict['train'], shuffle=True, batch_size=1, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_dict['test'], shuffle=True, batch_size=1,collate_fn=data_collator
)
val_dataloader = DataLoader(
    tokenized_dict['val'], shuffle=True, batch_size=1,collate_fn=data_collator
)
train_subset_dataloader = DataLoader(
    tokenized_dict['train_subset'], shuffle=True, batch_size=1,collate_fn=data_collator
)
test_subset_dataloader = DataLoader(
    tokenized_dict['test_subset'], shuffle=True, batch_size=1,collate_fn=data_collator
)

## Training Loop

In [46]:
torch.cuda.empty_cache()

In [47]:
for batch in tokenized_dict["train"]:
    print(batch["text"])
    print(batch["preprocessed_text"])
    break

they also do not apply to membership of skype developer .
also appli membership skype develop also_appli appli_membership membership_skype skype_develop


# Full Loop

In [48]:
def train(model,train_dataloader,test_dataloader,epochs,loss_fn,optimizer,num_train_samples,num_test_samples,lr_scheduler=None,testing=False):
    for epoch in range(epochs):
        model.train()
        train_labels = []
        train_preds = []
        train_loss = 0
        for batch in tqdm.tqdm(train_dataloader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            label = batch['labels'].to(torch.float32).to(device)
            output = model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
            loss = loss_fn(output[0],label)
            train_loss += loss.item()
            pred = torch.round(output[0])
            train_preds.append(pred.detach().cpu().numpy())
            train_labels.append(label.detach().cpu().numpy())
            loss.backward()
            optimizer.step()
            if lr_scheduler:
                lr_scheduler.step(loss)
        print(f"Train Epoch: {epoch + 1} | Accuracy: {accuracy_score(train_labels,train_preds)} | Precision: {precision_score(train_labels,train_preds)} | Recall: {recall_score(train_labels,train_preds)} | F1: {f1_score(train_labels,train_preds)} | Loss: {train_loss/num_train_samples}")
        if testing:
            model.eval()
            preds = []
            labels = []
            test_loss = 0
            for batch in tqdm.tqdm(test_dataloader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                label = batch['labels'].to(torch.float32).to(device)
                with torch.no_grad():
                    output = model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
                loss = loss_fn(output[0],label)
                test_loss += (loss.item())
                pred = torch.round(output[0])
                preds.append(pred.detach().cpu().numpy())
                labels.append(batch["labels"].detach().cpu().numpy())
            print(f"Test Epoch: {epoch + 1} | Accuracy: {accuracy_score(labels,preds)} | Precision: {precision_score(labels,preds)} | Recall: {recall_score(labels,preds)} | F1: {f1_score(labels,preds)} | Loss: {test_loss/num_test_samples}")

In [49]:
def test(model,test_dataloader,loss_fn,num_samples):
    model.eval()
    raw_preds = []
    preds = []
    labels = []
    test_loss = 0
    for batch in tqdm.tqdm(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        label = batch['labels'].to(torch.float32).to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
        loss = loss_fn(output[0],label)
        test_loss += (loss.item())
        raw_preds.append(output[0].detach().cpu().numpy())
        pred = torch.round(output[0])
        preds.append(pred.detach().cpu().numpy())
        labels.append(batch["labels"].detach().cpu().numpy())
    print(f"Accuracy: {accuracy_score(labels,preds)} | Precision: {precision_score(labels,preds)} | Recall: {recall_score(labels,preds)} | F1: {f1_score(labels,preds)} | Loss: {test_loss/num_samples}")
    return raw_preds,preds,labels

Looking much better than before, but there is still a lot of room for improvement

This is not working, even with oversampling we are still not getting good results. I think the problem is that the pooler output, which is the (mean?) of all the 12 hidden states of BERT that we are using as input to our additional classifier is not capturing the information we need to understand the fairness of a sentence. Instead of using the pooler output, let's use a concatenation of all the hidden states of the BERT model

In [50]:
class Classifier_V2(nn.Module):
    '''
    This model is similar to the first one but instead of using the pooler output, it uses the hidden states of the model
    The 'hidden_states_used' parameter is used to determine how many hidden states to use, smaller values of this will be less computationally expensive, but likely less accurate
    '''
    def __init__(self, model_name ,num_labels,hidden_states_used):
        super(Classifier_V2,self).__init__()
        self.hidden_states_used = hidden_states_used
        self.model = BertModel.from_pretrained(model_name,config = BertConfig.from_pretrained(model_name,output_hidden_states = True,num_labels=num_labels))
        self.hidden1 = nn.Linear(self.model.config.hidden_size*self.model.config.max_position_embeddings*self.hidden_states_used, 64)
        self.hidden_p = nn.Linear(self.model.config.hidden_size, 64)
        self.fc = nn.Linear(64, num_labels)
        self.dropout = nn.Dropout(0.1)
        if num_labels == 1:
            self.activation = nn.Sigmoid()
        else:
            self.activation = nn.Softmax(dim=1)
    """
    In the forward function we take the hidden states of the model and concatenate them along the first dimension, and then we pass them through a linear layer
    We also take the pooler output of the sentence and pass it through a separate linear layer
    We then add the two outputs together and pass them through a final linear layer to classify the output
    We do this in hopes that the model will learn to use the hidden states to learn more about the semantic meaning of the text,
    while the pooler output will learn more about the overall meaning of the text
    """
    def forward(self, attention_mask = None, token_type_ids = None,input_ids = None):
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        hidden_states = torch.cat(outputs.hidden_states[-self.hidden_states_used:],dim=0).view(1,-1)
        pooler_output = outputs.pooler_output
        x_pooler = self.hidden_p(self.dropout(pooler_output))
        x_hidden = self.hidden1(self.dropout(hidden_states))
        x = torch.add(x_pooler,x_hidden)
        output = self.fc(x)
        logit = self.activation(output)
        return logit

#### Attempted Hybrid Model
We are attempting to build a hybrid model with BERT and the logistic regression model built on bag of words. If we can't get it to work by Tuesday it will be left out of the report

In [51]:
class HybridModel(nn.Module):
    def __init__(self, num_labels):
        super(HybridModel, self).__init__()
        self.bert = BertModel.from_pretrained('distilbert/distilbert-base-uncased')
        self.bert_linear = nn.Linear(self.bert.config.hidden_size, 2) # reweights BERT output
        self.classifier = nn.Linear(4, num_labels)  # Combines BERT output with bow output into a classifier
        self.dropout = nn.Dropout(0.1)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, bow_logits,token_type_ids):
        # BERT encoding
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)
        bert_pooler = outputs.pooler_output  # Using the pooled output
        # BoW pathway
        # Combine the outputs
        bert_output = self.bert_linear(bert_pooler)
        # print(bert_output)
        # print(weighted_bow)
        combined_output = torch.cat((bert_output[0], bow_logits), dim=0)
        # Classifier
        logits = self.classifier(combined_output)
        return self.sigmoid(logits)


In [52]:
torch.cat((torch.tensor([1,0]),torch.tensor([12,4,124,44,2])), dim=0)

tensor([  1,   0,  12,   4, 124,  44,   2])

In [53]:
def train_hybrid(model,train_dataloader,test_dataloader,epochs,loss_fn,optimizer,num_train_samples,num_test_samples,bow_classifier,testing=False):
    for epoch in range(epochs):
        model.train()
        train_labels = []
        train_preds = []
        train_loss = 0
        for batch in tqdm.tqdm(train_dataloader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].unsqueeze(0).to(device)
            attention_mask = batch['attention_mask'].unsqueeze(0).to(device)
            token_type_ids = batch['token_type_ids'].unsqueeze(0).to(device)
            label = torch.tensor(int(batch['label'])).to(torch.float32).to(device)
            preprocessed = batch['preprocessed_text']
            bow_input = vectorizer.transform([preprocessed])
            bow_logits = bow_classifier.predict_proba(bow_input)
            bow_logits = torch.tensor(bow_logits[0]).to(torch.float32).to(device)
            output = model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,bow_logits=bow_logits)
            loss = loss_fn(output[0],label)
            train_loss += loss.item()
            pred = torch.round(output[0])
            train_preds.append(pred.detach().cpu().numpy())
            train_labels.append(label.detach().cpu().numpy())
            loss.backward()
            optimizer.step()
        print(f"Train Epoch: {epoch + 1} | Accuracy: {accuracy_score(train_labels,train_preds)} | Precision: {precision_score(train_labels,train_preds)} | Recall: {recall_score(train_labels,train_preds)} | F1: {f1_score(train_labels,train_preds)} | Loss: {train_loss/num_train_samples}")
        if testing:
            model.eval()
            preds = []
            labels = []
            test_loss = 0
            for batch in tqdm.tqdm(test_dataloader):
                input_ids = batch['input_ids'].unsqueeze(0).to(device)
                attention_mask = batch['attention_mask'].unsqueeze(0).to(device)
                token_type_ids = batch['token_type_ids'].unsqueeze(0).to(device)
                label = torch.tensor(int(batch['label'])).to(torch.float32).to(device)
                preprocessed = batch['preprocessed_text']
                bow_input = vectorizer.transform([preprocessed])
                bow_logits = bow_classifier.predict_proba(bow_input)
                bow_logits = torch.tensor(bow_logits[0]).to(torch.float32).to(device)
                with torch.no_grad():
                    output = model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,bow_logits=bow_logits)
                loss = loss_fn(output[0],label)
                test_loss += (loss.item())
                pred = torch.round(output[0])
                preds.append(pred.detach().cpu().numpy())
                labels.append(label.detach().cpu().numpy())
            print(f"Test Epoch: {epoch + 1} | Accuracy: {accuracy_score(labels,preds)} | Precision: {precision_score(labels,preds)} | Recall: {recall_score(labels,preds)} | F1: {f1_score(labels,preds)} | Loss: {test_loss/num_test_samples}")

In [57]:
hybrid_model = HybridModel(1)
hybrid_model.to(device)
hybrid_model.train()
loss_fn = nn.BCELoss()
optimizer = Adam(hybrid_model.parameters(),lr =.00001)
train_hybrid(hybrid_model,tokenized_dict["train_subset"],tokenized_dict["test_subset"],2,loss_fn,optimizer,3460,2042,bow,testing=True)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.l

Train Epoch: 1 | Accuracy: 0.5566666666666666 | Precision: 0.5566666666666666 | Recall: 1.0 | F1: 0.715203426124197 | Loss: 0.06296422884643423


100%|██████████| 100/100 [00:25<00:00,  3.97it/s]


Test Epoch: 1 | Accuracy: 0.11 | Precision: 0.11 | Recall: 1.0 | F1: 0.1981981981981982 | Loss: 0.05178694828366907


100%|██████████| 300/300 [01:08<00:00,  4.39it/s]


Train Epoch: 2 | Accuracy: 0.5566666666666666 | Precision: 0.5566666666666666 | Recall: 1.0 | F1: 0.715203426124197 | Loss: 0.06295500936018938


100%|██████████| 100/100 [00:26<00:00,  3.79it/s]

Test Epoch: 2 | Accuracy: 0.11 | Precision: 0.11 | Recall: 1.0 | F1: 0.1981981981981982 | Loss: 0.05175898351818761





In [55]:
del loss_fn
del optimizer
del hybrid_model

In [17]:
model_v5 = Classifier_V2('distilbert/distilbert-base-uncased',1,6).to(device)


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.l

In [18]:
model_v5.load_state_dict(torch.load('../models/distil_bert_6_.pth'))

<All keys matched successfully>

In [19]:
train(model_v5,train_dataloader,test_dataloader,6,loss_fn,optimizer,num_train_samples=3460,num_test_samples=2042,testing=True)

100%|██████████| 3460/3460 [49:16<00:00,  1.17it/s] 


Train Epoch: 1 | Accuracy: 0.7742774566473989 | Precision: 0.772857964347326 | Recall: 0.776878612716763 | F1: 0.7748630729316806 | Loss: 0.4782019475275769


100%|██████████| 2042/2042 [09:51<00:00,  3.45it/s]


Test Epoch: 1 | Accuracy: 0.9015670910871695 | Precision: 0.5550239234449761 | Recall: 0.5178571428571429 | F1: 0.535796766743649 | Loss: 0.28950539659528585


100%|██████████| 3460/3460 [57:43<00:00,  1.00s/it] 


Train Epoch: 2 | Accuracy: 0.7823699421965318 | Precision: 0.778030734206033 | Recall: 0.7901734104046243 | F1: 0.7840550616575853 | Loss: 0.4751454434021796


100%|██████████| 2042/2042 [09:07<00:00,  3.73it/s]


Test Epoch: 2 | Accuracy: 0.9030362389813908 | Precision: 0.5691489361702128 | Recall: 0.47767857142857145 | F1: 0.5194174757281553 | Loss: 0.2811254669279515


100%|██████████| 3460/3460 [1:00:17<00:00,  1.05s/it]


Train Epoch: 3 | Accuracy: 0.7815028901734105 | Precision: 0.7798850574712644 | Recall: 0.784393063583815 | F1: 0.7821325648414985 | Loss: 0.46188540748368295


100%|██████████| 2042/2042 [10:08<00:00,  3.36it/s]


Test Epoch: 3 | Accuracy: 0.8780607247796278 | Precision: 0.4603174603174603 | Recall: 0.6473214285714286 | F1: 0.5380333951762524 | Loss: 0.33942507749725426


100%|██████████| 3460/3460 [56:06<00:00,  1.03it/s] 


Train Epoch: 4 | Accuracy: 0.7867052023121387 | Precision: 0.785385500575374 | Recall: 0.7890173410404624 | F1: 0.78719723183391 | Loss: 0.4650593039108981


100%|██████████| 2042/2042 [10:11<00:00,  3.34it/s]


Test Epoch: 4 | Accuracy: 0.7517140058765915 | Precision: 0.2846270928462709 | Recall: 0.8348214285714286 | F1: 0.4245175936435868 | Loss: 0.5086687265706147


100%|██████████| 3460/3460 [59:21<00:00,  1.03s/it] 


Train Epoch: 5 | Accuracy: 0.7864161849710982 | Precision: 0.7794698251551043 | Recall: 0.7988439306358381 | F1: 0.7890379674564659 | Loss: 0.46912313250904214


100%|██████████| 2042/2042 [10:32<00:00,  3.23it/s]


Test Epoch: 5 | Accuracy: 0.7233104799216454 | Precision: 0.2641770401106501 | Recall: 0.8526785714285714 | F1: 0.4033790918690602 | Loss: 0.5377601020477222


100%|██████████| 3460/3460 [1:00:31<00:00,  1.05s/it]


Train Epoch: 6 | Accuracy: 0.792485549132948 | Precision: 0.7865232163080408 | Recall: 0.8028901734104046 | F1: 0.7946224256292906 | Loss: 0.4562025490983368


100%|██████████| 2042/2042 [11:58<00:00,  2.84it/s]

Test Epoch: 6 | Accuracy: 0.6802154750244858 | Precision: 0.24062877871825877 | Recall: 0.8883928571428571 | F1: 0.3786869647954329 | Loss: 0.5897667355709787





In [22]:
results = test(model_v5,val_dataloader,loss_fn,2042)

100%|██████████| 2042/2042 [13:44<00:00,  2.48it/s]

Accuracy: 0.910871694417238 | Precision: 0.7674418604651163 | Recall: 0.2894736842105263 | F1: 0.42038216560509556 | Loss: 0.2507265433188207





In [20]:
torch.save(model_v5.state_dict(),'../models/distil_bert_6_.pth')

In [22]:
# del loss_fn
# del optimizer
# del model_v2
#del scheduler
torch.cuda.empty_cache()