## Experiments

Here you can organize all the experiments and exploration as you figure out how to collect and analyze your data and build your NLP tool. The experiments you conduct here will contribute to the report/presentation of your project.

Once you've finalized everything, you should then transfer the parts that are necessary for your demo to the code in the `nlp` folder.

In [2]:
# configure matplotlib to print pretty figures 
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')
plt.rcParams['savefig.dpi'] = 75

plt.rcParams['figure.autolayout'] = False
plt.rcParams['figure.figsize'] = 10, 6
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14

plt.rcParams['text.usetex'] = True
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.serif'] = "cm"

#other imports
import pandas as pd
import numpy as np
import random

  set_matplotlib_formats('pdf', 'png')


# Data

In [3]:
#pull data from hugging face
from datasets import load_dataset
ethics_data = load_dataset("hendrycks/ethics", "commonsense")

# Log Reg

In [4]:
#preprocessing:

#lets start with simple one hot encoding
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)
vectorizer.fit(ethics_data['train']['input'])


In [5]:
from sklearn.linear_model import LogisticRegression

X_train = vectorizer.transform(ethics_data['train']['input'])
y_train = ethics_data['train']['label']

X_test = vectorizer.transform(ethics_data['test']['input'])
y_test = ethics_data['test']['label']

X_verify = vectorizer.transform(ethics_data['validation']['input'])
y_verify = ethics_data['validation']['label']

#setting a random seed for replicability
random.seed(561)

clf = LogisticRegression(max_iter=250)
clf.fit(X_train, y_train)

In [6]:
from sklearn.metrics import classification_report

print("On Test")
print(classification_report(y_test, clf.predict(X_test)))
print("On Validation")
print(classification_report(y_verify, clf.predict(X_verify)))

On Test
              precision    recall  f1-score   support

           0       0.48      0.59      0.53      1873
           1       0.53      0.42      0.47      2091

    accuracy                           0.50      3964
   macro avg       0.50      0.50      0.50      3964
weighted avg       0.51      0.50      0.50      3964

On Validation
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      2069
           1       0.77      0.65      0.70      1816

    accuracy                           0.74      3885
   macro avg       0.75      0.74      0.74      3885
weighted avg       0.75      0.74      0.74      3885



So we did pretty bad huh.

# BERT Classfication 

In [7]:
from huggingface_hub import login

login("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")

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/hlyon/.cache/huggingface/token
Login successful


In [71]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

#class labels
id2label = {0: "ACCEPTABLE", 1: "UNETHICAL"}
label2id = {"ACCEPTABLE": 0, "UNETHICAL": 1}

#grabbing the model and requisite tokenizer
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
#process text data
tokenized_ethics = ethics_data.map(lambda d : tokenizer(d['input'], truncation=True, max_length=512), batched=True)

In [10]:
# set up some components of pytorch training sequence
from transformers import DataCollatorWithPadding
import evaluate


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
from transformers import TrainingArguments, Trainer

#define trainer
training_args = TrainingArguments(

    output_dir="distilbert-ethics-test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,

)

trainer = Trainer(

    model=model,
    args=training_args,
    train_dataset=tokenized_ethics["train"],
    eval_dataset=tokenized_ethics["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,

)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/2610 [00:00<?, ?it/s]

{'loss': 0.6178, 'grad_norm': 5.169749736785889, 'learning_rate': 1.6168582375478928e-05, 'epoch': 0.57}


  0%|          | 0/248 [00:00<?, ?it/s]

Checkpoint destination directory distilbert-ethics-test/checkpoint-870 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 1.158385992050171, 'eval_accuracy': 0.42684157416750756, 'eval_runtime': 53.618, 'eval_samples_per_second': 73.93, 'eval_steps_per_second': 4.625, 'epoch': 1.0}
{'loss': 0.5206, 'grad_norm': 5.875209808349609, 'learning_rate': 1.2337164750957855e-05, 'epoch': 1.15}
{'loss': 0.4466, 'grad_norm': 7.600653648376465, 'learning_rate': 8.505747126436782e-06, 'epoch': 1.72}


  0%|          | 0/248 [00:00<?, ?it/s]

Checkpoint destination directory distilbert-ethics-test/checkpoint-1740 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 1.2441736459732056, 'eval_accuracy': 0.44046417759838546, 'eval_runtime': 48.2987, 'eval_samples_per_second': 82.073, 'eval_steps_per_second': 5.135, 'epoch': 2.0}
{'loss': 0.3762, 'grad_norm': 10.20341968536377, 'learning_rate': 4.674329501915709e-06, 'epoch': 2.3}
{'loss': 0.341, 'grad_norm': 7.073243618011475, 'learning_rate': 8.429118773946361e-07, 'epoch': 2.87}


  0%|          | 0/248 [00:00<?, ?it/s]

Checkpoint destination directory distilbert-ethics-test/checkpoint-2610 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 1.5285307168960571, 'eval_accuracy': 0.4445005045408678, 'eval_runtime': 49.3857, 'eval_samples_per_second': 80.266, 'eval_steps_per_second': 5.022, 'epoch': 3.0}
{'train_runtime': 2872.7481, 'train_samples_per_second': 14.526, 'train_steps_per_second': 0.909, 'train_loss': 0.45475349133955567, 'epoch': 3.0}


TrainOutput(global_step=2610, training_loss=0.45475349133955567, metrics={'train_runtime': 2872.7481, 'train_samples_per_second': 14.526, 'train_steps_per_second': 0.909, 'train_loss': 0.45475349133955567, 'epoch': 3.0})

In [100]:
#now we can check the accuracy
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="distilbert-ethics-test", tokenizer=tokenizer)
classifier("I cut up the baby.")

[{'label': 'UNETHICAL', 'score': 0.9840058088302612}]

So now we have a model that will try to decide if any given sentence is acceptable or unethical, let's do a quick and dirty accuracy test.

In [102]:
#very inefficient transformation
def clean(class_list):
    return [label2id[c['label']] for c in class_list]

classification_test = classifier(ethics_data['test']['input'], **{'padding':True,'truncation':True,'max_length':512})
classification_validation = classifier(ethics_data['validation']['input'], **{'padding':True,'truncation':True,'max_length':512})
cleaned_test = clean(classification_test)
cleaned_validation = clean(classification_validation)

In [103]:
print("DistilBert On Test")
print(classification_report(y_test, cleaned_test))
print("DistilBert On Validation")
print(classification_report(y_verify, cleaned_validation))

DistilBert On Test
              precision    recall  f1-score   support

           0       0.43      0.54      0.48      1873
           1       0.47      0.36      0.40      2091

    accuracy                           0.44      3964
   macro avg       0.45      0.45      0.44      3964
weighted avg       0.45      0.44      0.44      3964

DistilBert On Validation
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      2069
           1       0.85      0.80      0.82      1816

    accuracy                           0.84      3885
   macro avg       0.84      0.84      0.84      3885
weighted avg       0.84      0.84      0.84      3885

