In [None]:
!pip3 install datasets
!pip3 install transformers

In [4]:
import pandas as pd
import numpy as np
import pickle

from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# from google.colab import drive
# drive.mount('/content/drive')

# dir = "/content/drive/MyDrive/Colab Notebooks/Data Mining/HW2/data/"
# Base data directory. The trailing slash is required because the paths below
# are built by plain string concatenation (without it they would become
# "./datadata_identification.csv", etc.).
# NOTE(review): `dir` shadows the Python builtin; the name is kept so existing
# references keep working.
dir = "./data/"
split_path = dir + "data_identification.csv"
emotion_path = dir + "emotion.csv"
data_path = dir + "tweets_DM.json"
train_path = dir + "train.csv"
test_path = dir + "test.csv"
ss_path = dir + "sampleSubmission.csv"

Mounted at /content/drive


In [5]:
# Download the pretrained tokenizer and sequence-classification model.
# Both are created from the same checkpoint name; the classifier is built
# with 8 output labels.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=8)
# Collator that is handed the tokenizer; passed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

In [6]:
# Emotion names, ordered so that each name's position is its numeric class id.
emotion_list = [
    "joy",
    "anticipation",
    "trust",
    "surprise",
    "sadness",
    "fear",
    "disgust",
    "anger",
]

# Emotion name -> numeric class id, derived from the positions in emotion_list.
emotion_map = {name: class_id for class_id, name in enumerate(emotion_list)}

In [None]:
# Load the train and test datasets from their on-disk directories
# (presumably written earlier with `save_to_disk` -- not shown in this notebook).
train_ds, test_ds = (
    load_from_disk(folder) for folder in ("train_dataset", "test_dataset")
)

In [None]:
# Training takes about 30 hours to run, so this cell was not executed in this notebook.

def compute_metrics(eval_pred):
    """Compute accuracy for a batch of evaluation predictions.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            sample is the argmax of ``logits`` over its last axis; ``labels``
            holds the reference class ids (assumed to have one entry per
            prediction -- TODO confirm against the dataset).

    Returns:
        dict: ``{'accuracy': float, 'correct': int}`` where ``correct`` is the
        number of predictions equal to their label and ``accuracy`` is
        ``correct`` divided by the number of predictions (``0.0`` when there
        are no predictions, instead of raising ``ZeroDivisionError``).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    total = predictions.shape[0]
    # Vectorized comparison replaces the per-sample Python loop; cast to a
    # plain int so the metric stays a built-in Python number.
    correct = int(np.count_nonzero(predictions == np.asarray(labels)))
    accuracy = correct / total if total else 0.0
    return {'accuracy': accuracy, 'correct': correct}

# Hyper-parameters for fine-tuning; checkpoints are written to ./results and
# at most 5 of them are kept (save_total_limit).
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=5,
)

# Wire the model, datasets, tokenizer, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("start training!")

trainer.train()

# Persist the fine-tuned model so it can be reused without retraining.
trainer.save_model("DistilledBERT-Model")

print("saved model and predict!")

# Run the trained model over the test dataset and keep the raw prediction
# output on disk. The context manager guarantees the file is flushed and
# closed even if pickling fails (the bare open() call leaked the handle).
predictions = trainer.predict(test_ds)
with open("predictions.pkl", "wb") as predictions_file:
    pickle.dump(predictions, predictions_file)
print(predictions)

In [None]:
# After training, build the dataset used to produce the submission file.
# Read the test CSV, lower-case the tweet text and drop the leftover
# "Unnamed: 0" index column.
sub_csv = pd.read_csv("test.csv", lineterminator='\n')
sub_csv["text"] = sub_csv["text"].str.lower()
sub_csv = sub_csv.drop(columns=["Unnamed: 0"])

# Tokenize every row of the submission frame. The original loop iterated over
# an undefined name `a` (NameError on a fresh kernel); it must walk `sub_csv`.
input_ids = []
attention_mask = []

print("Running tokenizer...")

for text in sub_csv["text"]:
    tokenized_text = tokenizer(text, truncation=True)
    input_ids.append(tokenized_text["input_ids"])
    attention_mask.append(tokenized_text["attention_mask"])

sub_csv["input_ids"] = input_ids
sub_csv["attention_mask"] = attention_mask

# Turn the DataFrame into a Dataset and save it to disk for later reuse.
sub_ds = Dataset.from_pandas(sub_csv)
sub_ds.save_to_disk("submission_dataset")

In [None]:
print("Predicting...")

# Predict on the submission dataset and store the raw prediction output.
# NOTE(review): relies on `trainer` and `sub_ds` created by earlier cells.
predictions = trainer.predict(sub_ds)
# Use a context manager so the pickle file is always closed.
with open("submission.pkl", "wb") as submission_file:
    pickle.dump(predictions, submission_file)
print(predictions)

In [None]:
# Load the stored prediction output. The file is the one written by this
# notebook; never unpickle files from untrusted sources (pickle can execute
# arbitrary code). The context manager closes the handle after loading.
with open("submission.pkl", "rb") as submission_file:
    sub_data = pickle.load(submission_file)
print(sub_data.predictions.shape)

# Predicted class id per row = argmax over the last axis of the predictions.
pred = np.argmax(sub_data.predictions, axis=-1)
print(pred.shape)

# Read test.csv again and attach the predicted emotion name to each row.
sub_csv = pd.read_csv("test.csv", lineterminator='\n')
emo = [emotion_list[x] for x in pred]
sub_csv["emotion"] = emo
# Keep only the identifier and the prediction.
# NOTE(review): the rename below assumes exactly two columns remain after the
# drop (the tweet id followed by "emotion") -- confirm against test.csv.
sub_csv = sub_csv.drop(columns=["Unnamed: 0", "hashtags", "text"])
sub_csv.columns = ["id", "emotion"]
sub_csv.to_csv("submission.csv", index=False)

print(sub_csv.head(20))