In [None]:
!pip3 install datasets
!pip3 install transformers

In [None]:
import pandas as pd
import numpy as np
import json

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

## Code for Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# Root folder of the input/output files (keep the trailing slash: paths are built by concatenation).
# Colab alternative:
# dir = "/content/drive/MyDrive/Colab Notebooks/Data Mining/HW2/data/"
dir = "./data/"

# Input files
split_path = f"{dir}data_identification.csv"
emotion_path = f"{dir}emotion.csv"
data_path = f"{dir}tweets_DM.json"

# Derived / submission files
train_path = f"{dir}train.csv"
test_path = f"{dir}test.csv"
ss_path = f"{dir}sampleSubmission.csv"

Mounted at /content/drive


In [None]:
# Summarise emotion.csv (label counts) and data_identification.csv (train/test counts)

def _print_distribution(title, rule, frame, column, names):
    """Print a small count/share table for `frame[column]` and return the counts.

    Prints `title` followed by the frame shape, then `rule`, then one row per
    entry of `names` (in the given order) with the name, its count, and the
    count divided by the number of rows in `frame`.
    """
    counts = frame[column].value_counts()
    n_rows = frame.shape[0]
    print(title, end = ' ')
    print(frame.shape)
    print(rule)
    for name in names:
        # "%2f" is a minimum field width of 2, so the share prints with the default six decimals
        print("%-15s %-14d %2f" % (name, counts[name], counts[name] / n_rows))
    return counts


emotion = pd.read_csv(emotion_path)
split = pd.read_csv(split_path)

emotion = emotion.convert_dtypes()
a = _print_distribution(
    "Emotion distribution",
    "-------------",
    emotion,
    "emotion",
    ["joy", "anticipation", "trust", "sadness", "disgust", "fear", "surprise", "anger"],
)

print("\n\n")

a = _print_distribution(
    "Data split distribution",
    "----------------------------",
    split,
    "identification",
    ["train", "test"],
)

Emotion distribution (1455563, 2)
-------------
joy             516017         0.354514
anticipation    248935         0.171023
trust           205478         0.141167
sadness         193437         0.132895
disgust         139101         0.095565
fear            63999          0.043969
surprise        48729          0.033478
anger           39867          0.027389



Data split distribution (1867535, 2)
----------------------------
train           1455563        0.779403
test            411972         0.220597


In [None]:
# read data file: every non-blank line is one JSON document; only the
# ["_source"]["tweet"] payload of each document is kept.
# The encoding is given explicitly so decoding does not depend on the platform default,
# and the file is parsed line by line instead of being held in memory as a list of raw lines.
with open(data_path, "r", encoding="utf-8") as f:
    # blank lines (e.g. a trailing newline at the end of the file) are skipped
    # because json.loads would raise on them
    dict_list = [json.loads(line)["_source"]["tweet"] for line in f if line.strip()]

# convert json data to dataframe
big_data = pd.DataFrame(dict_list)

print(big_data.shape)
print(big_data.head(3))

# save test data
ids = split.loc[split["identification"] == "test", "tweet_id"].tolist()
# .copy() makes an independent frame, so adding or overwriting columns on it
# later does not raise SettingWithCopyWarning
test_data = big_data[big_data["tweet_id"].isin(ids)].copy()
# test_data.to_csv(dir + "test.csv", index=False)
# save train data
ids = split.loc[split["identification"] == "train", "tweet_id"].tolist()
train_data = big_data[big_data["tweet_id"].isin(ids)].copy()
# train_data.to_csv(dir + "train.csv", index=False)

(1867535, 3)
                        hashtags  tweet_id  \
0                     [Snapchat]  0x376b20   
1  [freepress, TrumpLegacy, CNN]  0x2d5350   
2                   [bibleverse]  0x28b412   

                                                text  
0  People who post "add me on #Snapchat" must be ...  
1  @brianklaas As we see, Trump is dangerous to #...  
2  Confident of your obedience, I write to you, k...  


In [None]:
# load tokenizer
# You can pick the model you want to use
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def _tokenize_frame(frame, tokenizer, batch_size=10000):
    """Return a new frame with lower-cased text and tokenizer output columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a "text" column of strings.
    tokenizer : callable
        Called as ``tokenizer(list_of_str, truncation=True)``; the result must
        provide "input_ids" and "attention_mask" with one entry per input text.
    batch_size : int
        Number of texts handed to the tokenizer per call.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is left unmodified) whose "text" column is
        lower-cased and which has "input_ids" and "attention_mask" columns,
        each holding one list of ints per row.
    """
    # lower casing the text
    texts = frame["text"].str.lower()

    input_ids = []
    attention_mask = []
    # Tokenize in batches: looking up one row at a time with frame.loc[ind]["text"]
    # builds a Series per row and calls the tokenizer once per tweet.
    for start in range(0, len(texts), batch_size):
        batch = texts.iloc[start:start + batch_size].tolist()
        encoded = tokenizer(batch, truncation=True)
        input_ids.extend(encoded["input_ids"])
        attention_mask.extend(encoded["attention_mask"])

    # assign() builds a new DataFrame, so nothing is written onto a slice of another frame
    return frame.assign(text=texts, input_ids=input_ids, attention_mask=attention_mask)


# turn words to ids [train_data]
train_data = _tokenize_frame(train_data, tokenizer)

# turn words to ids [test_data]
test_data = _tokenize_frame(test_data, tokenizer)

In [None]:
# Append emotion label to dataset

# make hashmap [tweet_id: emotion]
# Built in a single pass over the two columns. Indexing emotion.loc[ind] inside a
# Python loop constructs a new row Series on every access, which is very slow
# for a frame of this size. If a tweet_id occurs more than once, the last row wins.
hashmap = dict(zip(emotion["tweet_id"], emotion["emotion"]))

# look up the emotion of one tweet in the module-level hashmap
def get_emotion_from_id(id):
    """Return the value stored in `hashmap` under the key `id`.

    Raises KeyError when `id` is not a key of `hashmap`.
    """
    label = hashmap[id]
    return label

# One dictionary lookup per tweet id, so labelling every row is O(N) overall
train_data["emotion"] = train_data["tweet_id"].map(get_emotion_from_id)
# Scanning the emotion frame once per tweet id instead would take O(N^2) time:
# train_data["emotion"] = train_data["tweet_id"].apply(lambda x : emotion.loc[emotion["tweet_id"] == x].emotion.item())

In [None]:
# Turn emotion to number labels

# Position in this list is the integer class id of the emotion
emotion_list = [
    "joy",
    "anticipation",
    "trust",
    "surprise",
    "sadness",
    "fear",
    "disgust",
    "anger",
]
# name -> class id, derived from the list so the two cannot drift apart
emotion_map = {name: class_id for class_id, name in enumerate(emotion_list)}
# dict.__getitem__ (not the dict itself) is mapped so an unknown emotion raises KeyError
train_data["label"] = train_data["emotion"].map(emotion_map.__getitem__)

In [None]:
def _export_dataset(frame, folder_name):
    """Convert `frame` to a Dataset, save it under `dir + folder_name`, print it and return it."""
    dataset = Dataset.from_pandas(frame)
    dataset.save_to_disk(dir + folder_name)
    print(dataset)
    return dataset


total = train_data.shape[0]
# rows before this position go to training, the remainder to validation (existing row order, no shuffling)
cut = int(total * 0.8)
df1 = train_data.iloc[:cut]
df2 = train_data.iloc[cut:]

# prepare training dataset
train_ds = _export_dataset(df1, "train_dataset")
valid_ds = _export_dataset(df2, "valid_dataset")

# prepare submission dataset
test_ds = _export_dataset(test_data, "test_dataset")

Dataset({
    features: ['hashtags', 'tweet_id', 'text', 'input_ids', 'attention_mask', 'emotion', 'label', '__index_level_0__'],
    num_rows: 1164450
})
Dataset({
    features: ['hashtags', 'tweet_id', 'text', 'input_ids', 'attention_mask', 'emotion', 'label', '__index_level_0__'],
    num_rows: 291113
})
Dataset({
    features: ['hashtags', 'tweet_id', 'text', 'input_ids', 'attention_mask', '__index_level_0__'],
    num_rows: 411972
})
