In [1]:
import pandas as pd

##read original file
df = pd.read_json("kaggle_data/tweets_DM.json",lines = True)
df.head()



Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets


In [2]:
def get_id(source):
    tweet = source["tweet"]
    return tweet.get('tweet_id')

def get_hashtag(source):
    tweet = source["tweet"]
    return tweet.get('hashtags')

def get_text(source):
    tweet = source["tweet"]
    return tweet.get('text')

In [3]:
data = pd.DataFrame()
data["tweet_id"] = df["_source"].apply(lambda x: get_id(x))
data["hashtags"] = df["_source"].apply(lambda x: get_hashtag(x))
data["text"] = df["_source"].apply(lambda x: get_text(x))



In [4]:
data_identification = pd.read_csv("kaggle_data/data_identification.csv")
data_emotion = pd.read_csv("kaggle_data/emotion.csv")

print(len(data_identification))
print(len(data_emotion))

1867535
1455563


In [5]:
data_emotion["emotion"].unique()

array(['sadness', 'disgust', 'anticipation', 'joy', 'trust', 'anger',
       'fear', 'surprise'], dtype=object)

In [6]:
train_data_ids = data_identification[data_identification["identification"]=="train"]
train_data = pd.merge(train_data_ids, data_emotion, on="tweet_id", how='left')
train_data = pd.merge(train_data,data, on="tweet_id", how='left')

train_data = train_data[["tweet_id","hashtags","text","emotion"]]
print(len(train_data))

1455563


In [7]:
test_data_ids = data_identification[data_identification["identification"]=="test"]
test_data = pd.merge(test_data_ids, data, on="tweet_id", how='left')
test_data = test_data[["tweet_id","hashtags","text"]]
print(len(test_data))

411972


In [8]:
import re

def remove_mention(text):
    return re.sub("@[\w]+",'',text)

In [9]:
train_data["text"] = train_data["text"].apply(lambda x: remove_mention(x))

# bert tensorflow

In [10]:
import datasets
from datasets import Dataset

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [12]:
emo_dict = {'sadness': 0, 'disgust': 1,  'anticipation': 2, 'joy': 3, 'trust': 4, 'anger': 5, 'fear': 6, 'surprise': 7}

def map_emotion(emotion):
    return emo_dict.get(emotion)


#'sadness', 'disgust', 'anticipation', 'joy', 'trust', 'anger','fear', 'surprise'], dtype=object)

In [13]:
bert_train_data = train_data[["text","emotion"]]
bert_train_data["label"] = bert_train_data["emotion"].apply(lambda x: map_emotion(x))
bert_train_data = bert_train_data[["text","label"]] 
bert_train_data[:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bert_train_data["label"] = bert_train_data["emotion"].apply(lambda x: map_emotion(x))


Unnamed: 0,text,label
0,Huge Respect🖒 talking about losing his dad to...,3
1,Yoooo we hit all our monthly goals with the ne...,3
2,Well done team 🌟 <LH> of every one of you.,4
3,Come join on #PUBG while he strives for chick...,3
4,Blessings!My #strength little. My #bones brit...,2
5,Never give up. The manifestation of your goal ...,2
6,I Believe When No One Else Does... <LH> #Dream...,2
7,with due respect... Do u have any sympathies ...,3
8,I can't tell if I'm alive or in the after life...,0
9,#GRATEFUL!! WORLD GOODMORNING!,4


In [14]:
train_dataset = Dataset.from_pandas(bert_train_data[:1200000])
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1200000
})

In [15]:
test_dataset = Dataset.from_pandas(bert_train_data[1200000:])
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 255563
})

In [16]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [17]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
print(tokenized_train_dataset)

Map:   0%|          | 0/1200000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1200000
})


In [18]:
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
print(tokenized_test_dataset)

Map:   0%|          | 0/255563 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 255563
})


In [19]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [21]:
id2label = {0: 'sadness', 1: 'disgust', 2: 'anticipation', 3: 'joy', 4: 'trust', 5: 'anger', 6: 'fear', 7: 'surprise'}

label2id = {'sadness': 0, 'disgust': 1,  'anticipation': 2, 'joy': 3, 'trust': 4, 'anger': 5, 'fear': 6, 'surprise': 7}


In [22]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8, id2label=id2label, label2id=label2id)
device = torch.device('cuda')
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [None]:
training_args = TrainingArguments(
    output_dir="my_gpt2_model",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1200000
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 750000
  Number of trainable parameters = 124445952
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
test_data["text"] = test_data["text"].apply(lambda x: remove_mention(x))

In [None]:
print(test_data["text"][0])

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="my_model/checkpoint-28128", device = 0)


In [None]:
def classify_emotion(text):
    return classifier(text)[0].get("label")

In [None]:
test_data.head()


In [None]:
#test_data['emotion'] = test_data.apply(lambda x: classify_emotion(x["text"]), axis = 1)
#test_data.head()

In [None]:
from tqdm import tqdm

answer_list = []
for i in tqdm(range(len(test_data))):
    answer_list.append(classify_emotion(test_data['text'][i]))

test_data['emotion'] = answer_list

answer_df = test_data[["tweet_id", 'emotion']]
answer_df.reset_index(drop=True, inplace=True)


answer_df.rename(columns={"tweet_id": "id","emotion": "emotion"},inplace = True)
answer_df.to_csv("submission_bistilbert_10epoch.csv",index= False)

In [None]:
answer_df.rename(columns={"tweet_id": "id","emotion": "emotion"},inplace = True)
answer_df.to_csv("submission_bistilbert_20epoch.csv",index= False)