In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import numpy as np
import pandas as pd 

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

Data preprocessing

In [66]:
train_df=pd.read_csv('tweet_emotions.csv')
train_df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [6]:
import string
punc=string.punctuation
def clean_text(text):
    text=str(text).lower()
    text=' '.join(w for w in text.split(' ') if w not in punc)
    return text

In [7]:
train_df['content']=train_df['content'].apply(clean_text)

In [8]:
num_labels=len(train_df['sentiment'].unique())
num_labels

13

In [9]:
train_df=train_df.drop('tweet_id', axis=1)
train_df=train_df.rename(columns={'content':'text', 'sentiment':'labels' })

In [10]:
train_df.head()

Unnamed: 0,labels,text
0,empty,@tiffanylue i know i was listenin to bad habit...
1,sadness,layin n bed with a headache ughhhh...waitin on...
2,sadness,funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends soon!
4,neutral,@dannycastillo we want to trade with someone w...


In [11]:
from sklearn.preprocessing import LabelEncoder
lb_ecd=LabelEncoder()
train_df['labels']=lb_ecd.fit_transform(train_df['labels'])

In [12]:
train_df.head()

Unnamed: 0,labels,text
0,2,@tiffanylue i know i was listenin to bad habit...
1,10,layin n bed with a headache ughhhh...waitin on...
2,10,funeral ceremony...gloomy friday...
3,3,wants to hang out with friends soon!
4,8,@dannycastillo we want to trade with someone w...


In [13]:
max_length=max(len(w.split()) for w in train_df['text'])
max_length

33

Model, tokenization

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
checkpoint ="cardiffnlp/twitter-roberta-base-sentiment-latest"
#"distilbert-base-uncased-finetuned-sst-2-english"
model=AutoModelForSequenceClassification.from_pretrained(checkpoint, 
                                                         num_labels=13,
                                                         ignore_mismatched_sizes = True
                                                         ).to(device) # 
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Prepare a dataset

In [None]:
from transformers import DataCollatorWithPadding
from datasets import Dataset, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [67]:
train_dts=Dataset.from_pandas(train_df)
train_dts=train_dts.train_test_split(test_size=0.2)
train_dts

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 8000
    })
})

In [None]:
tokenizer.pad_token=tokenizer.eos_token
def token_func(data):
    return tokenizer(data['text'], truncation=True)
train_ecd=train_dts.map(token_func, batched=True)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 32000/32000 [00:00<00:00, 32034.54 examples/s]
Map: 100%|██████████| 8000/8000 [00:00<00:00, 37051.58 examples/s]


In [34]:
train_ecd

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 8000
    })
})

In [None]:
train_ecd=train_ecd.remove_columns('text')
train_ecd=train_ecd.with_format('torch')
data_collator=DataCollatorWithPadding(tokenizer)

In [None]:
import transformers
from transformers import TrainingArguments, Trainer

In [37]:
train_ecd

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 8000
    })
})

Training

In [None]:
batch_size=32
train_args=TrainingArguments(
    output_dir='cls_fn',
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='epoch'
)

In [31]:
from sklearn.metrics import accuracy_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [42]:
trainer=Trainer(
    model,
    train_args,
    train_dataset=train_ecd['train'],
    eval_dataset=train_ecd['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer=Trainer(


In [43]:
trainer.train()

 17%|█▋        | 500/3000 [01:25<07:22,  5.64it/s]

{'loss': 1.8458, 'grad_norm': 5.908685207366943, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}


 33%|███▎      | 1000/3000 [02:51<05:24,  6.17it/s]

{'loss': 1.7676, 'grad_norm': 6.730182647705078, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


                                                   
 33%|███▎      | 1001/3000 [03:04<2:13:13,  4.00s/it]

{'eval_loss': 1.7317336797714233, 'eval_accuracy': 0.416, 'eval_runtime': 11.3756, 'eval_samples_per_second': 703.258, 'eval_steps_per_second': 21.977, 'epoch': 1.0}


 50%|█████     | 1500/3000 [04:29<04:17,  5.82it/s]  

{'loss': 1.6035, 'grad_norm': 7.04753303527832, 'learning_rate': 2.5e-05, 'epoch': 1.5}


 67%|██████▋   | 2000/3000 [05:57<03:35,  4.65it/s]

{'loss': 1.5813, 'grad_norm': 7.134866714477539, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                                   
 67%|██████▋   | 2001/3000 [06:10<1:07:54,  4.08s/it]

{'eval_loss': 1.7646369934082031, 'eval_accuracy': 0.412625, 'eval_runtime': 11.5033, 'eval_samples_per_second': 695.451, 'eval_steps_per_second': 21.733, 'epoch': 2.0}


 83%|████████▎ | 2500/3000 [07:36<01:25,  5.83it/s]  

{'loss': 1.3741, 'grad_norm': 9.706355094909668, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


100%|██████████| 3000/3000 [09:03<00:00,  5.90it/s]

{'loss': 1.3558, 'grad_norm': 9.023650169372559, 'learning_rate': 0.0, 'epoch': 3.0}


                                                   
100%|██████████| 3000/3000 [09:16<00:00,  5.39it/s]

{'eval_loss': 1.8553322553634644, 'eval_accuracy': 0.40275, 'eval_runtime': 11.688, 'eval_samples_per_second': 684.46, 'eval_steps_per_second': 21.389, 'epoch': 3.0}
{'train_runtime': 557.0056, 'train_samples_per_second': 172.35, 'train_steps_per_second': 5.386, 'train_loss': 1.588019755045573, 'epoch': 3.0}





TrainOutput(global_step=3000, training_loss=1.588019755045573, metrics={'train_runtime': 557.0056, 'train_samples_per_second': 172.35, 'train_steps_per_second': 5.386, 'total_flos': 2159187439338432.0, 'train_loss': 1.588019755045573, 'epoch': 3.0})

In [46]:
trainer.save_model('./clf_fn')

In [65]:
from transformers import pipeline
clf=pipeline('text-classification', model='clf_fn', device=device)
print(clf('I really enjoyed this match'))

[{'label': 'LABEL_7', 'score': 0.531326413154602}]


In [64]:
lb_ecd.inverse_transform([7])

array(['love'], dtype=object)