In [107]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-2016train-a/twitter-2016train-A.tsv

/kaggle/input/twitter/twitter-2016test-A.tsv

/kaggle/input/twitter-dev/twitter-2016dev-A.tsv


In [3]:
import torch
torch.cuda.is_available()

True

In [4]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def preprocess_tweet(tweet):
    if tweet is None or not isinstance(tweet, str):
        return ""
    
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub(r'@', '', tweet)
    tweet = re.sub(r'&[a-z]+;', '', tweet)

    emoticons = {
        ":)": "<smile>",
        ":(": "<sadface>",
        ":D": "<lolface>",
        ":-)": "<smile>",
        ":-(": "<sadface>",
        ":-D": "<lolface>",
        ";)": "<wink>",
        ";(": "<sadface>",
        ";D": "<lolface>",
        ";-)": "<wink>",
        ";-(": "<sadface>",
        ";-D": "<lolface>"
    }
    
    for emoticon, replacement in emoticons.items():
        tweet = tweet.replace(emoticon, replacement)

    tweet = tweet.lower()

    tweet = tweet.replace(" u ", " you ")

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(tweet)
    filtered_tweet = [w for w in word_tokens if not w in stop_words]

    return ' '.join(filtered_tweet)


In [6]:
class TwitterSentimentDataset(Dataset):
    def __init__(self, tweets, labels, tokenizer):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        tweet = preprocess_tweet(str(self.tweets[item]))
        label = int(self.labels[item])

        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
            
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
def load_data(filename):
    df = pd.read_csv(filename, sep='\t', header=None, names=['id', 'sentiment', 'tweet'])
    df['sentiment'] = df['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
    df.dropna(subset=['tweet', 'sentiment'], inplace=True)
    df['tweet'] = df['tweet'].apply(preprocess_tweet)
    return df


In [11]:
df_train = load_data('/kaggle/input/twitter-2016train-a/twitter-2016train-A.tsv')
df_test = load_data('/kaggle/input/twitter-2016test/twitter-2016test-A.tsv')
df_val = load_data('/kaggle/input/twitter-dev/twitter-2016dev-A.tsv')

train_datasets = TwitterSentimentDataset(df_train['tweet'].to_numpy(), df_train['sentiment'].to_numpy(), tokenizer)
test_datasets = TwitterSentimentDataset(df_test['tweet'].to_numpy(), df_test['sentiment'].to_numpy(), tokenizer)
val_datasets = TwitterSentimentDataset(df_val['tweet'].to_numpy(), df_val['sentiment'].to_numpy(), tokenizer)

In [12]:
len(val_datasets)

1966

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': acc,
    }


In [16]:
print(train_datasets[0])
print(test_datasets[0])
print(val_datasets[0])


{'input_ids': tensor([  101,  6203,  7513,  2047, 21511,  8873,  3401,  6097,  2307,  1010,
         1048,  6038,  2278, 10651,  1029,  1039,  1005, 12256,  1012,   102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(0)}
{'input_ids': tensor([  101,  3861,  4580,  1005,  1055,  1010,  5061, 12305,  1005,  1055,
         1010,  1005,  5074,  5380,  1024,  2813,  2140,  1011,  3098,  2756,
        17419,  2437,  5975,  1012,  3422,  9117,  5291,  2962,  1011,  2298,
         1012,  1012,  1012,   102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(1)}
{'input_ids': tensor([  101,  5709,  3786,  1011,  2745,  4027,  1011, 10874,  1006, 10965,
         5315,  3179,  1007,  1031, 10751,  1033,   102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(1)}


In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="my_baseline_model",
    evaluation_strategy="epoch",
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=test_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model("my_baseline_model")



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.040679,0.622638,0.609345,0.601023,0.609345


In [21]:
def model_init():
    return BertForSequenceClassification.from_pretrained("my_baseline_model")

learning_rates = [1e-5, 3e-5, 5e-5]
batch_sizes = [8, 16]

results = {}
best_loss = float("inf")
best_model = None

for lr in learning_rates:
    for batch_size in batch_sizes:
        training_args = TrainingArguments(
            output_dir=f'./results/lr_{lr}_bs_{batch_size}',
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            num_train_epochs=3,
            evaluation_strategy="epoch",
        )

        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=train_datasets,
            eval_dataset=val_datasets,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        eval_result = trainer.evaluate()
        eval_loss = eval_result["eval_loss"]
        results[f"lr_{lr}_bs_{batch_size}"] = eval_loss

        if eval_loss < best_loss:
            best_loss = eval_loss
            best_model_dir = f"./best_model/lr_{lr}_bs_{batch_size}"
            trainer.save_model(best_model_dir)
            best_model = trainer.model

for key, value in results.items():
    print(f"{key}: Loss = {value}")

print(f"Best setup: {best_model_dir} with loss = {best_loss}")



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.588631,0.582021,0.584435,0.582841,0.584435
2,0.201700,1.551923,0.584864,0.58647,0.582579,0.58647
3,0.209300,1.66152,0.586161,0.580366,0.579347,0.580366


Checkpoint destination directory ./results/lr_1e-05_bs_8/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/lr_1e-05_bs_8/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.271119,0.596751,0.585453,0.588589,0.585453
2,No log,1.432721,0.587164,0.589013,0.586158,0.589013
3,0.207600,1.452635,0.58367,0.579349,0.57763,0.579349


Checkpoint destination directory ./results/lr_1e-05_bs_16/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.120221,0.571846,0.576297,0.573586,0.576297
2,0.238500,1.504399,0.583181,0.584435,0.575963,0.584435
3,0.195400,2.030499,0.587638,0.584435,0.582751,0.584435


Checkpoint destination directory ./results/lr_3e-05_bs_8/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/lr_3e-05_bs_8/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.144184,0.578103,0.568667,0.572092,0.568667
2,No log,1.557032,0.58382,0.58647,0.580608,0.58647
3,0.200100,1.739992,0.594826,0.583418,0.583038,0.583418


Checkpoint destination directory ./results/lr_3e-05_bs_16/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.954095,0.570456,0.576806,0.568001,0.576806
2,0.286300,1.35304,0.579195,0.581384,0.576148,0.581384
3,0.209400,2.20765,0.574346,0.573245,0.569737,0.573245


Checkpoint destination directory ./results/lr_5e-05_bs_8/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/lr_5e-05_bs_8/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.045865,0.582929,0.570702,0.574895,0.570702
2,No log,1.490745,0.58088,0.579858,0.577941,0.579858
3,0.228600,1.765489,0.585895,0.576806,0.57551,0.576806


Checkpoint destination directory ./results/lr_5e-05_bs_16/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


lr_1e-05_bs_8: Loss = 1.66152024269104
lr_1e-05_bs_16: Loss = 1.4526346921920776
lr_3e-05_bs_8: Loss = 2.0304994583129883
lr_3e-05_bs_16: Loss = 1.7399917840957642
lr_5e-05_bs_8: Loss = 2.2076497077941895
lr_5e-05_bs_16: Loss = 1.7654887437820435
Best setup: ./best_model/lr_1e-05_bs_16 with loss = 1.4526346921920776


In [22]:
model = BertForSequenceClassification.from_pretrained('./best_model/lr_1e-05_bs_16')

training_args = TrainingArguments(
    output_dir=f'./results',
    do_train=False,
    do_predict=True
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

predictions = trainer.predict(test_datasets)


print(predictions.metrics)




{'test_loss': 1.4426931142807007, 'test_precision': 0.6157293889833815, 'test_recall': 0.6028984102365258, 'test_f1': 0.5942734150381467, 'test_accuracy': 0.6028984102365258, 'test_runtime': 101.4047, 'test_samples_per_second': 203.462, 'test_steps_per_second': 12.721}
