In [24]:
import torch
import numpy as np
import pandas as pd
import ast
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Load dataset
train = pd.read_csv('../train_data_mod.csv')
test = pd.read_csv('../test_data_mod.csv')

In [4]:
train_mod = train.copy()
test_mod = test.copy()

-  instead of using the preprocessed text, we will use the original text applied.
- This is because BERT can learn context and relationships between words (including mispelled words), which makes it different from standard preprocessing techniques. 
- It also has it's own special tokenizer


In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 513kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 


In [15]:
tokenizer_distil = BertTokenizer.from_pretrained('distilbert-base-cased')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 6.16MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 3.61kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 411/411 [00:00<?, ?B/s] 
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [28]:
train_mod.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'preprocess_text',
       'bigram', 'trigram', 'pos', 'keyword_encoded', 'tweet_length',
       'punctuation_count'],
      dtype='object')

In [29]:
# drop unnecessary columns
drop_cols = ['keyword', 'location', 'preprocess_text','bigram','trigram', 'pos']


In [30]:
train_mod.drop(drop_cols, axis=1, inplace=True)
test_mod.drop(drop_cols, axis=1, inplace=True)

In [31]:
train_mod.columns

Index(['id', 'text', 'target', 'keyword_encoded', 'tweet_length',
       'punctuation_count'],
      dtype='object')

In [32]:
test_mod.columns

Index(['id', 'text', 'keyword_encoded', 'tweet_length', 'punctuation_count'], dtype='object')

In [33]:
numerical_features = ['keyword_encoded', 'tweet_length', 'punctuation_count']

In [None]:
# Standardize the numerical features
from sklearn.preprocessing import StandardScaler

# Assuming you have a dataset X that you want to standardize

# Create an instance of the StandardScaler class
scaler = StandardScaler()

# Fit the scaler to the dataset
scaler.fit(train_mod[numerical_features])

# Transform the dataset using the fitted scaler
train_mod_scaled = scaler.transform(train_mod[numerical_features])


#### Custom Dataset
This is class that will contain the following:
- Constructor to take in text and numerical features, corresponding targets, BERT tokenizer, maximum tokens in 1 tweet sentence
- length function to return the length of the text
- get_items function so that given an input, it will retrieve the corresponding text, numerical feature, target value.

In [25]:
class CustomDataset(Dataset):
    def __init__(self, texts, numerical_features, targets, tokenizer, max_len):
        self.texts = texts
        self.numerical_features = numerical_features
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        num_feat = self.numerical_features[item]
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'numerical_features': torch.tensor(num_feat, dtype=torch.float),
            'targets': torch.tensor(target, dtype=torch.long)
        }


In [26]:
def max_length(text_column, tokenizer):
    max_len = 0
    for text in text_column:
        tokens = tokenizer.tokenize(text)
        # print(tokens)
        max_len = max(max_len, len(tokens))
    print("Max length: ", max_len, " tokens")
    return max_len

In [27]:
max_length(train_mod['text'], tokenizer)
max_length(test_mod['text'], tokenizer)

Max length:  104  tokens
Max length:  99  tokens


99

In [22]:
def create_data_loader(text_feature, numerical_features, targets, tokenizer, max_len, batch_size):
    dataset = CustomDataset(
        texts=text_feature.to_numpy(),
        numerical_features=numerical_features.to_numpy(),
        targets=targets.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=4
    )


In [None]:
class BertWithNumericalFeatures(BertForSequenceClassification):
    def __init__(self, config, num_numerical_features):
        super().__init__(config)
        self.num_numerical_features = num_numerical_features
        # Concatenates the numerical features to the output of the BERT model
        # This is then represented as a Linear Layer which is then passed to the Dropout and Classifier layers
        self.dense = torch.nn.Linear(config.hidden_size + num_numerical_features, config.hidden_size)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        # We have an output layer with 2 nodes
        # This will output the logits, and then be transformed through a softmax function to get the probabilities
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, numerical_features, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)

        pooled_output = outputs[1]
        pooled_output = torch.cat((pooled_output, numerical_features), dim=1)
        pooled_output = self.dense(pooled_output)
        pooled_output = torch.nn.ReLU()(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            if self.num_labels == 1:
                loss_fct = torch.nn.BCEWithLogitsLoss()
                loss = loss_fct(logits.view(-1), labels.view(-1).float())
            else:
                loss_fct = torch.nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return ((logits,),) + outputs[2:] if loss is None else ((loss, logits),) + outputs[2:]



In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model = model.train()
    losses = []

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        numerical_features = d["numerical_features"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, numerical_features=numerical_features, attention_mask=attention_mask, labels=targets)
        loss = outputs[0]

        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses)

In [None]:
def eval_model(model, data_loader, device):
    model = model.eval()
    outputs = []
    targets_list = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            numerical_features = d["numerical_features"].to(device)
            targets = d["targets"].to(device)

            logits = model(input_ids=input_ids, numerical_features=numerical_features, attention_mask=attention_mask)[0]
            _, preds = torch.max(logits, dim=1)

            outputs.extend(preds)
            targets_list.extend(targets)

    outputs = torch.stack(outputs).cpu()
    targets_list = torch.stack(targets_list).cpu()

    return accuracy_score(targets_list, outputs), f1_score(targets_list, outputs, average='weighted')


In [None]:
def main(df, n_splits=5, epochs=5):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Set up cross-validation
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(df, df.target)):
        print(f'FOLD {fold + 1}/{n_splits}')

        train_df = df.loc[train_index].reset_index(drop=True)
        val_df = df.loc[val_index].reset_index(drop=True)

        # Set up DataLoaders
        train_data_loader = create_data_loader(train_df, tokenizer, max_len=128, batch_size=16)
        val_data_loader = create_data_loader(val_df, tokenizer, max_len=128, batch_size=16)

        model = BertWithNumericalFeatures.from_pretrained('bert-base-cased', num_numerical_features=train_df.numerical_features.shape[1], num_labels=2)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=2e-5)
        total_steps = len(train_data_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)
        best_f1 = 0
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')
            train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler)
            print(f'Train loss: {train_loss}')

            acc, f1 = eval_model(model, val_data_loader, device)
            print(f'Val Accuracy: {acc}, F1: {f1}')

            if f1 > best_f1:
                best_f1 = f1
                torch.save(model.state_dict(), f'model_{fold + 1}.bin')

        fold_scores.append(best_f1)

    print(f'Mean F1 score: {np.mean(fold_scores)}, std: {np.std(fold_scores)}')

