In [1]:
%%capture
!pip install transformers[sentencepiece]
import transformers

Processing the data.

In [2]:
# Mount Google Drive in the Colab runtime so the project folder under MyDrive
# (see PATH_TO_FOLDER) is reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Project folder inside the Drive mount point ('/content/drive').
# Kept absolute rather than relative to the current directory so that the
# `%cd $PATH_TO_FOLDER` cell still succeeds when it is re-run after the
# working directory has already been changed.
PATH_TO_FOLDER = "/content/drive/MyDrive/Name2Demographics/"

In [4]:
# Work from the project folder so the relative paths used later in the
# notebook (the sys.path entry and the model save path) resolve.
%cd $PATH_TO_FOLDER

/content/drive/MyDrive/Name2Demographics


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 


import sys  
sys.path.insert(0, 'Models/CBSEData/CBSEPreprocessing/')
from parse_cbse import *

In [6]:
# Load the CBSE records; the train/validation split cell reads the "Name" and
# "Gender" columns of this object.
# NOTE(review): CBSEData presumably comes from the wildcard `parse_cbse` import - confirm.
cbse_df = CBSEData()

In [7]:
%%capture
!pip install datasets

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

# Single source of truth for the pretrained checkpoint, so the tokenizer, the
# model and the config cannot drift apart if the checkpoint is ever changed.
MODEL_NAME = "google/muril-base-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# num_labels=2: two-class classification head (labels 0 and 1).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Configuration of the same checkpoint (AutoConfig is imported once, above).
config = AutoConfig.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/181 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/113 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/909M [00:00<?, ?B/s]

In [9]:
from sklearn.model_selection import train_test_split

# Names are the model inputs; the Gender column is cast to int to serve as the
# class label.
X = list(cbse_df["Name"])
y = list(cbse_df["Gender"].values.astype('int'))

# Hold out 30% of the rows for validation. random_state pins the shuffle so the
# split (and therefore the reported validation metrics) is reproducible across
# kernel restarts; 42 matches the seed passed to TrainingArguments.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Tokenize each split separately, padding/truncating within each call.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True)

In [10]:
import torch

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to a per-example sequence. Must
            contain the key "input_ids", which determines the dataset length.
        labels: Optional per-example labels (list, numpy array, tensor, ...).
            ``None`` or an empty container yields items without a "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:` - truthiness
        # raises for multi-element numpy arrays and tensors, which are natural
        # label containers. An empty container is still treated as "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

# Pair each tokenized split with its labels; these are the datasets handed to
# the Trainer as train_dataset / eval_dataset.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [11]:
from transformers import TrainingArguments
from transformers import Trainer

In [12]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback


# Define Trainer parameters
def compute_metrics(p):
    """Compute accuracy and per-class precision/recall/F1 for an evaluation pass.

    Args:
        p: A pair ``(predictions, labels)``. The predicted class is taken as
            the argmax of ``predictions`` along axis 1.

    Returns:
        dict with "accuracy" plus precision, recall and F1 reported separately
        with class 0 as the positive label ("female_*") and with class 1 as
        the positive label ("male_*").
    """
    scores, gold = p
    guess = np.argmax(scores, axis=1)

    def per_class(scorer, class_id):
        # Score the predictions treating `class_id` as the positive class.
        return scorer(y_true=gold, y_pred=guess, pos_label=class_id)

    return {
        "accuracy": accuracy_score(y_true=gold, y_pred=guess),
        "female_precision": per_class(precision_score, 0),
        "female_recall": per_class(recall_score, 0),
        "female_f1": per_class(f1_score, 0),
        "male_precision": per_class(precision_score, 1),
        "male_recall": per_class(recall_score, 1),
        "male_f1": per_class(f1_score, 1),
    }

# Define Trainer
# Early-stopping callback with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Training configuration: a single epoch with batches of 50, evaluating every
# 500 steps, and loading the best model once training finishes.
args = TrainingArguments(
    output_dir="output",
    seed=42,
    num_train_epochs=1,
    per_device_train_batch_size=50,
    per_device_eval_batch_size=50,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
)

# Wire the model, the datasets, the metric function and the callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Fine-tune the pretrained model; as the last expression of the cell, the
# returned TrainOutput is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Female Precision,Female Recall,Female F1,Male Precision,Male Recall,Male F1
500,0.2711,0.152946,0.956603,0.953493,0.948853,0.951167,0.95908,0.962828,0.96095
1000,0.1322,0.113085,0.964197,0.967594,0.951487,0.959473,0.96155,0.974406,0.967935
1500,0.1196,0.111019,0.965468,0.956742,0.966158,0.961427,0.972602,0.964914,0.968743


Step,Training Loss,Validation Loss,Accuracy,Female Precision,Female Recall,Female F1,Male Precision,Male Recall,Male F1
500,0.2711,0.152946,0.956603,0.953493,0.948853,0.951167,0.95908,0.962828,0.96095
1000,0.1322,0.113085,0.964197,0.967594,0.951487,0.959473,0.96155,0.974406,0.967935
1500,0.1196,0.111019,0.965468,0.956742,0.966158,0.961427,0.972602,0.964914,0.968743
2000,0.1022,0.10049,0.969005,0.967671,0.962573,0.965115,0.970066,0.974171,0.972114
2500,0.0925,0.097676,0.968418,0.958278,0.97139,0.964789,0.976766,0.966031,0.971369


TrainOutput(global_step=2864, training_loss=0.13651401770181495, metrics={'train_runtime': 2621.4154, 'train_samples_per_second': 54.619, 'train_steps_per_second': 1.093, 'total_flos': 956521741765200.0, 'train_loss': 0.13651401770181495, 'epoch': 1.0})

In [16]:
# Run a final evaluation pass with the trainer (on its eval_dataset, i.e. the
# validation split) and keep the resulting metrics dict for later cells.
metrics=trainer.evaluate()
print(metrics)

{'eval_loss': 0.09767594933509827, 'eval_accuracy': 0.9684179649305782, 'eval_female_precision': 0.9582776915580901, 'eval_female_recall': 0.9713898949987195, 'eval_female_f1': 0.9647892441860465, 'eval_male_precision': 0.9767656059660694, 'eval_male_recall': 0.9660309717610414, 'eval_male_f1': 0.9713686325493439, 'eval_runtime': 152.6722, 'eval_samples_per_second': 401.933, 'eval_steps_per_second': 8.043, 'epoch': 1.0}


In [15]:
# # Load test data
# test_data = pd.read_csv("test.csv")
# X_test = list(test_data["review"])
# X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# # Create torch dataset
# test_dataset = Dataset(X_test_tokenized)

# # Load trained model
# model_path = "output/checkpoint-50000"
# model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# # Define test trainer
# test_trainer = Trainer(model)

# # Make prediction
# raw_pred, _, _ = test_trainer.predict(test_dataset)

# # Preprocess raw predictions
# y_pred = np.argmax(raw_pred, axis=1)

In [17]:
# Display the evaluation metrics dict via the notebook's output repr.
metrics

{'epoch': 1.0,
 'eval_accuracy': 0.9684179649305782,
 'eval_female_f1': 0.9647892441860465,
 'eval_female_precision': 0.9582776915580901,
 'eval_female_recall': 0.9713898949987195,
 'eval_loss': 0.09767594933509827,
 'eval_male_f1': 0.9713686325493439,
 'eval_male_precision': 0.9767656059660694,
 'eval_male_recall': 0.9660309717610414,
 'eval_runtime': 152.6722,
 'eval_samples_per_second': 401.933,
 'eval_steps_per_second': 8.043}

In [18]:
import os

# Destination for the fine-tuned weights, relative to the project folder.
MODEL_WEIGHTS_PATH = "Models/CBSEData/MurilBERT/SavedModel/MurilBERT"

# Create the target directory first: torch.save does not create missing parent
# directories, and failing here would lose the result of a long training run.
os.makedirs(os.path.dirname(MODEL_WEIGHTS_PATH), exist_ok=True)

# Only the state dict (parameters/buffers) is stored; reloading requires
# rebuilding the same model architecture and calling load_state_dict.
torch.save(model.state_dict(), MODEL_WEIGHTS_PATH)
