In [1]:
%%capture
!pip install transformers[sentencepiece]
import transformers

Processing the data.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/BTP/

/content/drive/MyDrive/BTP


In [4]:
# Step 1: Have a df with Gender and Name.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import sys  
sys.path.insert(0, 'PreProcessing/')

In [6]:
cb4 = pd.read_csv('CBSEData/2014_complete_data.csv', dtype=object)
cb5 = pd.read_csv('CBSEData/2015_complete_data.csv', dtype=object)

In [7]:
def preprocessData(data_df):
    name_df = pd.DataFrame()
    # Ignore these for now, they aren't parsed correctly
    father_df = pd.DataFrame()
    mother_df = pd.DataFrame()
    father_df['Name'] = data_df['father_name'].dropna()
    father_df['Gender'] = 'MALE'
    mother_df['Name'] = data_df['mother_name'].dropna()
    mother_df['Gender'] = 'FEMALE'
    name_df = name_df.append(father_df, ignore_index = True)
    name_df = name_df.append(mother_df, ignore_index = True)
    
    # for now drop the rows that have 
    name_df = name_df.drop_duplicates()
    # considering only 2 genders for now
    name_df = name_df.loc[name_df['Gender'].isin({'MALE','FEMALE'})]
    name_df = name_df.reset_index(drop=True)
    return name_df


c14 = preprocessData(cb4)
c15 = preprocessData(cb5)

In [8]:
gender_df = pd.concat([c14, c15])

In [9]:
gender_df['label'] = 1
labels = []
for elem in gender_df['Gender']:
  if elem == 'MALE':
    labels.append(0)
  else:
    labels.append(1)

gender_df['label'] = labels
gender_df['Name'] = [str(name).strip()  for name in gender_df['Name']]
gender_df['Name'] = [name.lower() for name in gender_df['Name']]

In [10]:
gender_df['male'] = gender_df['Name'].map(gender_df[gender_df['label']==0]['Name'].value_counts())

In [11]:
gender_df['female'] = gender_df['Name'].map(gender_df[gender_df['label']==1]['Name'].value_counts())

In [12]:
# Drop duplicates and update labels
gender_df = gender_df.drop_duplicates(subset='Name', keep='last')

In [13]:
gender_df.shape

(204543, 5)

In [14]:
gender_df = gender_df.reset_index(drop=True)

In [15]:
labels = []
for i, name in enumerate(gender_df['Name']):
  if np.isnan(gender_df['male'][i]) or gender_df['female'][i] > gender_df['male'][i]:
    # labels.append(1)
    gender_df.at[i,'new_labels'] = 1
  else:
    # labels.append(0)
    gender_df.at[i,'new_labels'] = 0

In [16]:
gender_df = gender_df.sort_values(by='Name')

In [17]:
final_df = pd.DataFrame()
final_df['name'] = gender_df['Name']
final_df['labels'] = gender_df['new_labels']

In [18]:
final_df.to_csv("temp/gender_df.csv")

In [19]:
%%capture
!pip install datasets

In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)


from transformers import AutoConfig
config = AutoConfig.from_pretrained("google/muril-base-cased")

Downloading:   0%|          | 0.00/181 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/113 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/909M [00:00<?, ?B/s]

In [21]:
from sklearn.model_selection import train_test_split

X = list(gender_df["Name"])
y = list(gender_df["label"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True)

In [22]:
final_df.groupby('labels').count()

Unnamed: 0_level_0,name
labels,Unnamed: 1_level_1
0.0,113900
1.0,90643


In [23]:
import torch

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [24]:
from transformers import TrainingArguments
from transformers import Trainer

In [25]:
print("1")

1


In [26]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback


# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=50,
    per_device_eval_batch_size=50,
    num_train_epochs=1,
    seed=42,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()

Step,Training Loss,Validation Loss


In [27]:
metrics=trainer.evaluate()
print(metrics)

{'eval_loss': 0.09649517387151718, 'eval_accuracy': 0.9665922461418118, 'eval_precision': 0.9580500089944235, 'eval_recall': 0.9678697343074183, 'eval_f1': 0.9629348376365082, 'eval_runtime': 109.2832, 'eval_samples_per_second': 561.505, 'eval_steps_per_second': 11.237, 'epoch': 1.0}


In [28]:
metrics

{'epoch': 1.0,
 'eval_accuracy': 0.9665922461418118,
 'eval_f1': 0.9629348376365082,
 'eval_loss': 0.09649517387151718,
 'eval_precision': 0.9580500089944235,
 'eval_recall': 0.9678697343074183,
 'eval_runtime': 109.2832,
 'eval_samples_per_second': 561.505,
 'eval_steps_per_second': 11.237}

In [None]:
# # Load test data
# test_data = pd.read_csv("test.csv")
# X_test = list(test_data["review"])
# X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# # Create torch dataset
# test_dataset = Dataset(X_test_tokenized)

# # Load trained model
# model_path = "output/checkpoint-50000"
# model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# # Define test trainer
# test_trainer = Trainer(model)

# # Make prediction
# raw_pred, _, _ = test_trainer.predict(test_dataset)

# # Preprocess raw predictions
# y_pred = np.argmax(raw_pred, axis=1)