In [22]:
import torch
from pathlib import Path
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd

from transformers import AutoModel, AutoTokenizer


import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder


In [23]:
with open("../data/french_clean.txt") as f:
        text = f.readlines()
nouns = []
labels = []
for x in text:
        n, l = x.split(",")
        nouns.append(n)
        labels.append(l.strip())

labels

['masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masculine',
 'masc

In [24]:
df = pd.read_csv("../data/wiktionary_raw.csv")
df = df.dropna().reset_index()
df

Unnamed: 0,index,noun,gender,lang
0,85,abandonnataire,feminine,fr
1,86,abélite,feminine,fr
2,87,Abkhaze,feminine,fr
3,88,Ablon,feminine,fr
4,89,abolitioniste,feminine,fr
...,...,...,...,...
498124,517171,zythophile,masculine,fr
498125,517172,zythum,masculine,fr
498126,517173,zyzel,masculine,fr
498127,517174,zāy,masculine,fr


In [25]:
french_df = df[df['lang'] == 'fr']
french_df = french_df.drop(columns=['index'])
french_df

Unnamed: 0,noun,gender,lang
0,abandonnataire,feminine,fr
1,abélite,feminine,fr
2,Abkhaze,feminine,fr
3,Ablon,feminine,fr
4,abolitioniste,feminine,fr
...,...,...,...
498124,zythophile,masculine,fr
498125,zythum,masculine,fr
498126,zyzel,masculine,fr
498127,zāy,masculine,fr


In [26]:
X = french_df['noun'].to_list()
y = french_df['gender'].to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y)

print(len(X))
print(len(y))

82532
82532


In [27]:
class NounDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_length):
        self.X = X
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.le = LabelEncoder()
        self.y = self.le.fit_transform(y)


    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = self.X[idx]
        label = torch.tensor(self.y[idx])

        encoded_text = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return  {
        'input_ids': encoded_text['input_ids'].squeeze(),
        'attention_mask': encoded_text['attention_mask'].squeeze(),
        'label': label
    }

In [35]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(y_train)))
# Example usage
# Assuming you have X_train, y_train, X_test, y_test defined

# Load pre-trained BERT model and tokenizer



# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()
# Create datasets and loaders
max_length = 32  # Adjust based on your preference
train_dataset = NounDataset(X_train, y_train, tokenizer, max_length)
test_dataset = NounDataset(X_test, y_test, tokenizer, max_length)


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class GenderBert(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, epochs, device, model):
        super(GenderBert, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tagset_size = tagset_size
        self.epochs = epochs
        self.device = device
        self.bert = model.bert

        

    def train(self, train_loader):
        

        num_epochs = self.epochs
        model.to(self.device)
        for epoch in range(num_epochs):
            # model.train()
            for batch in train_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].squeeze().long().to(self.device)  # Ensure labels are of type long
                optimizer.zero_grad()

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

            print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')
        

    def predict(self, test_loader):
        # Evaluation
        # model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                _, predicted = torch.max(logits, 1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        print(f'Test Accuracy: {accuracy * 100:.2f}%')
        return accuracy



In [10]:
# Set your desired values for these parameters
embedding_dim = 768  # Adjust based on your model architecture
hidden_dim = 256  # Adjust based on your model architecture
vocab_size = 32000  # Adjust based on your dataset vocabulary size
tagset_size = 2  # Assuming binary classification (e.g., male/female)
epochs = 5  # Adjust based on your training preferences
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use GPU if available

folder_path = 'saved_models'
file_name = 'GenderBert_params.pth'

file_path = Path(folder_path) / file_name


    # Initialize model
if file_path.exists():
    gender_clf = GenderBert(
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        vocab_size=vocab_size,
        tagset_size=tagset_size,
        epochs=epochs,
        model=model,
        device=device
        )

    gender_clf.load_state_dict(torch.load(file_path))
    print(f"Model loaded successfully from {file_path}.")
    result = gender_clf.predict(test_loader)
else:
    print('does not exist')
    gender_clf = GenderBert(embedding_dim, hidden_dim, vocab_size, tagset_size, epochs, device, model)
    torch.save(gender_clf.state_dict(), 'saved_models/GenderBert_params.pth')

does not exist


In [11]:
gender_clf.train(train_loader)

Epoch 1/5, Loss: 0.20045024156570435
Epoch 2/5, Loss: 0.2990168333053589
Epoch 3/5, Loss: 0.1385294646024704
Epoch 4/5, Loss: 0.061162617057561874
Epoch 5/5, Loss: 0.05526687949895859


In [None]:
gender_clf.predict(test_loader)

Test Accuracy: 86.24%


0.8624048853777928

In [18]:
args = ['cat']
src_langs = "-".join([lang for lang in args])
src_langs

'cat'

In [43]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

class GenderBert2(nn.Module):
    def __init__(self, num_labels=None):
        super(GenderBert2, self).__init__()

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    def train_model(self, train_loader, device, num_epochs=3):
        self.model.to(device)  # Move model to the desired device
        self.model.train()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)
        criterion = torch.nn.CrossEntropyLoss()

        for epoch in range(num_epochs):
            for batch in train_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].squeeze().long().to(device)
                optimizer.zero_grad()

                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

            print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

    def evaluate(self, data_loader, device, mode='test'):
        self.model.to(device)  # Move model to the desired device
        self.model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                _, predicted = torch.max(logits, 1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        print(f'{mode.capitalize()} Accuracy: {accuracy * 100:.2f}%')
        return accuracy


In [44]:
clf = GenderBert2(len(y_train))
clf.train_model(train_loader, device='cuda', num_epochs=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Loss: 0.5338936448097229
Epoch 2/6, Loss: 0.18931609392166138
Epoch 3/6, Loss: 0.14981752634048462
Epoch 4/6, Loss: 0.09604091942310333
Epoch 5/6, Loss: 0.03908614441752434
Epoch 6/6, Loss: 0.47648361325263977


In [45]:
clf.evaluate(test_loader, device='cuda')

Test Accuracy: 86.21%


0.862114089080599

In [46]:
torch.save({
    'model_state_dict': clf.model.state_dict(),
    'tokenizer': clf.tokenizer
}, '../saved_models/bert_fr_to_fr.pth')