# Bert model

In [2]:
!pip install tensorflow
!pip install transformers
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
import pandas as pd
import torch
from transformers import CamembertTokenizer, CamembertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

In [None]:
# Load training data
training_data = 'https://raw.githubusercontent.com/thebrisly/UNIL_Geneva_DSML/main/data/training_data.csv'
train_df = pd.read_csv(training_data, encoding='utf-8')
train_df['difficulty'] = train_df['difficulty'].replace(['A1', 'A2', 'B1', 'B2', 'C1', 'C2'], [0, 1, 2, 3, 4, 5])

# Load test data
test_data = 'https://raw.githubusercontent.com/thebrisly/UNIL_Geneva_DSML/main/data/unlabelled_test_data.csv'
test_df = pd.read_csv(test_data)

# Tokenization and padding
tokenizer = CamembertTokenizer.from_pretrained('camembert/camembert-large', do_lower_case=True)

MAX_LEN = 64
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def tokenize_sentences(sentences, max_len=64):
    input_ids = []
    attention_masks = []

    for sent in tqdm(sentences, desc="Tokenizing sentences"):
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Tokenize training data
train_input_ids, train_attention_masks = tokenize_sentences(train_df['sentence'], max_len=MAX_LEN)
labels = torch.tensor(train_df['difficulty'].values)

# Split the data
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(
    train_input_ids, labels, train_attention_masks, random_state=42, test_size=0.1
)

# Define DataLoader for training and validation data
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Model configuration
model = CamembertForSequenceClassification.from_pretrained("camembert/camembert-large", num_labels=6)

# Training parameters
epochs = 8
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1} training"):
        input_ids, attention_mask, label = batch
        input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0

    for batch in tqdm(validation_dataloader, desc=f"Epoch {epoch + 1} validation"):
        input_ids, attention_mask, label = batch
        input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=label)

        loss = outputs.loss
        val_loss += loss.item()

    avg_val_loss = val_loss / len(validation_dataloader)
    print(f"Epoch {epoch + 1}: Avg Training Loss={avg_train_loss}, Avg Validation Loss={avg_val_loss}")

# Apply the model to the unlabelled test data
test_input_ids, test_attention_masks = tokenize_sentences(test_df['sentence'], max_len=MAX_LEN)
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
predictions = []

for batch in tqdm(test_dataloader, desc="Predicting on test data"):
    input_ids, attention_mask = batch
    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    preds = torch.argmax(logits, dim=1).cpu().numpy()
    predictions.extend(preds)

sentencepiece.bpe.model:   0%|          | 0.00/809k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

Tokenizing sentences:   0%|          | 0/4800 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Tokenizing sentences: 100%|██████████| 4800/4800 [00:03<00:00, 1548.32it/s]


pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 training: 100%|██████████| 270/270 [02:50<00:00,  1.58it/s]
Epoch 1 validation: 100%|██████████| 30/30 [00:05<00:00,  5.28it/s]


Epoch 1: Avg Training Loss=1.253029015329149, Avg Validation Loss=1.1933197279771168


Epoch 2 training: 100%|██████████| 270/270 [02:50<00:00,  1.58it/s]
Epoch 2 validation: 100%|██████████| 30/30 [00:05<00:00,  5.23it/s]


Epoch 2: Avg Training Loss=0.8845867201134011, Avg Validation Loss=1.0120660523573557


Epoch 3 training: 100%|██████████| 270/270 [02:50<00:00,  1.58it/s]
Epoch 3 validation: 100%|██████████| 30/30 [00:05<00:00,  5.22it/s]


Epoch 3: Avg Training Loss=0.631242119382929, Avg Validation Loss=1.178465978304545


Epoch 4 training: 100%|██████████| 270/270 [02:50<00:00,  1.58it/s]
Epoch 4 validation: 100%|██████████| 30/30 [00:05<00:00,  5.22it/s]


Epoch 4: Avg Training Loss=0.41078822764533535, Avg Validation Loss=1.2706415196259817


Epoch 5 training: 100%|██████████| 270/270 [02:50<00:00,  1.58it/s]
Epoch 5 validation: 100%|██████████| 30/30 [00:05<00:00,  5.23it/s]


Epoch 5: Avg Training Loss=0.24889355010732456, Avg Validation Loss=1.29386611978213


Epoch 6 training:  17%|█▋        | 47/270 [00:29<02:21,  1.58it/s]

In [None]:
model.save_pretrained('./geneva_model')

In [None]:
# Convert predictions to difficulty levels
difficulty_levels = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}
predicted_difficulties = [difficulty_levels[p] for p in predictions]

# Create a submission dataframe
submission_df = pd.DataFrame({'id': test_df['id'], 'difficulty': predicted_difficulties})

# Save the submission dataframe to a CSV file
submission_df.to_csv('final_submission.csv', index=False)