In [1]:
import torch
import tqdm
from tqdm import tqdm
import pandas as pd
import pickle
import numpy as np
from transformers import CamembertTokenizer, CamembertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
%run data_preprocessing.ipynb

Downloading...
From: https://drive.google.com/uc?id=16rfiy-WrqBVBsrmE5VZk-Czk10wMAAmF
To: /home/onyxia/work/sexism_tweets/tweets.csv
100%|██████████| 1.39M/1.39M [00:00<00:00, 44.4MB/s]


In [4]:
df = pd.read_csv('tweets_prepro.csv')

In [None]:
### EXAMPLE OF EMBEDDING (c est pour avoir l idee de comment ça marche)
"""
import torch

# Step 1: Tokenize and encode in one go
encoded_sentence = tokenizer.encode('J'aime le camembert !', return_tensors='pt')

# encoded_sentence is now a tensor of shape (1, sequence_length)

# Step 2: Pass through CamemBERT
outputs = camembert(encoded_sentence)

# Step 3: Extract token embeddings
embeddings = outputs.last_hidden_state  # shape: (1, seq_len, hidden_dim)


#sentence_embedding = embeddings.mean(dim=1)  # average over tokens

embeddings.shape
"""

In [5]:
annotated_corpus = pd.read_csv('corpus_SexistContent.csv', sep='\t', header=None, names=['tweet_id', 'label'])
df_whole = pd.merge(df, annotated_corpus, on = 'tweet_id')

In [None]:
# Load model and tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base").to(device)
model.eval()  # Put model in evaluation mode

# Dictionary to store everything
embeddings_dict = {}

# Process each row
for _, row in tqdm(df_whole.iterrows()):
    sentence = row['text_clean']
    label = row['label']
    
    # Tokenize and encode sentence
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=False)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to GPU
    
    with torch.no_grad():
        outputs = model(**inputs)
        # Extract per-token embeddings and move to CPU before converting to numpy
        token_embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()
    
    # Save to dictionary
    embeddings_dict[sentence] = {
        "embeddings": token_embeddings,  # shape: (num_tokens, 768)
        "label": label
    }

# Save to .pkl
with open("labeled_token_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_dict, f)


In [6]:
df_whole.drop('Unnamed: 0', axis=1, inplace=True)

In [7]:
df_whole

Unnamed: 0,tweet_id,text,text_clean,label
0,326796299179548672,@MamzelleMNa Une très humble femme! #Ironie,mamzellemna une très humble femme ironie,0
1,334548844731826176,"BLOGUE - «Tsé, la parité homme-femme...» au se...",blogue tsé la parité hommefemme au sein de vi...,0
2,334424362033762304,Je suis une femme matérialiste et superficiell...,je suis une femme matérialiste et superficiell...,0
3,421708259716636672,Mise en ligne de mon article sur @PayeTaShnek ...,mise en ligne de mon article sur payetashnek p...,0
4,420942263154249728,Achat du jour : le très bon livre #PayeTaShnek...,achat du jour le très bon livre payetashnek ...,0
...,...,...,...,...
7026,990112883215360000,#SégolèneRoyal a participé activement au décli...,ségolèneroyal a participé activement au déclin...,1
7027,991730416158568449,"Depuis l'affaire DSK, les féministes ne veulen...",depuis laffaire dsk les féministes ne veulent ...,1
7028,991744626984980480,"Analogie. C’est comme dire à une femme, vous ê...",analogie cest comme dire à une femme vous êtes...,1
7029,991769637506486274,"Si t'as une bite a la place du coeur, t'étonne...",si tas une bite a la place du coeur tétonne pa...,1


In [11]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base").to(device)

In [12]:
train_df, test_df = train_test_split(
    df_whole[['text_clean', 'label']],
    test_size=0.2,
    stratify=df_whole['label'],
    random_state=42
)

In [13]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, "text_clean"]
        label = self.df.loc[idx, "label"]
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float)
        }


In [22]:
batch_size = 128 #as in the article

train_dataset = TextDataset(train_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [31]:
import torch.nn as nn
from transformers import CamembertModel

class CamembertCNNLSTMClassifier(nn.Module):
    def __init__(self, conv_out_dim=256, hidden_dim=128):
        super().__init__()
        self.backbone = CamembertModel.from_pretrained("camembert-base")
        
        # Freeze CamemBERT
        for param in self.backbone.parameters():
            param.requires_grad = False

        # Conv1D: in_channels=768 (Camembert output dim), out_channels=conv_out_dim
        self.conv1d = nn.Conv1d(in_channels=768, out_channels=conv_out_dim, kernel_size=3, padding=1)
        self.relu_conv = nn.ReLU()

        # Bidirectional LSTM
        self.lstm1 = nn.LSTM(input_size=conv_out_dim, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.lstm2 = nn.LSTM(input_size=hidden_dim * 2, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)

        # Classifier
        self.classifier = nn.Linear(hidden_dim * 2, 1)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        
        sequence_output = outputs.last_hidden_state  # shape: [batch, seq_len, 768]

        # Conv1D expects: [batch, channels, seq_len]
        x = sequence_output.permute(0, 2, 1)  # → [batch, 768, seq_len]
        x = self.conv1d(x)                   # → [batch, conv_out_dim, seq_len]
        x = self.relu_conv(x)
        x = x.permute(0, 2, 1)               # → [batch, seq_len, conv_out_dim]

        # LSTM layers
        lstm_out1, _ = self.lstm1(x)
        relu_out = self.relu(lstm_out1)
        lstm_out2, _ = self.lstm2(relu_out)

        cls_token_out = lstm_out2[:, 0, :]  # Get the [CLS] token output

        logits = self.classifier(cls_token_out)
        return logits.squeeze(1)


In [32]:
num_pos = (df_whole['label'] == 1).sum()
num_neg = (df_whole['label'] == 0).sum()
# Calculate the ratio of class imbalance
pos_weight = torch.tensor([num_neg / num_pos]).to(device)

# Define the weighted loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [33]:
model = CamembertCNNLSTMClassifier().to(device)

# Freeze all CamemBERT layers
for param in model.backbone.parameters():
    param.requires_grad = False

# Unfreeze ONLY the last transformer layer (layer 11, 0-indexed)
for param in model.backbone.encoder.layer[-1].parameters():
    param.requires_grad = True
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4
)


In [34]:
num_epochs = 25

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask).squeeze()
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    
    model.eval()
    all_preds = []
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask).squeeze()
            probs = torch.sigmoid(logits)

            all_probs.extend(probs.cpu().numpy())
            all_preds.extend((probs > 0.5).int().cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)

    print(f"Epoch {epoch+1}/{num_epochs} — Loss: {total_loss/len(train_loader):.4f} | "
          f"Accuracy: {accuracy:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")


  4%|▍         | 1/25 [01:43<41:34, 103.95s/it]

Epoch 1/25 — Loss: 0.9447 | Accuracy: 0.6944 | F1: 0.5594 | AUC: 0.7584


  8%|▊         | 2/25 [03:29<40:13, 104.92s/it]

Epoch 2/25 — Loss: 0.8204 | Accuracy: 0.7328 | F1: 0.6453 | AUC: 0.8122


 12%|█▏        | 3/25 [05:15<38:41, 105.52s/it]

Epoch 3/25 — Loss: 0.7204 | Accuracy: 0.7399 | F1: 0.6642 | AUC: 0.8290


 16%|█▌        | 4/25 [07:02<37:08, 106.10s/it]

Epoch 4/25 — Loss: 0.6872 | Accuracy: 0.7697 | F1: 0.6817 | AUC: 0.8368


 20%|██        | 5/25 [08:49<35:29, 106.49s/it]

Epoch 5/25 — Loss: 0.6691 | Accuracy: 0.7719 | F1: 0.6453 | AUC: 0.8416


 24%|██▍       | 6/25 [10:39<34:03, 107.53s/it]

Epoch 6/25 — Loss: 0.6607 | Accuracy: 0.7598 | F1: 0.6859 | AUC: 0.8423


 28%|██▊       | 7/25 [12:28<32:22, 107.91s/it]

Epoch 7/25 — Loss: 0.6408 | Accuracy: 0.7434 | F1: 0.6802 | AUC: 0.8451


 32%|███▏      | 8/25 [14:15<30:28, 107.56s/it]

Epoch 8/25 — Loss: 0.6239 | Accuracy: 0.7484 | F1: 0.6856 | AUC: 0.8453


 36%|███▌      | 9/25 [16:00<28:32, 107.03s/it]

Epoch 9/25 — Loss: 0.6213 | Accuracy: 0.7733 | F1: 0.6894 | AUC: 0.8487


 40%|████      | 10/25 [17:46<26:38, 106.60s/it]

Epoch 10/25 — Loss: 0.6119 | Accuracy: 0.7598 | F1: 0.6927 | AUC: 0.8506


 44%|████▍     | 11/25 [19:32<24:47, 106.26s/it]

Epoch 11/25 — Loss: 0.6165 | Accuracy: 0.7918 | F1: 0.6880 | AUC: 0.8512


 48%|████▊     | 12/25 [21:17<22:57, 105.94s/it]

Epoch 12/25 — Loss: 0.5878 | Accuracy: 0.7832 | F1: 0.6935 | AUC: 0.8538


 52%|█████▏    | 13/25 [23:02<21:08, 105.67s/it]

Epoch 13/25 — Loss: 0.5873 | Accuracy: 0.7761 | F1: 0.6974 | AUC: 0.8540


 56%|█████▌    | 14/25 [24:47<19:20, 105.51s/it]

Epoch 14/25 — Loss: 0.5919 | Accuracy: 0.7669 | F1: 0.6882 | AUC: 0.8537


 60%|██████    | 15/25 [26:32<17:33, 105.37s/it]

Epoch 15/25 — Loss: 0.5797 | Accuracy: 0.7946 | F1: 0.6869 | AUC: 0.8566


 64%|██████▍   | 16/25 [28:17<15:47, 105.23s/it]

Epoch 16/25 — Loss: 0.5682 | Accuracy: 0.7790 | F1: 0.6918 | AUC: 0.8566


 68%|██████▊   | 17/25 [30:02<14:01, 105.14s/it]

Epoch 17/25 — Loss: 0.6025 | Accuracy: 0.7846 | F1: 0.7015 | AUC: 0.8548


 72%|███████▏  | 18/25 [31:47<12:15, 105.10s/it]

Epoch 18/25 — Loss: 0.5579 | Accuracy: 0.7854 | F1: 0.7085 | AUC: 0.8598


 76%|███████▌  | 19/25 [33:32<10:30, 105.05s/it]

Epoch 19/25 — Loss: 0.5527 | Accuracy: 0.7925 | F1: 0.6812 | AUC: 0.8572


 80%|████████  | 20/25 [35:17<08:45, 105.02s/it]

Epoch 20/25 — Loss: 0.5453 | Accuracy: 0.7910 | F1: 0.7072 | AUC: 0.8626


 84%|████████▍ | 21/25 [37:02<07:00, 105.03s/it]

Epoch 21/25 — Loss: 0.5388 | Accuracy: 0.7910 | F1: 0.7024 | AUC: 0.8632


 88%|████████▊ | 22/25 [38:47<05:15, 105.02s/it]

Epoch 22/25 — Loss: 0.5329 | Accuracy: 0.7960 | F1: 0.6850 | AUC: 0.8600


 92%|█████████▏| 23/25 [40:32<03:30, 105.06s/it]

Epoch 23/25 — Loss: 0.5205 | Accuracy: 0.7882 | F1: 0.6996 | AUC: 0.8598


 96%|█████████▌| 24/25 [42:17<01:45, 105.09s/it]

Epoch 24/25 — Loss: 0.5273 | Accuracy: 0.7441 | F1: 0.6907 | AUC: 0.8586


100%|██████████| 25/25 [44:02<00:00, 105.70s/it]

Epoch 25/25 — Loss: 0.5200 | Accuracy: 0.7918 | F1: 0.6989 | AUC: 0.8580





In [38]:
torch.save(model.state_dict(), "camembert_cnn_lstm_weights.pth")

In [41]:
def predict_label(text, model, tokenizer, device, threshold=0.5):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])
    prob = torch.sigmoid(logits).item()
    prediction = 1 if prob >= threshold else 0
    return prediction, prob

In [46]:
text = "la discrimination contre les femmes est un problème"
label, probability = predict_label(text, model, tokenizer, device)

print(f"Predicted label: {label} (probability: {probability:.4f})")

Predicted label: 1 (probability: 0.7639)


In [52]:
#we can notice that even if we tried to avoid false positives, there is still a slight problem