In [None]:
import torch
import tqdm
from tqdm import tqdm
import pandas as pd
import pickle
import numpy as np
from transformers import CamembertTokenizer, CamembertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Execute the preprocessing notebook from this notebook via the %run magic.
# NOTE(review): presumably this is what produces 'tweets_prepro.csv', which is
# read in the next cell — confirm against data_preprocessing.ipynb.
%run data_preprocessing.ipynb

In [None]:
# Load the preprocessed tweets. Later cells read the 'tweet_id' and
# 'text_clean' columns from this frame and drop an 'Unnamed: 0' column.
df = pd.read_csv('tweets_prepro.csv')

In [None]:
### EXAMPLE OF EMBEDDING (just to give an idea of how it works)
"""
import torch

# Step 1: Tokenize and encode in one go
encoded_sentence = tokenizer.encode("J'aime le camembert !", return_tensors='pt')

# encoded_sentence is now a tensor of shape (1, sequence_length)

# Step 2: Pass through CamemBERT
outputs = camembert(encoded_sentence)

# Step 3: Extract token embeddings
embeddings = outputs.last_hidden_state  # shape: (1, seq_len, hidden_dim)


#sentence_embedding = embeddings.mean(dim=1)  # average over tokens

embeddings.shape
"""

In [None]:
# The annotation file is tab-separated and has no header row, so the two
# column names are supplied explicitly.
annotated_corpus = pd.read_csv(
    'corpus_SexistContent.csv',
    sep='\t',
    header=None,
    names=['tweet_id', 'label'],
)

# Inner join on tweet_id: only tweets that have an annotation are kept.
df_whole = df.merge(annotated_corpus, on='tweet_id')

In [None]:
# Load model and tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base").to(device)
model.eval()  # Put model in evaluation mode

# Maps each cleaned sentence to its per-token embeddings and its label.
# NOTE(review): the key is the sentence text, so two rows with the same
# 'text_clean' overwrite each other — confirm this is intended.
embeddings_dict = {}

# iterrows() is a generator with no length, so pass `total` explicitly to let
# the progress bar show a percentage and an ETA.
for _, row in tqdm(df_whole.iterrows(), total=len(df_whole)):
    sentence = row['text_clean']
    label = row['label']

    # Tokenize one sentence at a time (no padding needed for a batch of one)
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=False)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to the model's device

    with torch.no_grad():
        outputs = model(**inputs)
        # Drop the batch dimension and move to CPU before converting to numpy
        token_embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()

    # Save to dictionary
    embeddings_dict[sentence] = {
        "embeddings": token_embeddings,  # shape: (num_tokens, 768)
        "label": label
    }

# Persist everything to a single pickle file
with open("labeled_token_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_dict, f)


In [None]:
# Drop the leftover 'Unnamed: 0' column if it is present. Rebinding the name
# (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df_whole = df_whole.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Display the merged, labelled frame as the cell output for a quick visual check.
df_whole

In [None]:
# Hold out 20% of the labelled tweets for evaluation. The split is stratified
# on the label and uses a fixed seed so it is repeatable.
split_kwargs = dict(test_size=0.2, stratify=df_whole['label'], random_state=42)
train_df, test_df = train_test_split(df_whole[['text_clean', 'label']], **split_kwargs)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one row of a DataFrame per item.

    Args:
        df: frame providing a "text_clean" column and a "label" column.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result is indexable by "input_ids" and "attention_mask".
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Renumber rows 0..n-1 so the integer passed to __getitem__ is a valid label.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return a dict with "input_ids", "attention_mask" and a float "label"."""
        sample_text = self.df.loc[idx, "text_clean"]
        sample_label = self.df.loc[idx, "label"]

        encoded = self.tokenizer(
            sample_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading batch axis of size 1; remove it
        # so the DataLoader can stack items into a batch itself.
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(sample_label, dtype=torch.float)
        return item


In [None]:
# Mini-batch size, as in the article.
batch_size = 64

# Wrap each split in a tokenizing dataset.
train_dataset, test_dataset = (
    TextDataset(split_df, tokenizer) for split_df in (train_df, test_df)
)

# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
class CamembertLSTMClassifier(nn.Module):
    """Binary classifier: frozen CamemBERT -> BiLSTM -> ReLU -> BiLSTM -> linear head.

    Args:
        hidden_dim: hidden size of each LSTM direction.
        lstm_layers: number of stacked layers inside each of the two LSTM blocks.

    ``forward`` returns one raw logit per example (shape ``[batch_size]``);
    apply a sigmoid to obtain a probability.
    """

    def __init__(self, hidden_dim=128, lstm_layers=1):
        super().__init__()
        self.backbone = CamembertModel.from_pretrained("camembert-base")

        # Freeze CamemBERT: it is used purely as a feature extractor.
        for param in self.backbone.parameters():
            param.requires_grad = False
        self.backbone.eval()

        # Both LSTMs are bidirectional, so each emits hidden_dim * 2 features.
        self.lstm1 = nn.LSTM(input_size=768, hidden_size=hidden_dim, num_layers=lstm_layers,
                             batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.lstm2 = nn.LSTM(input_size=hidden_dim * 2, hidden_size=hidden_dim, num_layers=lstm_layers,
                             batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, 1)

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen backbone in eval mode.

        Without this, ``model.train()`` would put the backbone in training mode
        as well, even though its weights are never updated.
        """
        super().train(mode)
        self.backbone.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``[batch_size]`` for a batch of token ids."""
        # No gradients are needed through the frozen encoder.
        with torch.no_grad():
            outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)

        sequence_output = outputs.last_hidden_state

        # LSTM layers
        lstm_out1, _ = self.lstm1(sequence_output)  # [batch_size, seq_len, hidden_dim * 2]
        relu_out = self.relu(lstm_out1)
        lstm_out2, _ = self.lstm2(relu_out)  # [batch_size, seq_len, hidden_dim * 2]

        # Use the LSTM output at the first sequence position as the sentence summary.
        # NOTE(review): attention_mask is not applied to the LSTMs, so padded
        # positions are processed too — confirm this is acceptable.
        cls_token_out = lstm_out2[:, 0, :]

        logits = self.classifier(cls_token_out)  # [batch_size, 1]
        return logits.squeeze(1)  # [batch_size]


In [None]:
model = CamembertLSTMClassifier().to(device)

# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself,
# so an explicit sigmoid is only needed at evaluation time.
criterion = nn.BCEWithLogitsLoss()

# Optimize every parameter that is still trainable (both LSTMs and the linear
# head). Passing only model.classifier.parameters() would leave the LSTM
# weights at their random initialization for the whole run.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=1e-4)

In [None]:
num_epochs = 10

for epoch in tqdm(range(num_epochs)):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # The model already returns shape [batch]. An extra .squeeze() here
        # would turn a final batch of size 1 into a 0-d tensor and make the
        # loss fail on a shape mismatch with `labels`.
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # ---- Evaluation pass on the held-out split ----
    model.eval()
    all_preds = []
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Keep the [batch] shape (see above) so .extend() always receives
            # a 1-d array, even for a batch of size 1.
            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits)

            all_probs.extend(probs.cpu().numpy())
            all_preds.extend((probs > 0.5).int().cpu().numpy())  # threshold at 0.5
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)  # AUC uses probabilities, not hard predictions

    # Loss is the mean over training batches for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs} — Loss: {total_loss/len(train_loader):.4f} | "
          f"Accuracy: {accuracy:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")
