In [None]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification,AutoTokenizer
import torch
from datasets import load_dataset,concatenate_datasets,Dataset
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import preprocessing
import argparse
import preprocessing
import pickle
from sklearn.preprocessing import LabelEncoder
from transformers import get_linear_schedule_with_warmup
import random
import re
import numpy as np

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Hub identifier of the base checkpoint; reused below to load the tokenizer.
model_checkpoint = "KBLab/bert-base-swedish-cased"
# NOTE(review): `model` is rebound in a later cell with explicit num_labels/id2label,
# and no cell in between reads this instance — confirm this load is still needed.
model =  AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
# Load a locally saved checkpoint (path is relative to the working directory)
# and move it to the selected device.
model_finetuned = AutoModelForSequenceClassification.from_pretrained("finetuning_hugging_python-finetuned-imdb/checkpoint-920384")
model_finetuned=model_finetuned.to(device)

In [None]:
# Tokenizer matching the base checkpoint; read as a global by tokenize_function().
tokenizer= AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def filter_NaN(subset,example):
    """Tell whether ``example`` has a usable value in column ``subset``.

    Args:
        subset: key (column name) to look up in ``example``.
        example: mapping representing one row.

    Returns:
        ``True`` when ``example[subset]`` is anything other than ``None``.
        Only ``None`` is treated as missing; a float NaN still passes.
    """
    value = example[subset]
    return value is not None


In [None]:
def subset_label(dataset,nb_obs,label_name):
    """Draw a label-balanced random sample of roughly ``nb_obs`` rows.

    Rows are grouped by the value of ``label_name``. Each group contributes at
    most ``nb_obs // number_of_groups`` rows (fewer when the group is smaller),
    so the result may hold fewer than ``nb_obs`` rows.

    Args:
        dataset: object exposing ``to_pandas()`` and ``select()`` (a Hugging
            Face ``Dataset`` in this notebook).
        nb_obs: total number of rows to aim for across all labels.
        label_name: column whose values define the groups.

    Returns:
        A new ``Dataset`` built from the sampled rows, in random order.

    Raises:
        ValueError: if there is no label group, or if ``nb_obs`` is too small
            to give every label at least one row. Previously these cases
            surfaced as a bare ZeroDivisionError / IndexError.
    """
    df = dataset.to_pandas()

    # Group the row positions by label value.
    grouped_data = df.groupby(label_name)
    nb_labels = len(grouped_data.groups)
    if nb_labels == 0:
        raise ValueError(f"no rows to sample: column {label_name!r} has no label group")

    # Per-label quota giving a uniform distribution over the labels.
    samples_per_label = nb_obs // nb_labels
    if samples_per_label <= 0:
        raise ValueError(
            f"nb_obs={nb_obs} is too small to sample from {nb_labels} labels"
        )

    # Randomly pick up to `samples_per_label` rows from every label group.
    sampled_data = []
    for group_data in grouped_data.groups.values():
        group_dataset = dataset.select(group_data)
        group_size = min(len(group_data), samples_per_label)
        label_data = group_dataset.shuffle(seed=np.random.randint(1, 1000)).select(range(group_size))
        sampled_data.extend(label_data)

    # Shuffle so the rows are not ordered by label.
    np.random.shuffle(sampled_data)

    # Rebuild a Hugging Face Dataset from the sampled row dictionaries.
    sampled_dataset = Dataset.from_dict({key: [example[key] for example in sampled_data] for key in sampled_data[0]})

    return sampled_dataset

In [None]:
def subset_random(dataset,nb_ob):
    """Return up to ``nb_ob`` rows of ``dataset`` picked uniformly at random.

    Args:
        dataset: object exposing ``shuffle()``, ``select()`` and ``len()``.
        nb_ob: number of rows requested.

    Returns:
        The result of ``select`` on the shuffled dataset. When ``nb_ob``
        exceeds the dataset size the whole shuffled dataset is returned
        instead of failing on out-of-range indices (same clamping as
        ``subset_label``).
    """
    shuffled = dataset.shuffle()
    sample_size = min(nb_ob, len(shuffled))
    return shuffled.select(range(sample_size))

In [None]:
def tokenize_function(examples):
    """Tokenize the "Note" field of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens. Padding to
    the longest sequence of each ``map`` batch (``padding=True``) produces rows
    of different lengths across batches, which the default ``DataLoader``
    collation used later in this notebook cannot stack. Fixed-length padding
    also matches what ``encode`` does.

    Args:
        examples: batch mapping that contains a "Note" entry with the texts.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["Note"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

In [None]:
def evaluate(model, loader):
    """Evaluate a classification model over ``loader``.

    NOTE(review): a later cell defines another ``evaluate`` (regression
    variant) that replaces this one once it has been executed.

    Args:
        model: model called with input ids, attention mask and labels; its
            output must expose ``loss`` and ``logits``.
        loader: iterable of dict batches holding "input_ids",
            "attention_mask" and "labels" tensors.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the sum (not the mean) of the
        per-batch losses as a float and ``accuracy`` is a 0-d tensor with the
        unweighted mean of the per-batch accuracies.
    """
    loss, accuracy = 0.0, []
    model.eval()
    # Inference only: disable autograd so no graph is kept alive per batch.
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch["input_ids"].to(device)
            input_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            output = model(
                input_ids,
                token_type_ids=None,
                attention_mask=input_mask,
                labels=labels,
            )
            loss += output.loss.item()
            preds_batch = torch.argmax(output.logits, dim=1)
            batch_acc = torch.mean((preds_batch == labels).float())
            # Store a Python float rather than a device tensor.
            accuracy.append(batch_acc.item())

    accuracy = torch.mean(torch.tensor(accuracy))
    return loss, accuracy


In [None]:
def extract_date_from_filename(protocole):
    """Read a year out of a protocol path.

    Looks for the first path segment made only of digits (``/<digits>/``) and
    interprets its first four characters as an integer.

    Args:
        protocole: path string of the protocol file.

    Returns:
        The integer built from at most the first four digits of the segment,
        or ``None`` when the path has no all-digit segment.
    """
    found = re.search(r'/(\d+)/', protocole)
    if found is None:
        return None
    digits = found.group(1)
    return int(digits[:4])

In [None]:
# Load the pickled pandas train/test splits through the "pandas" dataset builder.
data_files = {"train": "swerick_data_party_train.pkl", "test": "swerick_data_party_test.pkl"}
party_dataset = load_dataset("pandas",data_files=data_files)
print(party_dataset)

In [None]:
# Derive a year for every row from its 'protocole' path and store it as a 'date' column.
# extract_date_from_filename returns None for paths without a /digits/ segment.
dates = [extract_date_from_filename(row['protocole']) for row in party_dataset['train']]
dates_test = [extract_date_from_filename(row['protocole']) for row in party_dataset['test']]
party_dataset['train'] = party_dataset['train'].add_column('date', dates)
party_dataset['test'] = party_dataset['test'].add_column('date', dates_test)

# Show the first row of each split as a sanity check.
print(party_dataset["train"][0])
print(party_dataset["test"][0])

In [None]:
# Load the validation split separately; note that this rebinds `data_files`.
data_files = {"valid": "swerick_data_party_valid.pkl"}
party_valid_dataset = load_dataset("pandas",data_files=data_files)
print(party_valid_dataset)

In [None]:
# Same year extraction as for train/test, applied to the validation split.
dates_valid = [extract_date_from_filename(row['protocole']) for row in party_valid_dataset['valid']]
party_valid_dataset['valid'] = party_valid_dataset['valid'].add_column('date', dates_valid)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the year: the scaler is fitted on the train split only and the
# same transform is then applied to the test split.
# NOTE(review): 'date' can be None when extract_date_from_filename finds no
# /digits/ segment — confirm no such rows reach the scaler.
# NOTE(review): the validation split gets no 'date_scaled' column in this cell.
date_scaler = StandardScaler()
# The scaler receives one single-element row per date.
dates_train_2d = [[date] for date in party_dataset['train']["date"]]
dates_test_2d=[[date] for date in party_dataset['test']["date"]]
date_scaler.fit(dates_train_2d)
dates_train =date_scaler.transform(dates_train_2d)
dates_test =date_scaler.transform(dates_test_2d)
print(dates_train)
# squeeze() drops the singleton column dimension before storing the values.
party_dataset['train'] = party_dataset['train'].add_column('date_scaled', dates_train.squeeze())
party_dataset['test'] = party_dataset['test'].add_column('date_scaled', dates_test.squeeze())


In [None]:
# Keep only rows whose "party" value is not None.
party_dataset["train"]=party_dataset["train"].filter(lambda x : filter_NaN("party",x))
party_dataset["test"]=party_dataset["test"].filter(lambda x : filter_NaN("party",x))

In [None]:
# Same None filter on "party" for the validation split.
party_valid_dataset["valid"]=party_valid_dataset["valid"].filter(lambda x : filter_NaN("party",x))

In [None]:
# Fit an integer encoding on the train split and build the id -> name mapping
# that is later passed to the model as id2label.
# NOTE(review): the encoder is fitted on the "gender" column, but later cells
# call label_encoder.transform on "party" values — confirm which column is intended.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(party_dataset["train"]["gender"])
label_names = label_encoder.classes_
label_dict={ i : label_names[i] for i in  range(len(label_names))}
print(label_dict)

In [None]:
# Show the class names in encoder order.
print(label_names)

In [None]:
# Persist the class names so they can be reloaded later.
with open("labels_gender.pkl", "wb") as fp:   
   pickle.dump(label_names, fp)

In [None]:
# Reload class names from disk; this overwrites the in-memory `label_names`.
# NOTE(review): this reads "labels.pkl", not the "labels_gender.pkl" written by
# the previous cell — confirm the file is the intended one.
# pickle.load executes arbitrary code from the file; only load trusted files.
with open("labels.pkl","rb") as f :
    label_names=pickle.load(f)

print(label_names.tolist())

In [None]:
# Add an integer "party_labels" column by encoding each row's "party" value.
# NOTE(review): requires label_encoder to have been fitted on party names;
# the fitting cell above uses the "gender" column — confirm.
party_dataset["train"]=party_dataset["train"].map(lambda example :{"party_labels" : label_encoder.transform([example["party"]])[0]})
party_dataset["test"]=party_dataset["test"].map(lambda example :{"party_labels" : label_encoder.transform([example["party"]])[0]})

In [None]:
# Same "party" -> "party_labels" encoding for the validation split.
party_valid_dataset["valid"]=party_valid_dataset["valid"].map(lambda example :{"party_labels" : label_encoder.transform([example["party"]])[0]})

In [None]:
import train_party_detection

In [None]:
# Label-balanced samples (target of 5000 rows each) of every split, grouped by "party".
party_train_datasets = subset_label(party_dataset["train"],5000,"party")
party_test_datasets = subset_label(party_dataset["test"],5000,"party")
party_valid_datasets = subset_label(party_valid_dataset["valid"],5000,"party")

In [None]:
# Random train subsets of increasing size plus one random test subset.
# subset_random shuffles without a seed, so these samples differ between runs.
# NOTE(review): train_set1..train_set4 are not read by any later cell shown here.
train_set1 = subset_random(party_dataset["train"],100)
train_set2 = subset_random(party_dataset["train"],200)
train_set3 = subset_random(party_dataset["train"],500)
train_set4= subset_random(party_dataset["train"],1000)
test_set = subset_random(party_dataset["test"],10000)


In [None]:
# Peek at the first three rows of the random test subset.
print(test_set[:3])

In [None]:
# Summaries of the three label-balanced subsets.
print(party_train_datasets)
print(party_test_datasets)
print(party_valid_datasets)

In [None]:
# NOTE(review): `train_set` and `valid_set` are not defined by any earlier cell
# shown here (`train_set` is only assigned in a later loop) — confirm their origin.
# NOTE(review): this rebinds `party_dataset` from the train/test mapping to one
# concatenated dataset, while a later cell still indexes party_dataset["train"].
party_dataset = concatenate_datasets([train_set,test_set,valid_set])

In [None]:
# Tokenize each subset in batches with tokenize_function.
# NOTE(review): `train_set` and `valid_set` are not defined by an earlier cell
# shown here — confirm which subsets are meant.
tokenized_train_datasets = train_set.map(tokenize_function,batched=True )
tokenized_test_datasets = test_set.map(tokenize_function,batched=True )
tokenized_valid_datasets = valid_set.map(tokenize_function,batched=True )
tokenized_train_datasets

In [None]:
# Drop the raw text and metadata columns that are no longer needed after tokenization.
tokenized_train_datasets=tokenized_train_datasets.remove_columns(["protocole","id","party","gender","Note"])
tokenized_test_datasets=tokenized_test_datasets.remove_columns(["protocole","id","party","gender","Note"])
tokenized_valid_datasets=tokenized_valid_datasets.remove_columns(["protocole","id","party","gender","Note"])

In [None]:
# Rename the target column to "labels", the key that evaluate() reads from each batch.
tokenized_train_datasets=tokenized_train_datasets.rename_column("party_labels","labels")
tokenized_test_datasets=tokenized_test_datasets.rename_column("party_labels","labels")
tokenized_valid_datasets=tokenized_valid_datasets.rename_column("party_labels","labels")





In [None]:
# Return torch tensors and expose only the three columns that evaluate() reads.
tokenized_train_datasets.set_format(type="torch",columns=["input_ids","labels","attention_mask"])
tokenized_test_datasets.set_format(type="torch",columns=["input_ids","labels","attention_mask"])
tokenized_valid_datasets.set_format(type="torch",columns=["input_ids","labels","attention_mask"])

In [None]:
# Shared DataLoader settings.
batch_size = 64
num_workers=4

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
        tokenized_train_datasets,
        shuffle=True,
        batch_size = batch_size,
        num_workers = num_workers
    )

valid_loader = DataLoader(
        tokenized_valid_datasets,
        shuffle=False,
        batch_size = batch_size,
        num_workers = num_workers
    )

# Not used at the moment; `test_loader` is rebound in a later cell.
test_loader = DataLoader(
        tokenized_test_datasets,
        shuffle=False,
        batch_size = batch_size,
        num_workers = num_workers
    )

In [None]:
n_epochs =10
# Fresh classification head sized to the label set. The model is placed on
# `device` because evaluate() moves every batch there; keeping the model on
# "cpu" fails with a device mismatch as soon as CUDA is available.
model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_dict),
        id2label=label_dict).to(device)

# Only parameters that require gradients are handed to the optimizer.
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
num_training_steps = len(train_loader) * n_epochs
# Warm up over the first 10% of all optimisation steps.
num_warmup_steps = num_training_steps // 10

# Linear warmup followed by linear decay of the learning rate.
scheduler = get_linear_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = num_warmup_steps,
    num_training_steps = num_training_steps
    )

In [None]:
import pandas as pd

# Export the pickled train split to CSV using "content"/"tag" as column names
# for the text ("Note") and the target ("party").
df=pd.read_pickle("swerick_data_party_train.pkl")
df = df.rename(columns={"Note":"content","party" : "tag"})
df.to_csv("swerick_data_party_train.csv")

In [None]:
import pandas as pd
# Reload the date-regression test CSV and check the Python type of its first "tag" value.
df=pd.read_csv("swerick_subsetdata_date_test.csv")
print(type(df["tag"][0]))

In [None]:
# Drop rows without a "tag" and write the result to the party train CSV.
# NOTE(review): when cells run top to bottom, `df` was just read from
# "swerick_subsetdata_date_test.csv", so this overwrites the train CSV written
# two cells earlier with test-subset data — confirm the intended source frame.
df =df.dropna(subset="tag")
df.to_csv("swerick_data_party_train.csv")

In [None]:
# Export the random test subset as a regression CSV: text -> "content",
# standardised year ("date_scaled") -> "tag".
df=test_set.to_pandas()
print(df)
df = df.rename(columns={"Note":"content", "date_scaled" : "tag"})
print(df)
df.to_csv("swerick_subsetdata_date_test.csv")

In [None]:
# Run train_regression.py with no command-line overrides.
!python3 train_regression.py


In [None]:
# Run train_regression.py from a locally fine-tuned checkpoint; the output name
# matches the "trained/regression_date_hugging_face" model loaded further down.
!python3 train_regression.py --base_model "finetuning_hugging_whitespace-finetuned-imdb/checkpoint-801500" --model_filename "trained/regression_date_hugging_face"

In [None]:
# Run compare_models_regression.py on the two saved regression models.
# NOTE(review): --data_path points at the *train* subset CSV — confirm this is intended.
!python3 compare_models_regression.py --model_filename1 "trained/regression_date" --model_filename2 "trained/regression_date_hugging_face" --data_path "swerick_subsetdata_date_train.csv"

In [None]:
from evaluation import regression_year

# Single regression_year run on a checkpoint and a train-subset CSV.
# NOTE(review): regression_year is defined in the project's evaluation module,
# not shown here; its outputs and side effects are not visible from this notebook.
regression_year("finetuning_hugging_whitespace-finetuned-imdb/checkpoint-2919750","swerick_subsetdata_date_train1000.csv")

In [None]:
from evaluation import regression_year
# Create or empty the results file before the repeated runs.
with open("comparison_results.txt", "w") :
        pass
# Ten repetitions, each on a fresh random 1000-row train subset exported to the
# same CSV path (overwritten every iteration).
# NOTE(review): presumably regression_year appends one line per call to
# comparison_results.txt — confirm in the evaluation module.
# NOTE(review): party_dataset must still be the train/test mapping here; an
# earlier cell rebinds it to a concatenated dataset.
for i in range (10):
    train_set = subset_random(party_dataset["train"],1000)
    df=train_set.to_pandas()
    df=df.rename(columns={"Note":"content", "date_scaled" : "tag"})
    df.to_csv("swerick_subsetdata_date_train_robust.csv")
    regression_year("finetuning_hugging_whitespace-finetuned-imdb/checkpoint-3148750","swerick_subsetdata_date_train_robust.csv")


# Parse the results: every line must hold exactly four comma-separated numbers,
# in the order loss model 1, R2 model 1, loss model 2, R2 model 2.
losses_model2, r2_model2 = [], []
losses_model1, r2_model1 = [], []
with open("comparison_results.txt", "r") as file:
    for line in file:
        loss_model1, r2_model1_val, loss_model2, r2_model2_val = line.strip().split(',')
        losses_model1.append(float(loss_model1))
        r2_model1.append(float(r2_model1_val))

        losses_model2.append(float(loss_model2))
        r2_model2.append(float(r2_model2_val))

# Averages over the runs; these divisions raise ZeroDivisionError if the file has no lines.
print(sum(losses_model1)/len(losses_model1))
print(sum(losses_model2)/len(losses_model2))
print(sum(r2_model1)/len(r2_model1))
print(sum(r2_model2)/len(r2_model2))

In [None]:
# Display the per-run losses of model 1 collected by the previous cell.
losses_model1

In [None]:

def r2_score(outputs, labels):
    """Coefficient of determination of a batch of regression predictions.

    Args:
        outputs: object whose ``logits`` attribute holds the predictions;
            singleton dimensions are squeezed away before comparison.
        labels: tensor of target values.

    Returns:
        ``1 - SS_res / SS_tot`` as a tensor. No guard is applied when all
        labels are equal (``SS_tot == 0``).
    """
    preds = outputs.logits.squeeze()
    deviation_from_mean = labels - torch.mean(labels.float())
    residuals = labels - preds
    total_ss = torch.sum(deviation_from_mean ** 2)
    residual_ss = torch.sum(residuals ** 2)
    return 1 - residual_ss / total_ss

In [None]:
def encode(df, tokenizer):
    """Turn a frame of texts and targets into model-ready tensors.

    Each row's 'content' is tokenized on its own, truncated and padded to 512
    tokens, and the per-row results are stacked along the first dimension.

    Args:
        df: DataFrame with a 'content' column (text) and a 'tag' column (target).
        tokenizer: object providing ``encode_plus``.

    Returns:
        ``(input_ids, attention_masks, labels)``: the concatenated
        ``input_ids`` and ``attention_mask`` tensors returned by the tokenizer,
        and a tensor built from the 'tag' column.
    """
    # One tokenizer call per row; each call returns tensors with a leading batch dimension.
    encodings = [
        tokenizer.encode_plus(
            record['content'],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        for _, record in df.iterrows()
    ]

    # Stack the per-row tensors; the attention mask marks padding vs. real tokens.
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    labels = torch.tensor(df['tag'].tolist())

    return input_ids, attention_masks, labels

In [None]:
def evaluate(model, loader):
    """Evaluate a regression model over ``loader``.

    This redefinition replaces the classification ``evaluate`` defined earlier
    in the notebook once this cell has been executed.

    Args:
        model: model called with input ids, attention mask and float labels;
            its output must expose ``loss`` and ``logits``.
        loader: iterable of batches indexable as ``(input_ids,
            attention_mask, labels)``.

    Returns:
        ``(loss, r2)`` where ``loss`` is the sum (not the mean) of the
        per-batch losses as a float and ``r2`` is a 0-d tensor holding the
        unweighted mean of the per-batch R2 scores. That mean is generally not
        the R2 computed over the whole dataset.
    """
    loss, valid_r2 = 0.0, []
    model.eval()
    # Inference only: disable autograd so no graph is kept alive per batch.
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].float().to(device)
            output = model(
                input_ids,
                token_type_ids=None,
                attention_mask=input_mask,
                labels=labels,
            )
            loss += output.loss.item()
            r2 = r2_score(output, labels)
            valid_r2.append(r2.item())

    r2 = torch.mean(torch.tensor(valid_r2))
    return loss, r2


In [None]:
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd 
from torch.profiler import profile, record_function, ProfilerActivity

# Two saved single-output (num_labels=1) models to compare on the same data.
model1 = AutoModelForSequenceClassification.from_pretrained(
    "trained/regression_date",
    num_labels=1,
).to(device)

model2 = AutoModelForSequenceClassification.from_pretrained(
    "trained/regression_date_hugging_face",
    num_labels=1).to(device)

# Reproducible shuffle of the test CSV, then tensor encoding of text and target.
df = pd.read_csv("swerick_subsetdata_date_test.csv")
df = df.sample(frac=1, random_state=123).reset_index(drop=True)
input_ids, attention_masks, labels = encode(df, tokenizer)
dataset = TensorDataset(input_ids, attention_masks, labels)
test_loader = DataLoader(
        dataset,
        shuffle=False,
        batch_size=16,
        num_workers=4
    )

# Requires the regression `evaluate` (tuple batches) to be the active definition.
loss1,r2=evaluate(model1,test_loader)
loss2,r22=evaluate(model2,test_loader)

# evaluate() returns the SUM of the per-batch losses, so the average batch loss
# is that sum divided by the number of batches. The previous extra factor of 16
# (the batch size) inflated the reported loss 16-fold.
# r2 / r22 are already 0-d tensors holding the mean per-batch R2.
print("\nLoss model 1:", loss1 / len(test_loader))
print("\nR2 model1:", r2)


print("\nLoss model 2:", loss2 / len(test_loader))
print("\nR2 model2:", r22)


In [None]:
# Print the ten profiler entries with the highest CUDA memory usage.
# NOTE(review): `prof` is not assigned in any cell shown here (the profiler is
# imported but never entered) — this cell fails on a fresh kernel.
print(prof.key_averages().table(sort_by="cuda_memory_usage", row_limit=10))

In [None]:
# Run train_binary_bert.py on the party train subset.
# NOTE(review): `label_names_str` is not defined in any cell shown here, so
# $label_names_str may not expand to the intended label list — confirm.
!python3 train_binary_bert.py --data_path "swerick_subsetdata_party_train.csv" --label_names $label_names_str 

In [None]:
# Same training script, starting from a locally fine-tuned checkpoint and
# saving under a separate model filename; no --label_names is passed here.
!python3 train_binary_bert.py --model_filename "trained_hugging_face_party_classification" --base_model "finetuning_hugging_whitespace-finetuned-imdb/checkpoint-343500" --data_path "swerick_subsetdata_party_train.csv"