In [1]:
import os

import random
import numpy as np

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import (
    AdamW,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
new_data=pd.read_csv("../input/csvvvv/Classeur 1 (5).csv")

In [3]:
new_data.drop(["Y","Word Count","Language",' '], axis=1, inplace=True)
new_data.columns = ["text", "CEFR Review","CEFR Level_Samar"]

In [4]:
new_data.shape

(191, 3)

In [5]:
new_data.head(2)

Unnamed: 0,text,CEFR Review,CEFR Level_Samar
0,"En 1815, M. Charles-François-Bienvenu Myriel é...",A2,C1
1,Quoique ce détail ne touche en aucune manière ...,C1,C2


In [6]:
mike_20_june_samar=new_data[pd.notnull(new_data['CEFR Level_Samar'])]
mike_20_june_samar.columns = ["text", "CEFR Review","label"]
mike_20_june_samar.drop(["CEFR Review"],axis=1, inplace=True)
mike_20_june_samar.to_csv("mike_20_june_samar.csv" , index=False)
mike_20_june_samar.head()

Unnamed: 0,text,label
0,"En 1815, M. Charles-François-Bienvenu Myriel é...",C1
1,Quoique ce détail ne touche en aucune manière ...,C2
2,"En 1804, M. Myriel était curé de Brignolles. I...",C1
3,"Vers l'époque du couronnement, une petite affa...",C2
4,—Quel est ce bonhomme qui me regarde?,C1


In [7]:
mike_20_june_new=new_data[pd.notnull(new_data['CEFR Review'])]
mike_20_june_new.columns = ["text", "label","CEFR Level_Samar"]
mike_20_june_new.drop(["CEFR Level_Samar"],axis=1, inplace=True)
mike_20_june_new.to_csv("mike_20_june_new.csv" , index=False)
mike_20_june_new.head() 

Unnamed: 0,text,label
0,"En 1815, M. Charles-François-Bienvenu Myriel é...",A2
1,Quoique ce détail ne touche en aucune manière ...,C1
2,"En 1804, M. Myriel était curé de Brignolles. I...",A2
3,"Vers l'époque du couronnement, une petite affa...",B2
4,—Quel est ce bonhomme qui me regarde?,A2


**!!!! samar annotation seems to be weird because there is just C1 and C2**

In [8]:
mike_20_june_samar["label"].unique(), mike_20_june_new["label"].unique()

(array(['C1', 'C2'], dtype=object),
 array(['A2', 'C1', 'B2', 'A1', 'B1', 'C2'], dtype=object))

In [9]:
mike_20_june_new.label.value_counts()

C1    58
A1    41
B2    33
B1    29
A2    24
C2     6
Name: label, dtype: int64

In [10]:
mike_20_june_samar.shape,mike_20_june_new.shape

((141, 2), (191, 2))

In [11]:
trainn=pd.read_csv("../input/frenchcefr/french dataset.csv")
trainn.shape

(315, 4)

****

In [12]:
model_name = "dbmdz/bert-base-french-europeana-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Downloading:   0%|          | 0.00/420 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/227k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

In [13]:
lr = 2e-5
epochs =  6
batch_size = 5
max_seq_len = 75

test_frac = 0.1

In [14]:
import os

def set_seed(seed=106052):
    """Set seed for reproducibility.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

In [15]:
class CEFRDataset(Dataset):
    """Classification dataset, built on top of pytorch dataset object
    """
    
    def __init__(self, texts, labels):
        
        self.encoder = LabelEncoder()
        print(self.encoder.__dict__)
        self.texts = texts
        self.labels = self.encoder.fit_transform(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoded_text = tokenizer(
            text,
            padding="max_length",
            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        encoded_text["input_ids"] = encoded_text["input_ids"].squeeze()
        encoded_text["attention_mask"] = encoded_text["attention_mask"].squeeze()
        label = torch.tensor(label)

        return {
            "input_ids": encoded_text["input_ids"],
            "attention_mask": encoded_text["attention_mask"],
            "labels": label,
        }

    def get_labels(self):
        return self.labels

In [16]:
def train(train_set, valid_set, epochs=10, warmup_size=0.1, lr=1e-3, batch_size=16):
    model = get_model(model_name)
    optim = AdamW(model.parameters(), lr=lr)
    scheduler = get_scheduler(
        optim, warmup_size, round(len(train_set) / batch_size * epochs)
    )
    training_args = get_training_args(epochs, batch_size)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        optimizers=[optim, scheduler],
        compute_metrics=compute_accuracy,
    )
    trainer.train()
    trainer.save_model()
    return trainer

In [17]:
def get_model(pretrained_checkpoint):
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_checkpoint, num_labels=6
    )
    return model.to(device)

In [18]:
os.environ["WANDB_DISABLED"] = "true"


def get_scheduler(optimizer, warmup_size, total_steps):
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=round(total_steps * warmup_size),
        num_training_steps=total_steps,
    )
    return scheduler


def get_training_args(epochs, batch_size):
    return TrainingArguments(
        output_dir="./b",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=50,
        fp16=False,
        evaluation_strategy="epoch",
        eval_accumulation_steps=1,
        report_to=None,
        save_total_limit=1,
        load_best_model_at_end=True
    )


def compute_accuracy(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}



In [19]:
lr = 2e-5
epochs =  7
batch_size = 8
max_seq_len = 512

In [20]:
def split_valid(df, frac=0.01):
    
    val = pd.DataFrame()
    val["text"] = ""
    val["label"] = -1
    
    for i in df.label.unique():
        val = pd.concat([val, df[df.label == i].sample(frac=frac)])
        
    return df[~df.index.isin(val.index)].reset_index(drop=True) , val.reset_index(drop=True)

In [21]:
# train_set_df = pd.read_csv("../input/newdatasets/mike_20_june_new (1).csv")
# train_set_df

In [22]:
mike_20_june_new.head(2)

Unnamed: 0,text,label
0,"En 1815, M. Charles-François-Bienvenu Myriel é...",A2
1,Quoique ce détail ne touche en aucune manière ...,C1


In [23]:
mike_20_june_new.shape

(191, 2)

In [24]:
new=pd.read_csv("../input/lingua/fr_lingua.csv")

In [25]:
new.columns=["text","label"]

In [26]:
french=pd.read_csv("../input/frenchcefr/french dataset.csv")
french=french[["text","label"]]
french.shape,french.head(2)


((315, 2),
                                                 text label
 0  Il était une fois un prince qui voulait épouse...    A2
 1  Il fit le tour de la Terre pour en trouver une...    B1)

In [27]:
# train_set_df=pd.concat([french, mike_20_june_new,new], ignore_index=True)

In [28]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

train_set_df = pd.read_csv("../input/newdatasets/mike_20_june_new (1).csv")
train_set_df = train_set_df[train_set_df.label != "XX"]
train_set_df = train_set_df[["text", "label"]]

train_set_df.text = train_set_df.text.apply(lambda x: x.replace("\r", "").replace("\n", " "))

extra_df = pd.read_csv("../input/frenchcefr/french_mike_june.csv")
extra_df.columns = ["text", "label", "label_"]
extra_df = extra_df[["text", "label"]]
extra_df.text = extra_df.text.astype(str)
#train_set_df = pd.concat([train_set_df, extra_df]).reset_index(drop=True)

train_set_df, valid_set_df = split_valid(train_set_df)

In [29]:
train_set_df.label.unique()

array(['A2', 'C1', 'B2', 'A1', 'B1', 'C2'], dtype=object)

In [30]:
train_set_df.label = le.fit_transform(train_set_df.label)
valid_set_df.label = le.transform(valid_set_df.label)

In [31]:
train_set_df.label.nunique()

6

In [32]:
train_set_df.label.value_counts()

4    57
0    41
3    33
2    29
1    24
5     6
Name: label, dtype: int64

In [33]:
valid_set_df.label.nunique()

1

In [34]:
# train_set_df = train_set_df.sample(frac=1)

In [35]:
from tqdm import tqdm 

def predict(model, text):
    
    preds = []
    
    for i in tqdm(range(len(text))):
        tokenized = tokenizer(text[i:i+1], return_tensors="pt", truncation=True, max_length=512).to("cuda")
        pred = model(**tokenized)
        preds.append(pred.logits.argmax(-1).item())

    return preds

In [36]:
train_set = CEFRDataset(train_set_df["text"], train_set_df["label"])
valid_set = CEFRDataset(valid_set_df["text"], valid_set_df["label"])


trainer_second = train(train_set, valid_set, epochs=epochs, warmup_size=0.2, lr=lr, batch_size=batch_size)
model = trainer_second.model

{}
{}


Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-french-europeana-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificat

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,No log,2.087388,0.0,0.051,19.612
2,No log,3.319314,0.0,0.0835,11.974
3,1.512000,4.162873,0.0,0.0453,22.071
4,1.512000,4.354156,0.0,0.0453,22.06
5,0.867000,4.624961,0.0,0.0416,24.039
6,0.867000,4.566848,0.0,0.0448,22.341
7,0.585500,4.568268,0.0,0.045,22.205


In [37]:
valid_set_df["preds"] = train_set.encoder.inverse_transform(predict(model, valid_set_df.text.tolist()))
valid_set_df.columns = ["text", "cefr", "preds",] 

100%|██████████| 1/1 [00:00<00:00, 32.40it/s]


In [38]:
lingua = pd.read_csv("../input/lingua/fr_lingua.csv")
lingua["preds"] = train_set.encoder.inverse_transform(predict(model, lingua.text.tolist()))

100%|██████████| 111/111 [00:01<00:00, 58.09it/s]


In [39]:
# lingua.columns=["text","len","lang","cefr","preds"]
# lingua = lingua[lingua.cefr != "XX"]

In [40]:
lingua.cefr.value_counts()

B1    60
A2    26
A1    13
B2    12
Name: cefr, dtype: int64

In [41]:
lingua.preds.unique()

array([4])

In [42]:
def compute_average_distance(df, col_name="cefr") :
    
    labels = ["A1", "A2", "B1", "B2", "C1", "C2"]
    return (df[col_name].apply(lambda x: labels.index(x)) - df.preds.apply(lambda x: labels.index(x))).abs().mean()


In [43]:
lingua.preds.value_counts()

4    111
Name: preds, dtype: int64

In [44]:
lingua["preds"] = le.inverse_transform(lingua.preds)

In [45]:
lingua.preds.value_counts()

C1    111
Name: preds, dtype: int64

In [46]:
print("Distance: ")
print(compute_average_distance(lingua, "cefr"))

print(f"Lingua accuracy: {(lingua['preds'] == lingua['cefr']).mean()}")

Distance: 
2.3603603603603602
Lingua accuracy: 0.0


In [47]:
for i in range(lingua.shape[0]):
    if (lingua.loc[i,"preds"]=="C1") | (lingua.loc[i,"preds"]=="C2") :
        lingua.loc[i,"Predss"]="B2"
    else :
        lingua.loc[i,"Predss"]= lingua.loc[i,"preds"]
lingua["Predss"].value_counts()

B2    111
Name: Predss, dtype: int64

In [48]:
print("Distance: ")
print(compute_average_distance(lingua, "cefr"))

print(f"Lingua accuracy: {(lingua['Predss'] == lingua['cefr']).mean()}")

Distance: 
2.3603603603603602
Lingua accuracy: 0.10810810810810811


****

Distance: 
0.990990990990991
Lingua accuracy: 0.40540540540540543 <br>
2    82
3    15
5    10
4     2
0     1
1     1 <br> 
inokufu/flaubert-base-uncased-xnli-sts-finetuned-education

****


Distance: 
0.8648648648648649
Lingua accuracy: 0.450450450450450462 <br>
3     7
1     5
5     4
4     4
0     1 <br> inokufu/flaubert-base-uncased-xnli-sts-finetuned-education