In [2]:
import os

import random
import numpy as np

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import (
    AdamW,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
model_name = "microsoft/Multilingual-MiniLM-L12-H384"

tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Downloading:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

In [4]:
lr = 2e-5
epochs =  6
batch_size = 5
max_seq_len = 75

test_frac = 0.1

In [5]:
import os

def set_seed(seed=106052):
    """Set seed for reproducibility.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

In [6]:
class CEFRDataset(Dataset):
    """Classification dataset, built on top of pytorch dataset object
    """
    
    def __init__(self, texts, labels):
        
        self.encoder = LabelEncoder()
        print(self.encoder.__dict__)
        self.texts = texts
        self.labels = self.encoder.fit_transform(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoded_text = tokenizer(
            text,
            padding="max_length",
            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        encoded_text["input_ids"] = encoded_text["input_ids"].squeeze()
        encoded_text["attention_mask"] = encoded_text["attention_mask"].squeeze()
        label = torch.tensor(label)

        return {
            "input_ids": encoded_text["input_ids"],
            "attention_mask": encoded_text["attention_mask"],
            "labels": label,
        }

    def get_labels(self):
        return self.labels

In [7]:
def train(train_set, valid_set, epochs=10, warmup_size=0.1, lr=1e-3, batch_size=16):
    model = get_model(model_name)
    optim = AdamW(model.parameters(), lr=lr)
    scheduler = get_scheduler(
        optim, warmup_size, round(len(train_set) / batch_size * epochs)
    )
    training_args = get_training_args(epochs, batch_size)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        optimizers=[optim, scheduler],
        compute_metrics=compute_accuracy,
    )
    trainer.train()
    trainer.save_model()
    return trainer

In [8]:
def get_model(pretrained_checkpoint):
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_checkpoint, num_labels=6
    )
    return model.to(device)

In [9]:
os.environ["WANDB_DISABLED"] = "true"


def get_scheduler(optimizer, warmup_size, total_steps):
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=round(total_steps * warmup_size),
        num_training_steps=total_steps,
    )
    return scheduler


def get_training_args(epochs, batch_size):
    return TrainingArguments(
        output_dir="./b",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=50,
        fp16=False,
        evaluation_strategy="epoch",
        eval_accumulation_steps=1,
        report_to=None,
        save_total_limit=1,
        load_best_model_at_end=True
    )


def compute_accuracy(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}



In [10]:
lr = 2e-5
epochs =  4
batch_size = 8
max_seq_len = 512

In [11]:
def split_valid(df, frac=0.1):
    
    val = pd.DataFrame()
    val["text"] = ""
    val["label"] = -1
    
    for i in df.label.unique():
        val = pd.concat([val, df[df.label == i].sample(frac=frac)])
        
    return df[~df.index.isin(val.index)].reset_index(drop=True) , val.reset_index(drop=True)

In [12]:
train_set_df = pd.read_csv("../input/fr-cerfr/dataset_to_be_annotated_-_dataset_to_be_annotated.csv")
train_set_df.drop("Unnamed: 3", axis=1, inplace=True)
train_set_df = train_set_df[pd.notnull(train_set_df['cefr'])]
train_set_df = train_set_df.reset_index(drop=True)
train_set_df.columns=["text","lens","label"]
train_set_df = train_set_df[train_set_df.label != "-"]
train_set_df = train_set_df[["text", "label"]]

In [13]:
# train_set_df.label = train_set_df.label.replace("c1", "C1")

In [None]:
"""A1_A2=pd.read_csv("../input/collecteda1a2/frr.csv - Feuil1.csv")
A1_A2=A1_A2.dropna()
A1_A2.columns=["text","label"]
A1_A2 = A1_A2.reset_index(drop=True)
A1A2=pd.read_csv("../input/collecteda1a2/complete.csv.csv")
A1__A2=pd.read_csv("../input/a1-a2-ex/A1_A2_tran.csv")"""

In [14]:
# A1_A2=pd.read_csv("../input/collecteda1a2/frr.csv - Feuil1.csv")
# A1_A2=A1_A2.dropna()
# A1_A2.columns=["text","label"]
# A1_A2 = A1_A2.reset_index(drop=True)

In [15]:
# A1_A2["label"] = A1_A2["label"].replace(" A2", "A2").replace("B25", "B2")
# A1_A2 = A1_A2[A1_A2["label"].isin(["B1", "A2", "B2", "A1"])]

In [16]:
# train_set_df=pd.concat([train_set_df , A1_A2], ignore_index=True)

In [17]:
## Didn't give good results
"""translated = pd.read_csv("../input/eng-data/translated.csv")

translated = translated[translated.label.isin(["A1", "A2", "C1", "C2"])].reset_index(drop=True)
translated  = translated[["translation", "label"]]
translated.columns = ["text", "label"]

train_set_df = pd.concat([train_set_df, translated.sample(frac=0.2)])"""

'translated = pd.read_csv("../input/eng-data/translated.csv")\n\ntranslated = translated[translated.label.isin(["A1", "A2", "C1", "C2"])].reset_index(drop=True)\ntranslated  = translated[["translation", "label"]]\ntranslated.columns = ["text", "label"]\n\ntrain_set_df = pd.concat([train_set_df, translated.sample(frac=0.2)])'

In [19]:
extra_df = pd.read_csv("../input/frenchcefr/french_mike_june.csv")
extra_df.columns = ["text", "label", "label_"]
extra_df = extra_df[["text", "label"]]
extra_df.text = extra_df.text.astype(str)

train_set_df = pd.concat([train_set_df, extra_df.sample(frac=0.58)]).reset_index(drop=True)

In [20]:
# lingua_ = pd.read_csv("../input/lingua/fr_lingua.csv")
# lingua_.columns = ["text", "label"]

# train_set_df = pd.concat([train_set_df, lingua_.sample(frac=0.75)]).reset_index(drop=True)

In [21]:
train_set_df.label.value_counts()

B1    183
B2    159
A2    109
C1    101
A1     84
C2     16
Name: label, dtype: int64

In [22]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

train_set_df.text = train_set_df.text.apply(lambda x: x.replace("\r", "").replace("\n", " "))

# extra_df = pd.read_csv("../input/frenchcefr/french_mike_june.csv")
# extra_df.columns = ["text", "label", "label_"]
# extra_df = extra_df[["text", "label"]]
# extra_df.text = extra_df.text.astype(str)
#train_set_df = pd.concat([train_set_df, extra_df]).reset_index(drop=True)

train_set_df, valid_set_df = split_valid(train_set_df)

In [23]:
# for i in range(train_set_df.shape[0]):
#     if train_set_df.loc[i,"label"]=="c1":
#         train_set_df.loc[i,"label"]="C1"

In [24]:
train_set_df.label.unique()

array(['C1', 'B2', 'B1', 'C2', 'A2', 'A1'], dtype=object)

In [25]:
train_set_df.label.value_counts()

B1    165
B2    143
A2     98
C1     91
A1     76
C2     14
Name: label, dtype: int64

In [26]:
train_set_df.label = le.fit_transform(train_set_df.label)
valid_set_df.label = le.transform(valid_set_df.label)

In [27]:
train_set_df.label.unique()

array([4, 3, 2, 5, 1, 0])

In [28]:
valid_set_df.label.nunique()

6

In [29]:
valid_set_df

Unnamed: 0,text,label
0,"À sa droite, son aide de camp, et à sa gauche,...",4
1,Les ornières devinrent plus profondes. On appr...,4
2,"Son cou sortait d’un col blanc, rabattu. Ses c...",4
3,Loin de là. Comme il y a toujours encore plus ...,4
4,– Cinq cents vers à toute la classe ! exclamé ...,4
...,...,...
60,Séminaire des missions étrangères à Paris: deu...,0
61,Pour la société de charité maternelle de Dragu...,0
62,Le sommeil profond a un rôle important dans la...,0
63,Total: trois mille livres,0


In [30]:
# train_set_df = train_set_df.sample(frac=1)

In [31]:
from tqdm import tqdm 

def predict(model, text):
    
    preds = []
    
    for i in tqdm(range(len(text))):
        tokenized = tokenizer(text[i:i+1], return_tensors="pt", truncation=True, max_length=512).to("cuda")
        pred = model(**tokenized)
        preds.append(pred.logits.argmax(-1).item())

    return preds

In [32]:
train_set = CEFRDataset(train_set_df["text"], train_set_df["label"])
valid_set = CEFRDataset(valid_set_df["text"], valid_set_df["label"])


trainer_second = train(train_set, valid_set, epochs=epochs, warmup_size=0.2, lr=lr, batch_size=batch_size)
model = trainer_second.model

{}
{}


Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.7775,1.709325,0.276923,0.715,90.913


KeyboardInterrupt: 

In [None]:
valid_set_df["preds"] = train_set.encoder.inverse_transform(predict(model, valid_set_df.text.tolist()))
valid_set_df.columns = ["text", "cefr", "preds",] 

In [None]:
lingua = pd.read_csv("../input/lingua/fr_lingua.csv")
lingua["preds"] = train_set.encoder.inverse_transform(predict(model, lingua.text.tolist()))

In [None]:
lingua.shape

In [None]:
lingua.cefr.value_counts()

In [None]:
lingua.preds.unique()

In [None]:
def compute_average_distance(df, col_name="cefr") :
    
    labels = ["A1", "A2", "B1", "B2", "C1", "C2"]
    return (df[col_name].apply(lambda x: labels.index(x)) - df.preds.apply(lambda x: labels.index(x))).abs().mean()


In [None]:
lingua.preds.value_counts()

In [None]:
lingua["preds"] = le.inverse_transform(lingua.preds)

In [None]:
lingua.cefr.value_counts()

In [None]:
lingua.preds.value_counts()

In [None]:
print("Distance: ")
print(compute_average_distance(lingua, "cefr"))

print(f"Lingua accuracy: {(lingua['preds'] == lingua['cefr']).mean()}")

In [None]:
# lingua.columns=["text","cefr","preds:A1_A2_from_trans_data&new_data,acc:0.5"]
# lingua.to_csv("lingua1.csv", index=False)

****

****

In [None]:
extra_df = pd.read_csv("../input/frenchcefr/french_mike_june.csv")
extra_df.columns = ["text", "label", "label_"]
extra_df = extra_df[["text", "label"]]
extra_df.text = extra_df.text.astype(str)



In [None]:
extra_df

In [None]:
extra_df["preds"] = predict(model, extra_df.text.tolist())


In [None]:
extra_df.preds.unique()

In [None]:
extra_df["preds"] = le.inverse_transform(extra_df.preds.tolist())

In [None]:
print("Distance: ")
print(compute_average_distance(extra_df, "label"))

print(f"Lingua accuracy: {(extra_df['preds'] == extra_df['label']).mean()}")

In [None]:
lingua.to_csv("lingua_latest_preds.csv", index=False)
extra_df.to_csv("mike_annotated_dataset_preds.csv", index=False)

In [None]:
extra_df.preds.value_counts()

In [None]:
extra_df.label.value_counts()