# Inferring with the models we fine-tuned
## 1. Load model

In [None]:
# Option A: fine-tuned XLM-RoBERTa-large checkpoint name and the base model
# whose tokenizer is loaded further down.
# NOTE(review): the following cell assigns the same two names, so whichever
# of the two cells is executed last decides which model is used.
model_finetuned = "xlm-roberta-large_text-mine"
pretrained = "xlm-roberta-large"

In [None]:
# Option B: fine-tuned CamemBERT-large checkpoint name and its base model.
# The "/" in the name is replaced by "_" when it is used in output file names.
# NOTE(review): the preceding cell assigns the same two names; run only the
# cell for the model you want, or run it last.
model_finetuned = "camembert/camembert-large_text-mine"
pretrained = "camembert/camembert-large"

In [None]:
import transformers
from transformers import pipeline
import pandas as pd

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the base (non fine-tuned) model, capped at 512 positions.
# NOTE(review): `is_split_into_words` is passed again at call time in
# `textmine_pipeline`; confirm it has any effect as a `from_pretrained` kwarg.
tokenizer = AutoTokenizer.from_pretrained(pretrained, is_split_into_words=True, model_max_length=512)

In [None]:
from transformers import AutoModelForTokenClassification

# Class index -> label string; the position in the list is the class index.
label_correspondance = dict(enumerate([
    "aucun",
    "geogFeat",
    "geogFeat geogName",
    "geogName",
    "name",
    "name geogName",
]))

# Both directions of the mapping are handed to the model configuration.
id2label = label_correspondance
label2id = {name: idx for idx, name in id2label.items()}

# Load the fine-tuned token-classification checkpoint from the local folder.
model = AutoModelForTokenClassification.from_pretrained(
    f"./../models_finetuned/{model_finetuned}", id2label=id2label, label2id=label2id
)

In [None]:
import numpy as np
import torch

def textmine_pipeline(text, model=model, tokenizer=tokenizer):
    """Predict one label per word of a pre-split sentence.

    Args:
        text: list of words (the sentence is already split into words).
        model: token-classification model; defaults to the model loaded above.
        tokenizer: fast tokenizer exposing ``word_ids()``; defaults to the
            tokenizer loaded above.

    Returns:
        A list of label strings taken from ``label_correspondance``. Each word
        receives the prediction made for its first sub-token; a word that
        produces no sub-token gets no entry.
    """
    tokens = tokenizer.encode_plus(text, is_split_into_words=True, return_tensors="pt")
    # Keep the inputs on the same device as the model (the model may have been
    # moved, e.g. by a pipeline created with an explicit device).
    tokens = tokens.to(model.device)

    # Inference only: run the forward pass itself without autograd bookkeeping.
    with torch.no_grad():
        logits = model(**tokens).logits
    p = logits.argmax(dim=-1)

    predictions_aggregated = []
    previous_word = None
    # Enumerate the FULL sequence so `position` lines up with the logits.
    # Slicing off the special tokens before enumerating shifted every lookup
    # by one (the first word received the prediction of the leading special
    # token).
    for position, word_id in enumerate(tokens.word_ids()):
        if word_id is None:
            # Special tokens are not attached to any word.
            continue
        if word_id != previous_word:
            predictions_aggregated.append(label_correspondance[p[0][position].item()])
        previous_word = word_id
    return predictions_aggregated

In [None]:
# Smoke test on a hand-split French sentence; the bare `t` on the last line
# displays the returned labels in the notebook.
t = textmine_pipeline(["Le", "port", "de", "plaisance", "de", "Cavalaire", "se", "trouve", "à"])
t

## 2. Load data

In [None]:
# Test set, one token per row; the following cells read its "Token" and "Id" columns.
df_raw = pd.read_csv("./../data/test.csv")

In [None]:
# Rebuild sentences (lists of tokens) from the one-token-per-row table.
current_sentence = []
list_sentences = []

for raw_token in df_raw["Token"]:
    # Tokens are stored wrapped in double quotes; strip them.
    token = raw_token.replace('"', '')
    current_sentence.append(token)

    # A token ending with a period closes the current sentence.
    if token.endswith('.'):
        list_sentences.append(current_sentence)
        current_sentence = []

# Keep the tokens that follow the last period, so that every row of the input
# ends up in exactly one sentence. This is done after the loop instead of
# comparing the DataFrame index with the last "Id" value, which only works
# when the two happen to coincide.
if current_sentence:
    list_sentences.append(current_sentence)
    current_sentence = []


df = pd.DataFrame()
df["tokens"] = list_sentences

In [None]:
# Display the rebuilt sentences (one list of tokens per row).
df

## 3. Infer

In [None]:
from tqdm.notebook import tqdm
# Makes `progress_apply` available on pandas objects (apply with a progress bar).
tqdm.pandas()

# Run the model on every sentence; each cell of "predicted" holds the list of
# labels returned by `textmine_pipeline` for that sentence.
df["predicted"] = df["tokens"].progress_apply(textmine_pipeline)

In [None]:
# Display the sentences together with their predicted labels.
df

## 4. Saving files

In [None]:
def split_lists(row):
    """Pair each token of a row with its predicted label.

    `row` is indexed by the keys 'tokens' and 'predicted'. The result is a
    list of (token, label) tuples, cut to the shorter of the two sequences.
    """
    return [pair for pair in zip(row['tokens'], row['predicted'])]

In [None]:
# One list of (token, label) pairs per sentence. This assignment was commented
# out, which left `df_test` undefined for the loop below.
df_test = df.apply(split_lists, axis=1)

# Flatten the per-sentence lists into a single list of (token, label) pairs.
list_tokens = []
for liste in df_test.tolist():
    list_tokens.extend(liste)

df_test2 = pd.DataFrame(list_tokens)
# reset_index() inserts the running row number as a column named "index"
# (lowercase); it becomes the "Id" column selected when writing the submission.
df_test2 = df_test2.reset_index()
df_test2 = df_test2.rename(columns={"index": "Id", 0: "Token", 1: "Label"})

df_test2

In [None]:
# Full per-token output, named after the model ("/" replaced by "_" so the
# name stays a single file name). No `index=False` here, so the DataFrame
# index is written as well.
df_test2.to_csv(f"./../output/{model_finetuned.replace('/', '_')}_nopipeline_test.csv")

In [None]:
# Submission file: only the "Id" and "Label" columns, without the DataFrame index.
df_test2[["Id", "Label"]].to_csv(f"./../output/submission_remy_2.csv", index=False)

## 6. Other verification tests on the training data

In [None]:
df_raw_train = pd.read_csv("./../data/train.csv")

# Rebuild sentences (lists of tokens) from the one-token-per-row table.
current_sentence = []
list_sentences = []

for raw_token in df_raw_train["Token"]:
    # Tokens are stored wrapped in double quotes; strip them.
    token = raw_token.replace('"', '')
    current_sentence.append(token)

    # A token ending with a period closes the current sentence.
    if token.endswith('.'):
        list_sentences.append(current_sentence)
        current_sentence = []

# Keep the tokens that follow the last period instead of silently dropping
# them, as the test-set cell also does for its last row.
if current_sentence:
    list_sentences.append(current_sentence)
    current_sentence = []


df_train = pd.DataFrame()
df_train["tokens"] = list_sentences

In [None]:
# Display the rebuilt training sentences.
df_train

In [None]:
# NOTE(review): `nlp` is not defined anywhere in the visible notebook (only
# `nlp_misc` is created, further down) — this cell presumably relies on a
# pipeline object from an earlier version of the notebook; confirm before running.
df_train["predicted"] = df_train["tokens"].progress_apply(nlp)

In [None]:
# Post-process each row's predictions with `geogName(tokens, predicted)`.
# NOTE(review): `geogName` is not defined anywhere in the visible notebook —
# confirm where it comes from before running.
df_train["predicted_post"] = df_train.progress_apply(lambda x: geogName(x.tokens, x.predicted), axis=1)

In [None]:
# Flatten the post-processed predictions into one record per result, numbered
# consecutively across all sentences.
# (Iterate over df_train["predicted"] instead to export the raw predictions.)
flat_results = (
    ner_results
    for sentence in df_train["predicted_post"]
    for ner_results in sentence
)
list_tokens = [
    {
        "Id": row_id,
        "Token": f'"""{ner_results[0]["word"]}"""',
        "Label": ner_results[0]["entity_group"],
    }
    for row_id, ner_results in enumerate(flat_results)
]
# Running counter value after the last record, as left by the original loop.
id_number = len(list_tokens)

df_test_train = pd.DataFrame(list_tokens)
df_test_train.to_csv(f"./../output/{model_finetuned.replace('/', '_')}_train.csv")

In [None]:
# Manual check on a hand-split sentence.
# NOTE(review): `nlp` is not defined in the visible notebook — confirm its origin.
nlp(["Le", "port", "de", "plaisance", "de", "Cavalaire", "se", "trouve", "à"])

In [None]:
# Inspect the raw predictions stored for the second training sentence.
print(df_train.iloc[1]["predicted"])

In [None]:
# Non-canonical label spellings (reordered or repeated parts) and the
# canonical label each one is rewritten to.
_CANONICAL_LABELS = {
    "geogName name": "name geogName",
    "geogFeat geogName geogName": "geogFeat geogName",
    "name geogName geogName": "name geogName",
    "geogName geogName": "geogName",
    "geogName geogFeat geogName": "geogFeat geogName",
    "geogName geogFeat": "geogFeat geogName",
    "geogName geogName name": "name geogName",
    "geogName name geogName": "name geogName",
    "geogFeat geogName geogName geogName": "geogFeat geogName",
    "geogName geogName name geogName": "name geogName",
}


def cleaning_label(label):
    """Return the canonical form of `label`.

    Labels listed in the table above are rewritten; any other value is
    returned unchanged.
    """
    return _CANONICAL_LABELS.get(label, label)

# Normalise the gold labels of the training set in place.
df_raw_train["Label"] = df_raw_train["Label"].apply(cleaning_label)

In [None]:
label_correspondance = {
    0: "aucun",
    1: "geogFeat",
    2: "geogFeat geogName",
    3: "geogName",
    4: "name",
    5: "name geogName"
}

# Reverse mapping (label string -> class index), built once instead of
# scanning `label_correspondance` for every row.
_label_to_id = {value: key for key, value in label_correspondance.items()}


def label_id(desired_value):
    """Return the class index of a label string, or None if it is unknown."""
    return _label_to_id.get(desired_value)


def list_of_prediction(df_raw):
    """Group a one-token-per-row table into sentences.

    Args:
        df_raw: DataFrame with a "Token" column (strings, possibly wrapped in
            double quotes) and a "Label" column.

    Returns:
        A DataFrame with one row per sentence and three columns of lists:
        "tokens" (tokens with double quotes removed), "labels" (the original
        label values) and "ner_tags" (class indices from `label_id`, None for
        labels that are not in `label_correspondance`).
    """
    sentences = []  # each entry: list of (token, label, ner_tag) triples
    current = []

    for raw_token, label in zip(df_raw["Token"], df_raw["Label"]):
        token = raw_token.replace('"', '')
        current.append((token, label, label_id(label)))

        # A token ending with a period closes the current sentence.
        if token.endswith('.'):
            sentences.append(current)
            current = []

    # Keep the tokens that follow the last period instead of dropping them.
    if current:
        sentences.append(current)

    df = pd.DataFrame()
    df["tokens"] = [[token for token, _, _ in sentence] for sentence in sentences]
    df["labels"] = [[label for _, label, _ in sentence] for sentence in sentences]
    df["ner_tags"] = [[tag for _, _, tag in sentence] for sentence in sentences]
    return df

# Sentence-level views of the exported predictions and of the gold training
# annotations, built the same way so they can be compared.
df_predicted_train = list_of_prediction(df_test_train)
df_true_train = list_of_prediction(df_raw_train)

In [None]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from seqeval.metrics import recall_score

label_correspondance = {
    0: "aucun",
    1: "geogFeat",
    2: "geogFeat geogName",
    3: "geogName",
    4: "name",
    5: "name geogName"
}

# References come from the gold annotations and predictions from the model
# output (the two frames were swapped). Class indices are mapped back to label
# strings with `label_correspondance`; the previously used `label_names` is not
# defined in this notebook.
true_labels = [[label_correspondance[l] for l in label if l != -100] for label in df_true_train["ner_tags"]]
true_predictions = [[label_correspondance[l] for l in label if l != -100] for label in df_predicted_train["ner_tags"]]

print(classification_report(df_true_train["labels"], df_predicted_train["labels"]))

# NOTE(review): the original called `metric.compute(...)`, but no `metric`
# object is created in this notebook; the overall scores are computed with
# seqeval directly instead. Confirm these are the figures wanted.
all_metrics = {
    "precision": precision_score(true_labels, true_predictions),
    "recall": recall_score(true_labels, true_predictions),
    "f1": f1_score(true_labels, true_predictions),
    "accuracy": accuracy_score(true_labels, true_predictions),
}
# print(all_metrics)
all_metrics

In [None]:
# Inspect the gold label lists (one list per sentence).
df_true_train["labels"].values

In [None]:
# Inspect the predicted label lists (one list per sentence).
df_predicted_train["labels"]

In [None]:
import torch

# Reload the tokenizer (rebinds the module-level `tokenizer`) and wrap the
# fine-tuned model in a token-classification pipeline.
tokenizer = AutoTokenizer.from_pretrained(pretrained, model_max_length=512)
# Use the first CUDA device when one is available, otherwise fall back to the
# CPU (-1) instead of failing on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
nlp_misc = pipeline("ner", model=model, tokenizer=tokenizer, device=pipeline_device, framework="pt", aggregation_strategy="first")

In [None]:
# Try the pipeline on a hand-split sentence.
# NOTE(review): a list passed to a pipeline is presumably treated as a batch of
# separate texts rather than one pre-split sentence — confirm this is intended.
tokens= ["Le", "port", "de", "plaisance", "de", "Cavalaire", "se", "trouve", "à"]
nlp_misc(tokens)

In [None]:
# Same sentence through `nlp` for comparison.
# NOTE(review): `nlp` is not defined in the visible notebook — confirm its origin.
nlp(["Le", "port", "de", "plaisance", "de", "Cavalaire", "se", "trouve", "à"])

In [None]:
import numpy as np
import torch

# Manual forward pass on one pre-split sentence, printing every intermediate
# result for inspection.
tokens = tokenizer(["Le", "port", "de", "plaisance", "de", "Cavalaire", "se", "trouve", "à"], is_split_into_words=True, return_tensors="pt")
# Put the inputs on whatever device the model currently lives on instead of
# hard-coding GPU 0.
tokens = tokens.to(model.device)
print(tokens)

# Inference only: the forward pass itself runs without autograd bookkeeping
# (previously only the argmax was wrapped in no_grad).
with torch.no_grad():
    predictions = model(**tokens)
print(predictions)

logits = predictions.logits.cpu()
# Highest-scoring class index for every position of the sequence.
p = logits.argmax(dim=-1)
print(p)
