In [7]:
# Importation des bibliothèques nécessaires
!pip install transformers
!pip install tensorflow
!pip install sacremoses
!pip install sentencepiece


import pandas as pd
import re
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from transformers import FlaubertTokenizer, TFFlaubertModel, TFFlaubertForSequenceClassification
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split



In [8]:
# Charger les données
train_data = pd.read_csv('/kaggle/input/ouchy-data/training_data.csv')

# Dupliquer les données une première fois
train_data_duplicated_once = pd.concat([train_data, train_data])

# Dupliquer les données une deuxième fois pour obtenir une multiplication par 4
train_data_duplicated_twice = pd.concat([train_data_duplicated_once, train_data_duplicated_once])

train_data = train_data_duplicated_twice

X = train_data['sentence']
y = train_data['difficulty']

# Encodage des labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialiser le tokenizer FlauBERT
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Préparer les données pour FlauBERT
def encode_for_flaubert(sentences, max_length=128):
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = tf.concat(input_ids, 0)
    attention_masks = tf.concat(attention_masks, 0)

    return input_ids, attention_masks

train_input_ids, train_attention_masks = encode_for_flaubert(X_train, max_length=128)
test_input_ids, test_attention_masks = encode_for_flaubert(X_test, max_length=128)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
# Charger le modèle FlauBERT pré-entraîné pour la classification de séquence
model = TFFlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=len(label_encoder.classes_), from_pt=True)

# Compiler le modèle
optimizer = Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Entraîner le modèle
model.fit(
    [train_input_ids, train_attention_masks],
    y_train,
    epochs=3,
    batch_size=16,
    validation_data=([test_input_ids, test_attention_masks], y_test)
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFFlaubertForSequenceClassification: ['pred_layer.proj.bias', 'pred_layer.proj.weight']
- This IS expected if you are initializing TFFlaubertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFFlaubertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFFlaubertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7b7c0c92ff70>

In [10]:
# Charger les données de test non étiquetées
test_data = pd.read_csv('/kaggle/input/ouchy-data/unlabelled_test_data.csv')
test_input_ids, test_attention_masks = encode_for_flaubert(test_data['sentence'])

# Prédiction sur les données de test
test_predictions = model.predict([test_input_ids, test_attention_masks])
test_predicted_classes = tf.argmax(test_predictions.logits, axis=1).numpy()

# Création du fichier de soumission
submission = pd.DataFrame({'id': test_data['id'], 'difficulty': label_encoder.inverse_transform(test_predicted_classes)})
submission.to_csv('submission3.csv', index=False)






In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
from IPython.display import HTML

# Supposons que vous ayez un ensemble de test X_test et y_test
test_input_ids, test_attention_masks = encode_for_flaubert(X_test, max_length=128)
y_pred = model.predict([test_input_ids, test_attention_masks])
y_pred_classes = np.argmax(y_pred.logits, axis=1)

# Calculer les métriques
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')
accuracy = accuracy_score(y_test, y_pred_classes)

# Générer la matrice de confusion
cm = confusion_matrix(y_test, y_pred_classes)






In [12]:
# Tableau des métriques
metrics_html = f"""
<table>
<tr><th>Metric</th><th>Value</th></tr>
<tr><td>Precision</td><td>{precision:.2f}</td></tr>
<tr><td>Recall</td><td>{recall:.2f}</td></tr>
<tr><td>F1-Score</td><td>{f1:.2f}</td></tr>
<tr><td>Accuracy</td><td>{accuracy:.2f}</td></tr>
</table>
"""

# Tableau de la matrice de confusion
cm_df = pd.DataFrame(cm, index=label_encoder.classes_, columns=label_encoder.classes_)
cm_html = cm_df.to_html()

# Afficher les tableaux
HTML(metrics_html + cm_html)


Metric,Value
Precision,0.92
Recall,0.92
F1-Score,0.92
Accuracy,0.92

Unnamed: 0,A1,A2,B1,B2,C1,C2
A1,631,24,1,0,0,0
A2,65,563,19,0,0,0
B1,8,84,558,0,0,0
B2,1,8,44,559,6,11
C1,0,0,6,13,597,9
C2,0,0,1,8,17,607


In [14]:
# Préparer les données pour les prédictions erronées
incorrect_predictions_df = pd.DataFrame({'Sentence': X_test, 'Actual': label_encoder.inverse_transform(y_test), 'Predicted': label_encoder.inverse_transform(y_pred_classes)})
incorrect_predictions_df = incorrect_predictions_df[incorrect_predictions_df['Actual'] != incorrect_predictions_df['Predicted']].head(10)

# Convertir en HTML
incorrect_html = incorrect_predictions_df.to_html()
HTML(incorrect_html)


Unnamed: 0,Sentence,Actual,Predicted
4180,Il fait un froid incroyable à Moscou mais la ville est superbe,B1,A2
1960,"Malgré le danger, il a gardé la tête froide durant cette expérience.",B2,B1
2609,"Le nombre insuffisant de professeurs formés est également un frein pour beaucoup d'adversaires de cette mesure, qui la jugent inapplicable.",C2,B2
1347,Cette période marque l'arrivée de Noël et des fêtes de fin d'année.,A1,A2
2920,"Oui, prenons-en pour le trajet!",A2,A1
1277,"Je m'appelle Laurent et j'habite à Paris avec mes parents, ma soeur ainée et mon frère.",A2,A1
344,Comment j'ai survécu à une perverse narcissique,B2,B1
1411,L'asile est le creuset terrible où se forge l'identité du vagabond.,C1,C2
4591,"Les genoux au menton, les bras croisés sur la poitrine, il se fit boule pour mieux interroger le mot qui ne quittait jamais longtemps sa pensée.",C1,C2
2684,"C'est une chance pour lui, il va gagner un peu plus d'argent.",B1,A1


In [15]:
!pip install streamlit


Collecting streamlit
  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/d3/96/9251b421d0a1c7d625a82a04bea56b8a9830c785940ec16db454b85c6db7/streamlit-1.29.0-py2.py3-none-any.whl.metadata
  Downloading streamlit-1.29.0-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting validators<1,>=0.2 (from streamlit)
  Obtaining dependency information for validators<1,>=0.2 from https://files.pythonhosted.org/packages/3a/0c/785d317eea99c3739821718f118c70537639aa43f96bfa1d83a71f68eaf6/validators-0.22.0-py3-none-any.whl.metadata
  Downloading validators-0.22.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━

In [23]:
import streamlit as st
import types
from transformers import FlaubertTokenizer, TFFlaubertForSequenceClassification
import tensorflow as tf
import numpy as np

# Custom hash function to bypass hashing of the load_model function
def bypass_hashing(func):
    return 0

# Function to load the FlauBERT model
@st.cache(allow_output_mutation=True, hash_funcs={types.FunctionType: bypass_hashing})
def load_model():
    model = TFFlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6, from_pt=True)
    return model

# Function to encode text for FlauBERT
def encode_text(text, tokenizer, max_length=128):
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

# Load FlauBERT tokenizer
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Load the model
model = load_model()

# Streamlit interface
st.title('French Text Difficulty Predictor')
user_input = st.text_area("Enter a sentence in French", "")

if st.button('Predict Difficulty'):
    input_ids, attention_masks = encode_text(user_input, tokenizer)
    predictions = model.predict([input_ids, attention_masks])
    difficulty_level = np.argmax(predictions.logits, axis=1)[0]

    # Mapping the prediction to difficulty level
    levels = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
    predicted_level = levels[difficulty_level]

    st.write(f"The predicted difficulty level is: {predicted_level}")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFFlaubertForSequenceClassification: ['pred_layer.proj.bias', 'pred_layer.proj.weight']
- This IS expected if you are initializing TFFlaubertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFFlaubertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFFlaubertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2023

In [26]:
# Save the trained model
model_save_path = "/Users/mac/Desktop/KAGGLE Competition"
model.save_pretrained(model_save_path)

In [28]:
model_save_path = "/Users/mac/Desktop/my_flauBERT_model"
model.save(model_save_path, save_format="tf")


In [29]:
model_save_path = "./my_flauBERT_model"
model.save(model_save_path, save_format="tf")


In [30]:
!zip -r my_flauBERT_model.zip my_flauBERT_model


  adding: my_flauBERT_model/ (stored 0%)
  adding: my_flauBERT_model/keras_metadata.pb (deflated 95%)
  adding: my_flauBERT_model/saved_model.pb (deflated 92%)
  adding: my_flauBERT_model/assets/ (stored 0%)
  adding: my_flauBERT_model/variables/ (stored 0%)
  adding: my_flauBERT_model/variables/variables.index (deflated 77%)
  adding: my_flauBERT_model/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: my_flauBERT_model/fingerprint.pb (stored 0%)


In [31]:
from IPython.display import FileLink
FileLink(r'my_flauBERT_model.zip')
