# Imports

In [1]:
import pandas as pd 
import numpy as np
import re

  from pandas.core import (


## Databases

In [2]:
# Import the train data
train = pd.read_csv('https://github.com/tcastrom/CEFR-French-/raw/main/Data/training_data.csv')
train.set_index('id', inplace=True)
display(train.head())

#Import the unlabel data
unlabel = pd.read_csv('https://github.com/tcastrom/CEFR-French-/raw/main/Data/unlabelled_test_data.csv')
unlabel.set_index('id', inplace=True)
display(unlabel.head())

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1


Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,Nous dûmes nous excuser des propos que nous eû...
1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,"Et, paradoxalement, boire froid n'est pas la b..."
3,"Ce n'est pas étonnant, car c'est une saison my..."
4,"Le corps de Golo lui-même, d'une essence aussi..."


# Readability metrics in train

In [3]:
# import it and use it
import textstat as txt
# put it in french
txt.set_lang('fr')

In [4]:
# Create the flesh reading score for the training data 
train['flesch_reading_ease'] = train['sentence'].apply(txt.flesch_reading_ease)
# Create the polysyllable count for the train data
train['polysyllable_count'] = train['sentence'].apply(lambda x: txt.polysyllabcount(x))
# Create the coleman liau index for the train data
train['coleman_liau_index'] = train['sentence'].apply(txt.coleman_liau_index)
display(train.head())

Unnamed: 0_level_0,sentence,difficulty,flesch_reading_ease,polysyllable_count,coleman_liau_index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1,43.31,9,15.39
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,99.14,1,4.57
2,Le test de niveau en français est sur le site ...,A1,90.77,1,5.03
3,Est-ce que ton mari est aussi de Boston?,A1,95.84,0,2.86
4,"Dans les écoles de commerce, dans les couloirs...",B1,69.45,2,11.79


# Readability metrics in unlabel

In [5]:
# Create the flesh reading score for the unlabel data
unlabel['flesch_reading_ease'] = unlabel['sentence'].apply(txt.flesch_reading_ease)
# Create the polysyllable count for the unlabel data
unlabel['polysyllable_count'] = unlabel['sentence'].apply(lambda x: txt.polysyllabcount(x))
# Create the coleman liau index for the unlabel data
unlabel['coleman_liau_index'] = unlabel['sentence'].apply(txt.coleman_liau_index)
display(unlabel.head())

Unnamed: 0_level_0,sentence,flesch_reading_ease,polysyllable_count,coleman_liau_index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Nous dûmes nous excuser des propos que nous eû...,86.45,2,10.24
1,Vous ne pouvez pas savoir le plaisir que j'ai ...,89.75,1,8.63
2,"Et, paradoxalement, boire froid n'est pas la b...",80.11,1,10.58
3,"Ce n'est pas étonnant, car c'est une saison my...",94.83,1,9.31
4,"Le corps de Golo lui-même, d'une essence aussi...",8.8,12,14.3


# Open Lexicon

In [6]:
lexicon = pd.read_excel('https://github.com/tcastrom/CEFR-French-/raw/main/Data/Lexique382.xlsx')
display(lexicon.head())

Unnamed: 0,ortho,phon,lemme,cgram,genre,nombre,freqlemfilms2,freqlemlivres,freqfilms2,freqlivres,...,orthrenv,phonrenv,orthosyll,cgramortho,deflem,defobs,old20,pld20,morphoder,nbmorph
0,a,a,a,NOM,m,,81.36,58.65,81.36,58.65,...,a,a,a,"NOM,AUX,VER",,,1.0,1.0,a,1
1,a,a,avoir,AUX,,,18559.22,12800.81,6350.91,2926.69,...,a,a,a,"NOM,AUX,VER",,,1.0,1.0,avoir,1
2,a,a,avoir,VER,,,13572.4,6426.49,5498.34,1669.39,...,a,a,a,"NOM,AUX,VER",93.0,16.0,1.0,1.0,avoir,1
3,a capella,akapEla,a capella,ADV,,,0.04,0.07,0.04,0.07,...,allepac a,alEpaka,a ca-pel-la,ADV,,,3.85,2.85,a-capella,2
4,a cappella,akapEla,a cappella,ADV,,,0.04,0.07,0.04,0.07,...,alleppac a,alEpaka,a cap-pel-la,ADV,,,4.6,2.85,a-cappella,2


We are not interested in all the the columns here. Here are the columns we will retain and their meaning: 

- **Mot (ortho)**: La graphie est la forme orthographique du mot (p. ex.chienne) Attention, les mots correspondent
seulement aux mots qui sont apparus au moins une fois dans notre corpus (16 + 50 millions de mots). Il peut
ainsi y avoir des lemmes de certains mots apparus dans le corpus qui ne sont pas listés comme entrées
indépendante car il n’y sont pas apparus en tant que tels (seul le mot dérivé était dans le corpus). Lexique 2
comprenait 129 000 entrées tandis que Lexique 3 en comprenait 135 000 et Lexique 3.5 142 000- 


- **Fréquence du lemme par million selon le corpus de films (freqlemfilms2)** : Elle correspond à la somme des
fréquences des formes fléchies de chaque lemme fournie par notre sélection de films. Ex: freq (arbre) = freq
("arbre") + freq ("arbres")

- **Fréquence du lemme par million selon le corpus de livres (freqlemlivres)** : Elle correspond à la somme des
fréquences des formes fléchies de chaque lemme fournie par notre sélection de livres de Frantext, normalisée par
une division par 14,8 (le corpus original comprenant 14,7 millions d'occurrences).

- **Fréquence par million selon le corpus de films (freqfilms2)** : Elle correspond à la fréquence par million
d'occurrences du mot selon notre corpus de sous-titres. Contrairement à Lexique 2, danse aura deux entrées et
deux fréquences, une pour sa forme nominale (p.ex. la danse) et une pour sa forme verbale (je danse). Attention,
cette fréquence a changé à partir de Lexique 3.40.

- **Fréquence par million selon le corpus de livres (freqlivres)** : Elle correspond à la fréquence par million
d'occurrences du mot selon notre corpus de livres. (14,7 millions de mots).


- **Nombre de lettres (nblettres)**

-  **Nombre de syllabes (nbsyll)**



In [7]:
# Retain only the columns we need 
columns_needed = ['ortho', 'freqlemfilms2', 'freqlemlivres', 'freqfilms2', 'freqlivres', 'nblettres', 'nbsyll']
lexicon = lexicon[columns_needed]

display(lexicon.head())

Unnamed: 0,ortho,freqlemfilms2,freqlemlivres,freqfilms2,freqlivres,nblettres,nbsyll
0,a,81.36,58.65,81.36,58.65,1,1
1,a,18559.22,12800.81,6350.91,2926.69,1,1
2,a,13572.4,6426.49,5498.34,1669.39,1,1
3,a capella,0.04,0.07,0.04,0.07,9,4
4,a cappella,0.04,0.07,0.04,0.07,10,4


In [8]:
# If a multiple lines have the same ortho, we will keep the ones that have a value in deflem and if there are multiple lines with the same ortho and have a deflem, we will keep the one with the highest freqfilms2
# Sort by 'ortho' and 'freqlivres2' (highest first)
lexicon = lexicon.sort_values(by=['ortho', 'freqlivres'], ascending=[True, False])

# Drop duplicates, keeping the first (which has the highest 'freqlivres2')
lexicon = lexicon.drop_duplicates(subset=['ortho'], keep='first')

# Display the resulting DataFrame
display(lexicon)

# Display the number of unique and duplicate values in the 'ortho' column
ortho_unique = lexicon['ortho'].nunique()
ortho_duplicate = lexicon.shape[0] - ortho_unique
print(f'The number of unique values in the ortho column is {ortho_unique}')
print(f'The number of duplicate values in the ortho column is {ortho_duplicate}')

# Display the percentage of missing values in the lexicon
missing_values = lexicon.isnull().sum() / len(lexicon) * 100
print(missing_values)

Unnamed: 0,ortho,freqlemfilms2,freqlemlivres,freqfilms2,freqlivres,nblettres,nbsyll
1,a,18559.22,12800.81,6350.91,2926.69,1,1
3,a capella,0.04,0.07,0.04,0.07,9,4
4,a cappella,0.04,0.07,0.04,0.07,10,4
5,a contrario,0.00,0.27,0.00,0.27,11,4
6,a fortiori,0.04,0.88,0.04,0.88,10,4
...,...,...,...,...,...,...,...
142686,ôté,16.81,42.03,3.18,5.47,3,2
142688,ôtée,16.81,42.03,0.42,0.54,4,2
142690,ôtées,16.81,42.03,0.16,0.07,5,2
142692,ôtés,16.81,42.03,0.04,0.14,4,2


The number of unique values in the ortho column is 125652
The number of duplicate values in the ortho column is 1
ortho            0.000796
freqlemfilms2    0.000000
freqlemlivres    0.000000
freqfilms2       0.000000
freqlivres       0.000000
nblettres        0.000000
nbsyll           0.000000
dtype: float64


In [9]:
# Drop all the lines with missing values
lexicon = lexicon.dropna()

# Display the percentage of missing values in the lexicon
missing_values = lexicon.isnull().sum() / len(lexicon) * 100
print(missing_values)

# Display the number of lines 
print(f'The number of lines in the lexicon is {lexicon.shape[0]}')

# Display the number of unique and duplicate values in the 'ortho' column
ortho_unique = lexicon['ortho'].nunique()
ortho_duplicate = lexicon.shape[0] - ortho_unique
print(f'The number of unique values in the ortho column is {ortho_unique}')
print(f'The number of duplicate values in the ortho column is {ortho_duplicate}')



ortho            0.0
freqlemfilms2    0.0
freqlemlivres    0.0
freqfilms2       0.0
freqlivres       0.0
nblettres        0.0
nbsyll           0.0
dtype: float64
The number of lines in the lexicon is 125652
The number of unique values in the ortho column is 125652
The number of duplicate values in the ortho column is 0


In [10]:
display(lexicon)

Unnamed: 0,ortho,freqlemfilms2,freqlemlivres,freqfilms2,freqlivres,nblettres,nbsyll
1,a,18559.22,12800.81,6350.91,2926.69,1,1
3,a capella,0.04,0.07,0.04,0.07,9,4
4,a cappella,0.04,0.07,0.04,0.07,10,4
5,a contrario,0.00,0.27,0.00,0.27,11,4
6,a fortiori,0.04,0.88,0.04,0.88,10,4
...,...,...,...,...,...,...,...
142685,ôtèrent,16.81,42.03,0.00,0.27,7,2
142686,ôté,16.81,42.03,3.18,5.47,3,2
142688,ôtée,16.81,42.03,0.42,0.54,4,2
142690,ôtées,16.81,42.03,0.16,0.07,5,2


# Average per sentence in Training set

In [11]:

# Function to tokenize sentences into words
def tokenize(sentence):
    # Use regex to split on spaces, punctuation, and handle contractions
    tokens = re.findall(r"\b\w+\b|[']\w+", sentence.lower())
    return tokens

# Step 2: Match these words with the lexicon dataset to retrieve their statistics
# Creating a dictionary from the lexicon for quick lookup
lexicon_dict = lexicon.set_index('ortho').T.to_dict()

def get_lexicon_stats(word):
    return lexicon_dict.get(word, {})

# Step 3: Compute the required statistical measures for each lexicon measure
def compute_statistics(words):
    stats = {
        'freqlemfilms2': [],
        'freqlemlivres': [],
        'freqfilms2': [],
        'nblettres': [],
        'nbsyll': [],
    }
    
    for word in words:
        word_stats = get_lexicon_stats(word)
        for key in stats.keys():
            if key in word_stats:
                stats[key].append(word_stats[key])
    
    # Computing the required statistical measures
    result = {}
    for key, values in stats.items():
        if values:  # Check if there are any values to compute statistics on
            result[f'{key}_mean'] = np.mean(values)
            result[f'{key}_median'] = np.median(values)
            result[f'{key}_max'] = np.max(values)
            result[f'{key}_min'] = np.min(values)
            result[f'{key}_25%'] = np.percentile(values, 25)
            result[f'{key}_75%'] = np.percentile(values, 75)
        else:
            result[f'{key}_mean'] = np.nan
            result[f'{key}_median'] = np.nan
            result[f'{key}_max'] = np.nan
            result[f'{key}_min'] = np.nan
            result[f'{key}_25%'] = np.nan
            result[f'{key}_75%'] = np.nan
    
    return result


In [12]:
# Step 1: Break down each sentence into individual words using the tokenizer
train['words'] = train['sentence'].apply(tokenize)

# Applying the function to each row in the train dataset
lexicon_measures = train['words'].apply(compute_statistics)

# Convert the list of dictionaries into a DataFrame
lexicon_measures_df = pd.DataFrame(lexicon_measures.tolist())

# Step 4: Add these computed measures as new columns to the train dataset
train_concat = pd.concat([train, lexicon_measures_df], axis=1)

# Drop the 'words' column as it is no longer needed
train_concat = train_concat.drop(columns='words')

# Display the updated train dataframe
display(train_concat.head())

Unnamed: 0,sentence,difficulty,flesch_reading_ease,polysyllable_count,coleman_liau_index,freqlemfilms2_mean,freqlemfilms2_median,freqlemfilms2_max,freqlemfilms2_min,freqlemfilms2_25%,...,nblettres_max,nblettres_min,nblettres_25%,nblettres_75%,nbsyll_mean,nbsyll_median,nbsyll_max,nbsyll_min,nbsyll_25%,nbsyll_75%
0,Les coûts kilométriques réels peuvent diverger...,C1,43.31,9,15.39,5083.383947,86.65,25220.86,0.04,13.93,...,13.0,1.0,2.0,7.0,1.763158,1.0,4.0,1.0,1.0,2.0
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,99.14,1,4.57,6604.464167,1214.575,25983.2,8.21,46.2975,...,8.0,1.0,2.0,4.0,1.25,1.0,3.0,1.0,1.0,1.0
2,Le test de niveau en français est sur le site ...,A1,90.77,1,5.03,9111.054615,2520.11,32236.5,5.61,50.7,...,8.0,1.0,2.0,4.0,1.307692,1.0,3.0,1.0,1.0,1.0
3,Est-ce que ton mari est aussi de Boston?,A1,95.84,0,2.86,11378.233333,4100.9,32236.5,0.44,1402.33,...,6.0,2.0,3.0,4.0,1.333333,1.0,2.0,1.0,1.0,2.0
4,"Dans les écoles de commerce, dans les couloirs...",B1,69.45,2,11.79,6372.454516,1252.42,25220.86,2.02,54.68,...,11.0,1.0,2.0,6.0,1.354839,1.0,3.0,1.0,1.0,2.0


In [13]:
# Number of missing values in the new columns
missing_values = train_concat.isnull().sum()
print(missing_values.sum())

0


# Average per Sentence unlabel

In [14]:
# Step 1: Break down each sentence into individual words using the tokenizer
unlabel['words'] = unlabel['sentence'].apply(tokenize)

# Applying the function to each row in the unlabel dataset
lexicon_measures = unlabel['words'].apply(compute_statistics)

# Convert the list of dictionaries into a DataFrame
lexicon_measures_df = pd.DataFrame(lexicon_measures.tolist())

# Step 4: Add these computed measures as new columns to the unlabel dataset
unlabel_concat = pd.concat([unlabel, lexicon_measures_df], axis=1)

# Drop the 'words' column as it is no longer needed
unlabel_concat = unlabel_concat.drop(columns='words')

# Display the updated unlabel dataframe
display(unlabel_concat.head())

Unnamed: 0,sentence,flesch_reading_ease,polysyllable_count,coleman_liau_index,freqlemfilms2_mean,freqlemfilms2_median,freqlemfilms2_max,freqlemfilms2_min,freqlemfilms2_25%,freqlemfilms2_75%,...,nblettres_max,nblettres_min,nblettres_25%,nblettres_75%,nbsyll_mean,nbsyll_median,nbsyll_max,nbsyll_min,nbsyll_25%,nbsyll_75%
0,Nous dûmes nous excuser des propos que nous eû...,86.45,2,10.24,4181.733,4436.51,13572.4,24.99,1107.535,4772.12,...,9.0,3.0,4.0,5.75,1.5,1.0,3.0,1.0,1.0,1.75
1,Vous ne pouvez pas savoir le plaisir que j'ai ...,89.75,1,8.63,7940.903571,4308.805,25220.86,64.31,533.1575,13636.995,...,8.0,1.0,2.25,6.0,1.428571,1.0,3.0,1.0,1.0,2.0
2,"Et, paradoxalement, boire froid n'est pas la b...",80.11,1,10.58,5342.156667,339.05,18188.15,0.22,9.67,12909.08,...,14.0,1.0,2.0,5.0,1.666667,1.0,6.0,1.0,1.0,1.0
3,"Ce n'est pas étonnant, car c'est une saison my...",94.83,1,9.31,3515.732222,31.95,18188.15,8.21,19.61,5219.1,...,11.0,1.0,2.0,6.0,1.555556,1.0,3.0,1.0,1.0,2.0
4,"Le corps de Golo lui-même, d'une essence aussi...",8.8,12,14.3,5236.215139,1174.325,32236.5,0.01,38.31,5559.81,...,14.0,1.0,2.0,6.25,1.555556,1.0,5.0,1.0,1.0,2.0


In [15]:
# Number of missing values in the new columns
missing_values = unlabel_concat.isnull().sum()
print(missing_values.sum())

0


# Export

In [16]:
# Name the index column 'id'
train_concat.index.name = 'id'
unlabel_concat.index.name = 'id'

# Drop the column sentence
train_concat = train_concat.drop(columns='sentence')
unlabel_concat = unlabel_concat.drop(columns='sentence')

# Save the train_concat and unlabel_concat dataframes to CSV files
train_concat.to_csv('train_metrics_only.csv')
unlabel_concat.to_csv('unlabel_metrics_only.csv')
