In [1]:
# !pip install --force-reinstall numpy==1.23.5 scipy==1.10.1 gensim==4.3.1

import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [4]:
# Download pretrained French FastText embeddings
!wget -O cc.fr.300.vec.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz

# Load the vectors
# fasttext = KeyedVectors.load_word2vec_format("cc.fr.300.vec.gz", binary=False)
fasttext = KeyedVectors.load_word2vec_format("cc.fr.300.vec.gz", binary=False, limit=500_000)

# Check
print("Vector size:", fasttext.vector_size)
print("Sample vector for 'Paris':", fasttext["Paris"][:10])

--2025-05-03 21:18:24--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.173.166.74, 18.173.166.51, 18.173.166.31, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.173.166.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1287757366 (1.2G) [binary/octet-stream]
Saving to: ‘cc.fr.300.vec.gz’


2025-05-03 21:18:39 (82.9 MB/s) - ‘cc.fr.300.vec.gz’ saved [1287757366/1287757366]

Vector size: 300
Sample vector for 'Paris': [ 0.0349 -0.0034  0.0415 -0.0239  0.0604  0.0683  0.0352 -0.0098  0.0651
 -0.016 ]


In [5]:
# Load dataset (you can modify these paths)
train_df = pd.read_csv("/content/HIPE-2022-v2.1-letemps-train-fr.tsv",
                       sep="\t",
                       comment="#",
                       quoting=3,
                       names=[
    "TOKEN", "NE-COARSE-LIT", "NE-COARSE-METO", "NE-FINE-LIT", "NE-FINE-METO", "NE-FINE-COMP", "NE-NESTED", "NEL-LIT", "NEL-METO", "MISC"
])
test_df = pd.read_csv("/content/HIPE-2022-v2.1-letemps-test-fr.tsv",
                      sep="\t",
                      comment="#",
                      quoting=3,
                      names=[
    "TOKEN", "NE-COARSE-LIT", "NE-COARSE-METO", "NE-FINE-LIT", "NE-FINE-METO", "NE-FINE-COMP", "NE-NESTED", "NEL-LIT", "NEL-METO", "MISC"
])

# Drop empty tokens
train_df = train_df.dropna(subset=["TOKEN", "NE-COARSE-LIT"])
test_df = test_df.dropna(subset=["TOKEN", "NE-COARSE-LIT"])


In [6]:
def tokens_to_vectors(df, fasttext_model):
    vectors, labels = [], []
    unk_vector = np.zeros(fasttext_model.vector_size)

    for _, row in tqdm(df.iterrows(), total=len(df)):
        token = row["TOKEN"]
        label = row["NE-COARSE-LIT"]
        vec = fasttext_model[token] if token in fasttext_model else unk_vector
        vectors.append(vec)
        labels.append(label)

    return np.array(vectors), np.array(labels)

X_train, y_train = tokens_to_vectors(train_df, fasttext)
X_test, y_test = tokens_to_vectors(test_df, fasttext)

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

100%|██████████| 379481/379481 [00:25<00:00, 14763.57it/s]
100%|██████████| 48469/48469 [00:02<00:00, 20277.57it/s]


In [7]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train_enc)
y_pred = clf.predict(X_test)

# Decode labels
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test_enc)

print("Classification Report (Token-Level):")
print(classification_report(y_test_labels, y_pred_labels))

Classification Report (Token-Level):
               precision    recall  f1-score   support

        B-loc       0.52      0.58      0.55       591
        B-org       0.00      0.00      0.00        79
       B-pers       0.52      0.40      0.46       347
        I-loc       0.56      0.03      0.06       151
        I-org       0.00      0.00      0.00       130
       I-pers       0.26      0.07      0.11       428
NE-COARSE-LIT       0.00      0.00      0.00         1
            O       0.98      0.99      0.98     46742

     accuracy                           0.97     48469
    macro avg       0.35      0.26      0.27     48469
 weighted avg       0.96      0.97      0.96     48469



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
