Use the pretrained KMeans model to assign a cluster label to every sentence in the French sentences dataset.

In [2]:
import os
import re
import math
import numpy as np
import pandas as pd 
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

[nltk_data] Downloading package stopwords to C:\Users\Toby
[nltk_data]     Usher\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Step 1: Load pretrained model and dataset

In [3]:
# Deserialize the pretrained clustering model from disk.
# NOTE: unpickling can execute arbitrary code, so only load a kmeans.pkl from a trusted source.
with open('kmeans.pkl', mode='rb') as model_file:
    kmeans = pickle.load(model_file)

In [4]:
# The dataset sits one directory above this notebook.
filepath = os.path.join("..", "french_sentences.csv")

# The file is tab-separated and has no header row, so the columns are named after loading.
column_names = ["id", "sentence"]
df = pd.read_csv(filepath, sep='\t', header=None)
df.columns = column_names


  df = pd.read_csv(filepath, delimiter='\t', header=None)


Step 2: Preprocess sentences

The sentences need to be transformed into the same feature space that was used to train the k-means model. To do this we apply the same preprocessing (stop-word removal, stemming and TF-IDF vectorisation) that was used during training. Details can be found in the k-means clustering training notebook.

In [5]:
# Stop words are very common filler words (e.g. 'de', 'la') that carry almost no information but
# occur in most documents. Left in, they can dominate a document's representation and hurt
# TF-IDF, so they are collected here in order to be removed before the vectors are computed.

# Punctuation marks that are treated as stop words too
punc = list('.,"\'?!:;()[]{}%')

# French stop words shipped with NLTK
french_stop_words = stopwords.words('french')

# Merge both sources through a set so that duplicates are dropped, then convert back to a list,
# which is the form handed to the vectorizer later in the notebook
stop_words = list(set(french_stop_words).union(punc))

print(stop_words)

['(', 'vous', 'par', 'aurions', 'la', 'des', 'nos', 'eussions', "'", 'serions', 'soit', 'se', 'étions', 'n', ':', 'fussent', 'pour', ']', 'son', 'aie', 'mon', 'fusses', 'nous', 'ayantes', 'm', 'soyez', 'fusse', 'ayants', 'eus', 'étais', ',', 'fussions', '"', 'auraient', 'mes', 'avions', 'et', 'aux', 'aient', ')', 'ayant', 'aurai', 'sois', 'sera', 'seras', 'aviez', 'du', 'aurez', 'avons', 'auriez', 'aurais', 'étées', 'on', 'eûmes', 'ai', 'sont', 'je', 'fûmes', 'elle', 'sa', 'ne', 'ces', 'ayante', ';', 'avec', '!', 'êtes', 'd', 'eûtes', 'eue', '}', 'sur', 'l', 'j', 'me', 'que', 'étants', 'notre', '.', 'furent', 'avait', 'qu', 't', 'ton', 'étantes', 'étiez', 'aurons', 'ait', 'eut', 'ont', 'serait', 'fûtes', 'toi', 'il', 'soyons', 'un', 'fussiez', 'eux', 'à', 'serons', 'leur', 'au', 'étaient', 'moi', 'même', 'fût', 'étée', 'eues', 'ma', 'tes', 'auront', '?', 'serai', 'es', 'est', 'de', 'été', '{', 'avez', 'seraient', '%', 'ses', 'une', 'seront', 'fus', 'était', 'eurent', 'ayez', 'c', 'te',

In [11]:
# Pull the raw sentence texts out of the dataframe as an array
sentences = df.loc[:, 'sentence'].values

In [12]:
# French Snowball stemmer applied to every token
stemmer = SnowballStemmer('french')
# Tokens are runs of ASCII letters and apostrophes. Accented letters are not matched by this
# pattern, so they act as separators; the pattern is left as is because it presumably has to
# match the one used when the model was trained - TODO confirm against the training notebook.
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def _tokenize(text: str) -> list[str]:
    """Lower-case a document, split it into tokens and return the stem of each token.

    Args:
        text: The raw document (a single sentence in this notebook).

    Returns:
        The stemmed tokens, in the order in which they appear in the document.
    """
    lowered = text.lower()
    stems = []
    for word in tokenizer.tokenize(lowered):
        stems.append(stemmer.stem(word))
    return stems

Step 3: Predict a cluster for each sentence and save the dataframe as a CSV file

In [13]:
# Build TF-IDF features with the stop-word list, the stemming tokenizer and a 10000-term
# vocabulary cap, then assign each sentence to its nearest pretrained KMeans centroid.
# NOTE(review): the vectorizer is re-fitted here, so its vocabulary and IDF weights are derived
# from `sentences`. The columns only line up with the KMeans centroids if this is the same corpus
# (and the same settings) the model was trained on; otherwise the fitted training vectorizer
# should be loaded and used with `.transform` instead - TODO confirm.
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    tokenizer=_tokenize,
    token_pattern=None,  # ignored when a custom tokenizer is given; None avoids sklearn's warning
    max_features=10000,  # limit to 10000 most frequent terms in the corpus
)
X = vectorizer.fit_transform(sentences)

# Fail early, with an actionable message, if the feature space cannot match the model's.
n_model_features = kmeans.cluster_centers_.shape[1]
if X.shape[1] != n_model_features:
    raise ValueError(
        f"TF-IDF produced {X.shape[1]} features but the KMeans model expects "
        f"{n_model_features}; the vectorizer does not match the one used for training."
    )

df['cluster'] = kmeans.predict(X)



In [14]:
# Write the labelled dataframe to disk; the dataframe index is not included in the file.
output_path = 'french_sentences_with_cluster_labels.csv'
df.to_csv(output_path, index=False)