# Clustering et word2vec

Sur la base des éléments méthodologiques et des enseignements techniques présentés lors du cours théorique, il est demandé dans le cadre de ce TP :
- d’effectuer un clustering des bulletins pour une décennie au choix et d’interpréter les résultats
- d’entraîner un modèle word2vec sur l’ensemble des bulletins et d’explorer les relations entre vecteurs

Pour ce faire, vous utiliserez différentes librairies Python vues au cours comme scikit-learn et gensim.

## 1. Clustering

## 2. Word2Vec

#### Librairies nécessaires

In [1]:
import sys

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import wordpunct_tokenize
from unidecode import unidecode

# stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Fichier à nettoyer

In [2]:
infile = f"../data/sents_2.txt"

#### Nettoyage du fichier

In [3]:
sw = stopwords.words("french")
sw += ["les", "plus", "cette", "fait", "faire", "être", "deux", "comme", "dont", "tout", 
       "ils", "bien", "sans", "peut", "tous", "après", "ainsi", "donc", "cet", "sous",
       "celle", "entre", "encore", "toutes", "pendant", "moins", "dire", "cela", "non",
       "faut", "trois", "aussi", "dit", "avoir", "doit", "contre", "depuis", "autres",
       "van", "het", "autre", "jusqu"]

In [4]:
def clean_text(my_file):
    output_path = f"../data/sents_2_clean.txt"
    output = open(output_path, "w", encoding='utf-8')
    with open(my_file, encoding='utf-8') as f:
        text = f.read()
        kword = nltk.wordpunct_tokenize(text)
        kept = [w.lower() for w in kword if len(w) > 2 and w.lower() not in sw]
        kept_string = " ".join(kept)
        output.write(kept_string)
    return f'Output has been written in {output_path}!'

In [5]:
clean_text(infile)

'Output has been written in ../data/sents_2_clean.txt!'

In [6]:
cleaned_infile = f"../data/sents_2_clean.txt"

#### Chargement des phrases

In [7]:
class MySentences(object):
    """Tokenize and Lemmatize sentences"""
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, encoding='utf-8', errors="backslashreplace"):
            yield [unidecode(w.lower()) for w in wordpunct_tokenize(line)]

In [8]:
sentences = MySentences(cleaned_infile)

#### Création des bigrammes

In [9]:
bigram_phrases = Phrases(sentences)

In [10]:
len(bigram_phrases.vocab.keys())

10949523

In [11]:
# exploration des bigrammes

key_ = list(bigram_phrases.vocab.keys())[145]
print(key_)

presidence_chevalier


In [12]:
# score de la clé

bigram_phrases.vocab[key_]

27

In [13]:
print(list(bigram_phrases.vocab.keys())[:100])

['bruxelles', 'bulletin', 'bruxelles_bulletin', 'ires', 'bulletin_ires', '8eanas', 'ires_8eanas', 'conseil', '8eanas_conseil', 'communal', 'conseil_communal', 'annee', 'communal_annee', '1847', 'annee_1847', 'ville', '1847_ville', 'ville_bulletin', 'bulletin_conseil', 'aes', 'conseil_aes', 'seances', 'aes_seances', 'seances_communal', '1847_bruxelles', 'imprimerie', 'bruxelles_imprimerie', 'rite', 'imprimerie_rite', 'faubourg', 'rite_faubourg', 'consei', 'faubourg_consei', 'dibi', 'consei_dibi', 'communication', 'dibi_communication', 'conclusions', 'communication_conclusions', 'section', 'conclusions_section', 'nouvel', 'section_nouvel', 'hospice', 'nouvel_hospice', 'enraisonde', 'hospice_enraisonde', 'absence', 'enraisonde_absence', 'maladie', 'absence_maladie', 'maladie_conseil', 'ajourne', 'conseil_ajourne', 'leurs', 'ajourne_leurs', 'pierre', 'leurs_pierre', 'marchai', 'pierre_marchai', 'cles', 'marchai_cles', 'taxes', 'cles_taxes', 'communale', 'taxes_communale', 'bieniaance', 'co

In [14]:
%time bigram_phrases[sentences]

CPU times: user 1min 36s, sys: 2.13 s, total: 1min 38s
Wall time: 1min 43s


<gensim.interfaces.TransformedCorpus at 0x7fab4d361220>

In [15]:
bigram_phraser = Phraser(phrases_model=bigram_phrases)

In [16]:
%time bigram_phraser[sentences]

CPU times: user 1min 26s, sys: 1.5 s, total: 1min 28s
Wall time: 1min 32s


<gensim.interfaces.TransformedCorpus at 0x7fab4d310bb0>

In [17]:
trigram_phrases = Phrases(bigram_phraser[sentences])

In [18]:
trigram_phraser = Phraser(phrases_model=trigram_phrases)

In [19]:
corpus = list(trigram_phraser[bigram_phraser[sentences]])

In [20]:
# print(corpus[:10])

In [21]:
%%time
model = Word2Vec(
    corpus, # On passe le corpus de ngrams que nous venons de créer
    vector_size=32, # Le nombre de dimensions dans lesquelles le contexte des mots devra être réduit, aka. vector_size
    window=5, # La taille du "contexte", ici 5 mots avant et après le mot observé
    min_count=1, # On ignore les mots qui n'apparaissent pas au moins 5 fois dans le corpus
    workers=4, # Permet de paralléliser l'entraînement du modèle en 4 threads
    epochs=10 # Nombre d'itérations du réseau de neurones sur le jeu de données pour ajuster les paramètres avec la descende de gradient, aka. epochs.
)

CPU times: user 35.9 s, sys: 115 ms, total: 36 s
Wall time: 36.1 s


In [22]:
outfile = f"../data/bulletins_tp3_1.model"
model.save(outfile)

In [23]:
model = Word2Vec.load("../data/bulletins_tp3_1.model")

In [24]:
model.wv["bruxelles"]

array([-0.02685983, -0.02613496,  0.00242372,  0.02246045, -0.03219561,
        0.0117404 ,  0.01863977,  0.02072185,  0.02343076, -0.02209696,
        0.00329831,  0.02190949, -0.00977082, -0.01946867, -0.00266382,
       -0.02664573, -0.01793599,  0.02410629,  0.00915539,  0.02684479,
        0.02541147,  0.02583335, -0.00961438, -0.0023722 ,  0.0096348 ,
       -0.01583729,  0.02813287, -0.03157771,  0.02130563,  0.00867117,
       -0.01764636,  0.01351591], dtype=float32)

In [25]:
model.wv["boucher"]

array([ 0.02845549, -0.01146269, -0.00705318, -0.02275989,  0.02346707,
        0.00807211, -0.00527942,  0.01458482, -0.00746811, -0.00689599,
        0.02411399,  0.02759365, -0.02514727,  0.00704173,  0.00899752,
       -0.02838369,  0.02851159, -0.02181134,  0.02791412, -0.02825055,
        0.0025609 ,  0.0299324 ,  0.02091913, -0.01311754, -0.01199547,
        0.02737022,  0.02494899, -0.02746011, -0.01281452,  0.01520414,
       -0.01632109, -0.01131507], dtype=float32)

In [26]:
model.wv.similarity("boucher", "boulanger")

0.16438036

In [27]:
model.wv.most_similar("bruxelles", topn=10)

[('songe_nullement', 0.7396537661552429),
 ('supplementaire_chemin_fer', 0.7291504144668579),
 ('136_438', 0.7219255566596985),
 ('402', 0.719338595867157),
 ('733quater', 0.702070415019989),
 ('blb', 0.7007202506065369),
 ('495_001', 0.6926709413528442),
 ('perpetuei', 0.6871772408485413),
 ('jansmolenbeek', 0.6864557862281799),
 ('cojnmen', 0.6861243844032288)]

In [28]:
print(model.wv.most_similar(positive=['bruxelles', 'france'], negative=['belgique']))

[('beslissim', 0.7252672910690308), ('rachetat', 0.7238172888755798), ('tabellion', 0.7132728695869446), ('tnai', 0.7096638083457947), ('15615', 0.7095273733139038), ('marionnettentheater', 0.6992092132568359), ('takijke', 0.693440854549408), ('avertisseurs_colonnesaffiches', 0.6797021627426147), ('treffen_tegen_eventuele_bederving', 0.6770004034042358), ('\\-[', 0.6718602180480957)]


In [29]:
print(model.wv.most_similar(positive=['homme', 'roi'], negative=['femme']))

[('rooilij', 0.7063876390457153), ('september_1949', 0.70578932762146), ('bodenbroeck_boduognat', 0.7005714178085327), ('depoltcr', 0.6809656023979187), ('arcbilectonique', 0.668643593788147), ('henri_joseph_danco', 0.6676032543182373), ('maasdam', 0.6639767289161682), ('point_important', 0.6615000367164612), ('rlabites', 0.6591097712516785), ('tenain', 0.6546366214752197)]


In [30]:
print(model.wv.most_similar(positive=['homme', 'bourgmestre'], negative=['femme']))

[('liste_candidats_presentee', 0.7077802419662476), ('marques_generales_assentiment', 0.7053357362747192), ('primaire_charles_buis', 0.6754471063613892), ('desirsdes', 0.6651195287704468), ('paasche', 0.6649349331855774), ('communaoi', 0.6634657979011536), ('aan_weerskanten_gezaagde', 0.6630767583847046), ('calligraphe', 0.6605660915374756), ('34755', 0.6604280471801758), ('films_televisieuitzendingen', 0.6579362154006958)]


In [31]:
print(model.wv.most_similar(positive=['bourgmestre', 'bruxelles'], negative=['echevin']))

[('premier_etablissement_elevaient', 0.7212549448013306), ('placemient', 0.6985436677932739), ('marieanne', 0.6888855695724487), ('pourra_introduite', 0.6842411160469055), ('certaines_oeuvres', 0.6720647215843201), ('plaintes_produites', 0.6712937355041504), ('lie_phil', 0.6712707877159119), ('120137', 0.670115053653717), ('ecole_moyenne_rue_louvain', 0.6694977283477783), ('fiacres_automobiles_reglement', 0.6671410202980042)]


In [32]:
print(model.wv.most_similar(positive=['bruxelles', 'paris'], negative=['belgique']))

[('440s', 0.7245204448699951), ('bespoedigd_worden', 0.6799979209899902), ('82159', 0.6735992431640625), ('bekomen_resultaten_bij_ontvangsten', 0.6733414530754089), ('preferant', 0.6703314185142517), ('percues_locations', 0.6633291840553284), ('auraifpas', 0.6547234058380127), ('schaerbetk', 0.6533769369125366), ('parfaire_formation', 0.6508330702781677), ('250145', 0.6505926251411438)]


In [33]:
print(model.wv.most_similar(positive=['homme', 'fidelite'], negative=['femme']))

[('contrarie', 0.72808837890625), ('fcirtaiitornatiqt', 0.7042990326881409), ('mic1iel', 0.6951599717140198), ('72r', 0.6911953687667847), ('volumes_eau_consommes', 0.686896026134491), ('tochuit', 0.6828369498252869), ('aoite', 0.6800817251205444), ('mauchard', 0.6769009232521057), ('46982774', 0.667982280254364), ('blanche_institutrice', 0.6672805547714233)]


In [34]:
print(model.wv.most_similar(positive=['femme', 'fidelite'], negative=['homme']))

[('plaatser', 0.7989534139633179), ('een_maandelijks', 0.7004958391189575), ('trouve_regrettable', 0.6930721998214722), ('^),', 0.6926151514053345), ('66565', 0.6836583614349365), ('seaaauanaa', 0.6823300719261169), ('rejctc', 0.6819170117378235), ('aedificandi_cas', 0.6812053322792053), ('rechtsomkeer', 0.6806226968765259), ('visqueux', 0.68030846118927)]


In [35]:
print(model.wv.most_similar(positive=['femme', 'homme'], negative=['fidelite']))

[('savait_quelles', 0.7003591060638428), ('comportait', 0.6985844373703003), ('cal51', 0.6946401596069336), ('plantes_graines', 0.6838655471801758), ('243370', 0.6784873008728027), ('297921_297991', 0.6783518195152283), ('reseau_reseau', 0.6728825569152832), ('universitatis', 0.6690184473991394), ('existerait_embarras', 0.6676788330078125), ('besluit_heer', 0.6674354672431946)]
