# Clustering et word2vec

Sur la base des éléments méthodologiques et des enseignements techniques présentés lors du cours théorique, il est demandé dans le cadre de ce TP :
- d’effectuer un clustering des bulletins pour une décennie au choix et d’interpréter les résultats
- d’entraîner un modèle word2vec sur l’ensemble des bulletins et d’explorer les relations entre vecteurs

Pour ce faire, vous utiliserez différentes librairies Python vues au cours comme scikit-learn et gensim.

## 1. Clustering

## 2. Word2Vec

#### Librairies nécessaires

In [1]:
import sys

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import wordpunct_tokenize
from unidecode import unidecode

# stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Fichier à nettoyer

In [2]:
infile = f"../data/sents_2.txt"

#### Nettoyage du fichier

In [3]:
sw = stopwords.words("french")
sw += ["les", "plus", "cette", "fait", "faire", "être", "deux", "comme", "dont", "tout", 
       "ils", "bien", "sans", "peut", "tous", "après", "ainsi", "donc", "cet", "sous",
       "celle", "entre", "encore", "toutes", "pendant", "moins", "dire", "cela", "non",
       "faut", "trois", "aussi", "dit", "avoir", "doit", "contre", "depuis", "autres",
       "van", "het", "autre", "jusqu"]

In [4]:
def clean_text(my_file):
    output_path = f"../data/sents_2_clean.txt"
    output = open(output_path, "w", encoding='utf-8')
    with open(my_file, encoding='utf-8') as f:
        text = f.read()
        kword = nltk.wordpunct_tokenize(text)
        kept = [w.lower() for w in kword if len(w) > 2 and w.isalpha() and w.lower() not in sw]
        kept_string = " ".join(kept)
        output.write(kept_string)
    return f'Output has been written in {output_path}!'

In [5]:
clean_text(infile)

'Output has been written in ../data/sents_2_clean.txt!'

In [6]:
cleaned_infile = f"../data/sents_2_clean.txt"

#### Chargement des phrases

In [7]:
class MySentences(object):
    """Tokenize and Lemmatize sentences"""
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, encoding='utf-8', errors="backslashreplace"):
            yield [unidecode(w.lower()) for w in wordpunct_tokenize(line)]

In [8]:
sentences = MySentences(cleaned_infile)

#### Création des bigrammes

In [9]:
bigram_phrases = Phrases(sentences)

In [10]:
len(bigram_phrases.vocab.keys())

8906117

In [14]:
# exploration des bigrammes

key_ = list(bigram_phrases.vocab.keys())[145]
print(key_)

bourgmestre_sommaire


In [15]:
# score de la clé

bigram_phrases.vocab[key_]

1577

In [16]:
print(list(bigram_phrases.vocab.keys())[:100])

['bruxelles', 'bulletin', 'bruxelles_bulletin', 'ires', 'bulletin_ires', 'conseil', 'ires_conseil', 'communal', 'conseil_communal', 'annee', 'communal_annee', 'ville', 'annee_ville', 'ville_bulletin', 'bulletin_conseil', 'aes', 'conseil_aes', 'seances', 'aes_seances', 'seances_communal', 'annee_bruxelles', 'imprimerie', 'bruxelles_imprimerie', 'rite', 'imprimerie_rite', 'faubourg', 'rite_faubourg', 'consei', 'faubourg_consei', 'dibi', 'consei_dibi', 'communication', 'dibi_communication', 'conclusions', 'communication_conclusions', 'section', 'conclusions_section', 'nouvel', 'section_nouvel', 'hospice', 'nouvel_hospice', 'enraisonde', 'hospice_enraisonde', 'absence', 'enraisonde_absence', 'maladie', 'absence_maladie', 'maladie_conseil', 'ajourne', 'conseil_ajourne', 'leurs', 'ajourne_leurs', 'pierre', 'leurs_pierre', 'marchai', 'pierre_marchai', 'cles', 'marchai_cles', 'taxes', 'cles_taxes', 'communale', 'taxes_communale', 'bieniaance', 'communale_bieniaance', 'eldeseianv', 'bieniaance_

In [17]:
%time bigram_phrases[sentences]

CPU times: user 1min 21s, sys: 1.74 s, total: 1min 23s
Wall time: 1min 27s


<gensim.interfaces.TransformedCorpus at 0x7feb805cb130>

In [18]:
bigram_phraser = Phraser(phrases_model=bigram_phrases)

In [19]:
%time bigram_phraser[sentences]

CPU times: user 1min 29s, sys: 2.18 s, total: 1min 31s
Wall time: 1min 35s


<gensim.interfaces.TransformedCorpus at 0x7feb80596e20>

In [20]:
trigram_phrases = Phrases(bigram_phraser[sentences])

In [21]:
trigram_phraser = Phraser(phrases_model=trigram_phrases)

In [22]:
corpus = list(trigram_phraser[bigram_phraser[sentences]])

In [24]:
# print(corpus[:10])

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [25]:
%%time
model = Word2Vec(
    corpus, # On passe le corpus de ngrams que nous venons de créer
    vector_size=32, # Le nombre de dimensions dans lesquelles le contexte des mots devra être réduit, aka. vector_size
    window=5, # La taille du "contexte", ici 5 mots avant et après le mot observé
    min_count=1, # On ignore les mots qui n'apparaissent pas au moins 5 fois dans le corpus
    workers=4, # Permet de paralléliser l'entraînement du modèle en 4 threads
    epochs=10 # Nombre d'itérations du réseau de neurones sur le jeu de données pour ajuster les paramètres avec la descende de gradient, aka. epochs.
)

CPU times: user 27.8 s, sys: 127 ms, total: 28 s
Wall time: 28 s


In [26]:
outfile = f"../data/bulletins_tp3_1.model"
model.save(outfile)

In [27]:
model = Word2Vec.load("../data/bulletins_tp3_1.model")

In [28]:
model.wv["bruxelles"]

array([-0.02856385, -0.02491177,  0.00148107,  0.02353539, -0.02817992,
        0.00980195,  0.01736984,  0.01888244,  0.02273956, -0.01897097,
        0.00256455,  0.02112452, -0.00662612, -0.01973851, -0.0045003 ,
       -0.02654227, -0.01687697,  0.02658183,  0.00769452,  0.02725005,
        0.0275328 ,  0.02975384, -0.00952221, -0.00146625,  0.00780555,
       -0.014429  ,  0.02500482, -0.0340938 ,  0.02078272,  0.00892002,
       -0.01924244,  0.01486543], dtype=float32)

In [29]:
model.wv["boucher"]

array([ 0.00814233, -0.0172377 ,  0.02529807,  0.00614849,  0.02423477,
       -0.02679351, -0.01373259, -0.00162214,  0.02771173,  0.01275488,
       -0.02829717,  0.02214047,  0.00972006,  0.01646418,  0.02256383,
       -0.00120972, -0.00703827,  0.00895417,  0.01078752, -0.00861427,
        0.00106915, -0.00067653,  0.01223742, -0.02747203,  0.01652838,
       -0.02672914,  0.00670581,  0.02702001,  0.00011779, -0.03072757,
        0.02718076, -0.00172947], dtype=float32)

In [30]:
model.wv.similarity("boucher", "boulanger")

0.0900372

In [31]:
model.wv.most_similar("bruxelles", topn=10)

[('middelgrote_woningen', 0.7307690978050232),
 ('actes_accomplis', 0.7276468276977539),
 ('nist', 0.7273675203323364),
 ('riece', 0.7057778835296631),
 ('directrice', 0.6950398683547974),
 ('avrz', 0.6900056600570679),
 ('pusance', 0.688913881778717),
 ('mwellingen', 0.6862605214118958),
 ('noitedo', 0.6828559637069702),
 ('nivelles_vilvorde', 0.6795886754989624)]

In [32]:
print(model.wv.most_similar(positive=['bruxelles', 'france'], negative=['belgique']))

[('dessiccation', 0.7397866249084473), ('vts', 0.7307472229003906), ('insererez', 0.7049958109855652), ('immondices', 0.7017472386360168), ('warenhuis', 0.6798452138900757), ('note_plaisir', 0.6751720905303955), ('chaussee_anvers_rue_masui', 0.6659534573554993), ('admis_frequenter', 0.6645496487617493), ('comprend_pose_tuyaux_acier', 0.6629272699356079), ('effecluer', 0.6626400351524353)]


In [33]:
print(model.wv.most_similar(positive=['homme', 'roi'], negative=['femme']))

[('fixer_suit', 0.6808350682258606), ('viiii', 0.6788467764854431), ('mesjieurs', 0.672312319278717), ('waarvan_sommige', 0.6688390970230103), ('jardins_enfants_ecoles_primaires', 0.6676877737045288), ('deduiraient', 0.6673752665519714), ('bibliotheque_artistique', 0.6673320531845093), ('laqken', 0.6583156585693359), ('auvents_location', 0.6574030518531799), ('fagrement', 0.6531345248222351)]


In [34]:
print(model.wv.most_similar(positive=['homme', 'bourgmestre'], negative=['femme']))

[('manneken_pis_remises_costumes', 0.6827032566070557), ('chaux_tournay', 0.679751992225647), ('respiration', 0.6733065843582153), ('prool', 0.6648819446563721), ('fagard', 0.6567695736885071), ('while', 0.6546394228935242), ('femme_femme', 0.651695966720581), ('pimit', 0.6457104086875916), ('monteus', 0.6407697796821594), ('regentenregiem', 0.638964056968689)]


In [35]:
print(model.wv.most_similar(positive=['bourgmestre', 'bruxelles'], negative=['echevin']))

[('conseiller_heyvaert', 0.7409437298774719), ('loyers_proprietes', 0.7105417847633362), ('nivelles_vilvorde', 0.704888105392456), ('eclateront', 0.7030313611030579), ('societe_royale_philharmonie', 0.6978018879890442), ('aqaud', 0.6951518058776855), ('vivantes_enregistrees_bruxelles', 0.6911153197288513), ('etant_donne', 0.6867313385009766), ('ethode', 0.6825902462005615), ('rouanne', 0.6825338006019592)]


In [36]:
print(model.wv.most_similar(positive=['bruxelles', 'paris'], negative=['belgique']))

[('arrelcs', 0.7184481024742126), ('pense_vaudrait', 0.7023138403892517), ('groscmcins', 0.6951763033866882), ('nisaties', 0.6934218406677246), ('ultramodernes', 0.6933059692382812), ('commissariats_divisions', 0.6890398263931274), ('sument', 0.6868327856063843), ('reponse_rassurante', 0.6861447691917419), ('prestations_accomplies', 0.6799437999725342), ('uobert', 0.6745166778564453)]


In [37]:
print(model.wv.most_similar(positive=['homme', 'fidelite'], negative=['femme']))

[('omimnud', 0.7624201774597168), ('eoses', 0.7208698987960815), ('leurs_griefs', 0.7176406979560852), ('definitivement_acquises', 0.6963922381401062), ('doorfebovenstevakk', 0.6764378547668457), ('faible_redevance', 0.673572301864624), ('acidulee', 0.6691086888313293), ('dentistes_depots', 0.6676341891288757), ('georges_nelis', 0.6628955602645874), ('tabacs_bois', 0.6586688756942749)]


In [38]:
print(model.wv.most_similar(positive=['femme', 'fidelite'], negative=['homme']))

[('popu', 0.7023522257804871), ('eludes', 0.6969157457351685), ('chargee_cours_enseignement_professionnel', 0.6902368664741516), ('quinquennale_courbe', 0.6897304058074951), ('onvermijdbaar', 0.6756871938705444), ('ciaires', 0.6674429178237915), ('acfiuns', 0.6673502326011658), ('bouwfonds', 0.6636354923248291), ('ltntercommunale', 0.6633958220481873), ('zal_toelaten', 0.6590200662612915)]


In [39]:
print(model.wv.most_similar(positive=['femme', 'homme'], negative=['fidelite']))

[('gestemd_vote_servaes_heren', 0.7346726655960083), ('fusain_angle', 0.7324280142784119), ('bruxf', 0.7012267708778381), ('terugtrekt', 0.7010139226913452), ('krautli', 0.6997595429420471), ('inotfavt', 0.6851938366889954), ('baasrode', 0.6749435067176819), ('bouhez', 0.670317530632019), ('toute_reconnaissance', 0.6675642132759094), ('eail', 0.6591938138008118)]
