Auteur: Alassane Watt

In [3]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import multiprocessing
import unicodedata
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
import gensim
import os

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [4]:
def strip_accents(text):
    """
    Strip accents from input String.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError): # unicode is a default on python 3 
        pass
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)

def lower(text):
    return text.lower()

def remove_special_chars(text):
    return re.sub("[^A-Za-z0-9']+", ' ', text)

In [5]:
# nlp = spacy.load('fr_core_news_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

# def cleaning(doc):
#     # Lemmatizes and removes stopwords
#     # doc needs to be a spacy Doc object
#     txt = [str(token) for token in doc if not token.is_stop]
#     # Word2Vec uses context words to learn the vector representation of a target word,
#     # if a sentence is only one or two words long,
#     # the benefit for the training is very small
#     if len(txt) > 2:
#         return ' '.join(txt)

In [6]:
def preprocess_text(document):
    # strip accents
    document = strip_accents(document)
    
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
#         tokens = document.split()
#         tokens = [stemmer.lemmatize(word) for word in tokens]
#         tokens = [word for word in tokens if word not in en_stop]
#         tokens = [word for word in tokens if len(word) > 3]

#         preprocessed_text = ' '.join(tokens)

    return document

In [7]:
wv_dim = 100
wv_min_count = 1

# Corpus QUAERO_FrenchMed

In [8]:
med_path = "./QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl"
with open(med_path) as f:
    sentences = [line.strip() for line in f.readlines()]

We are lemmatizing and removing the stopwords and non-alphabetic characters for each line of dialogue

In [9]:
df = pd.Series(sentences)
df.head()

0                                   EMEA / H / C / 551
1                                               PRIALT
2                             Qu ’ est ce que Prialt ?
3    Prialt est une solution pour perfusion contena...
4              Dans quel cas Prialt est - il utilisé ?
dtype: object

In [10]:
df = df.apply(preprocess_text)

In [11]:
df.head(10)

0                                           emea c 551
1                                               prialt
2                                qu est ce que prialt 
3    prialt est une solution pour perfusion contena...
4                 dans quel cas prialt est il utilise 
5    prialt est indique pour le traitement des doul...
6    comme le nombre de patients souffrant de doule...
7                       comment prialt est il utilise 
8    le traitement par prialt ne doit etre realise ...
9    prialt doit etre administre en perfusion conti...
dtype: object

In [12]:
sentences_med = [s.split(" ") for s in df.to_list()]

### Skip-gram

In [29]:
sg_model = Word2Vec(sentences=sentences_med, size=wv_dim, sg=1, hs=1, min_count=1, iter=50)
if not os.path.exists('./WE_models'):
    os.mkdir('./WE_models')
sg_model.save('WE_models/w2v_sg_100D')

INFO - 18:56:34: collecting all words and their counts
INFO - 18:56:34: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:56:34: collected 7826 word types from a corpus of 42967 raw words and 3091 sentences
INFO - 18:56:34: Loading a fresh vocabulary
INFO - 18:56:34: effective_min_count=1 retains 7826 unique words (100% of original 7826, drops 0)
INFO - 18:56:34: effective_min_count=1 leaves 42967 word corpus (100% of original 42967, drops 0)
INFO - 18:56:34: deleting the raw counts dictionary of 7826 items
INFO - 18:56:34: sample=0.001 downsamples 34 most-common words
INFO - 18:56:34: downsampling leaves estimated 31810 word corpus (74.0% of prior 42967)
INFO - 18:56:34: constructing a huffman tree from 7826 words
INFO - 18:56:34: built huffman tree with maximum node depth 16
INFO - 18:56:34: estimated required memory for 7826 words and 100 dimensions: 14869400 bytes
INFO - 18:56:34: resetting layer weights
INFO - 18:56:34: training model with 3 workers on 78

INFO - 18:56:38: worker thread finished; awaiting finish of 2 more threads
INFO - 18:56:38: worker thread finished; awaiting finish of 1 more threads
INFO - 18:56:38: worker thread finished; awaiting finish of 0 more threads
INFO - 18:56:38: EPOCH - 22 : training on 42967 raw words (31802 effective words) took 0.2s, 204973 effective words/s
INFO - 18:56:38: worker thread finished; awaiting finish of 2 more threads
INFO - 18:56:38: worker thread finished; awaiting finish of 1 more threads
INFO - 18:56:38: worker thread finished; awaiting finish of 0 more threads
INFO - 18:56:38: EPOCH - 23 : training on 42967 raw words (31904 effective words) took 0.2s, 199735 effective words/s
INFO - 18:56:38: worker thread finished; awaiting finish of 2 more threads
INFO - 18:56:38: worker thread finished; awaiting finish of 1 more threads
INFO - 18:56:38: worker thread finished; awaiting finish of 0 more threads
INFO - 18:56:38: EPOCH - 24 : training on 42967 raw words (31920 effective words) took 0.

INFO - 18:56:42: worker thread finished; awaiting finish of 2 more threads
INFO - 18:56:42: worker thread finished; awaiting finish of 1 more threads
INFO - 18:56:42: worker thread finished; awaiting finish of 0 more threads
INFO - 18:56:42: EPOCH - 46 : training on 42967 raw words (31883 effective words) took 0.1s, 222363 effective words/s
INFO - 18:56:42: worker thread finished; awaiting finish of 2 more threads
INFO - 18:56:42: worker thread finished; awaiting finish of 1 more threads
INFO - 18:56:42: worker thread finished; awaiting finish of 0 more threads
INFO - 18:56:42: EPOCH - 47 : training on 42967 raw words (31899 effective words) took 0.1s, 222035 effective words/s
INFO - 18:56:42: worker thread finished; awaiting finish of 2 more threads
INFO - 18:56:42: worker thread finished; awaiting finish of 1 more threads
INFO - 18:56:42: worker thread finished; awaiting finish of 0 more threads
INFO - 18:56:42: EPOCH - 48 : training on 42967 raw words (31858 effective words) took 0.

### CBOW

In [30]:
cbow_model = Word2Vec(sentences=sentences_med, size=wv_dim, sg=0, hs=1, min_count=1, iter=50)
if not os.path.exists('./WE_models'):
    os.mkdir('./WE_models')
cbow_model.save('WE_models/w2v_cbow_100D')

INFO - 18:56:59: collecting all words and their counts
INFO - 18:56:59: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:56:59: collected 7826 word types from a corpus of 42967 raw words and 3091 sentences
INFO - 18:56:59: Loading a fresh vocabulary
INFO - 18:56:59: effective_min_count=1 retains 7826 unique words (100% of original 7826, drops 0)
INFO - 18:56:59: effective_min_count=1 leaves 42967 word corpus (100% of original 42967, drops 0)
INFO - 18:56:59: deleting the raw counts dictionary of 7826 items
INFO - 18:56:59: sample=0.001 downsamples 34 most-common words
INFO - 18:56:59: downsampling leaves estimated 31810 word corpus (74.0% of prior 42967)
INFO - 18:56:59: constructing a huffman tree from 7826 words
INFO - 18:56:59: built huffman tree with maximum node depth 16
INFO - 18:56:59: estimated required memory for 7826 words and 100 dimensions: 14869400 bytes
INFO - 18:56:59: resetting layer weights
INFO - 18:56:59: training model with 3 workers on 78

INFO - 18:57:00: worker thread finished; awaiting finish of 2 more threads
INFO - 18:57:01: worker thread finished; awaiting finish of 1 more threads
INFO - 18:57:01: worker thread finished; awaiting finish of 0 more threads
INFO - 18:57:01: EPOCH - 22 : training on 42967 raw words (31819 effective words) took 0.1s, 579225 effective words/s
INFO - 18:57:01: worker thread finished; awaiting finish of 2 more threads
INFO - 18:57:01: worker thread finished; awaiting finish of 1 more threads
INFO - 18:57:01: worker thread finished; awaiting finish of 0 more threads
INFO - 18:57:01: EPOCH - 23 : training on 42967 raw words (31846 effective words) took 0.1s, 580400 effective words/s
INFO - 18:57:01: worker thread finished; awaiting finish of 2 more threads
INFO - 18:57:01: worker thread finished; awaiting finish of 1 more threads
INFO - 18:57:01: worker thread finished; awaiting finish of 0 more threads
INFO - 18:57:01: EPOCH - 24 : training on 42967 raw words (31790 effective words) took 0.

INFO - 18:57:02: worker thread finished; awaiting finish of 2 more threads
INFO - 18:57:02: worker thread finished; awaiting finish of 1 more threads
INFO - 18:57:02: worker thread finished; awaiting finish of 0 more threads
INFO - 18:57:02: EPOCH - 46 : training on 42967 raw words (31787 effective words) took 0.0s, 663435 effective words/s
INFO - 18:57:02: worker thread finished; awaiting finish of 2 more threads
INFO - 18:57:02: worker thread finished; awaiting finish of 1 more threads
INFO - 18:57:02: worker thread finished; awaiting finish of 0 more threads
INFO - 18:57:02: EPOCH - 47 : training on 42967 raw words (31884 effective words) took 0.1s, 615848 effective words/s
INFO - 18:57:02: worker thread finished; awaiting finish of 2 more threads
INFO - 18:57:02: worker thread finished; awaiting finish of 1 more threads
INFO - 18:57:02: worker thread finished; awaiting finish of 0 more threads
INFO - 18:57:02: EPOCH - 48 : training on 42967 raw words (31909 effective words) took 0.

### Fatsttext

In [70]:
fasttext_model = gensim.models.fasttext.FastText(sentences=sentences_med, size=wv_dim, sg=0, hs=1, min_count=1, iter=50)
if not os.path.exists('./WE_models'):
    os.mkdir('./WE_models')
fasttext_model.save('WE_models/w2v_ft_100D')

INFO - 19:17:43: resetting layer weights
INFO - 19:17:54: collecting all words and their counts
INFO - 19:17:54: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 19:17:54: collected 7826 word types from a corpus of 42967 raw words and 3091 sentences
INFO - 19:17:54: Loading a fresh vocabulary
INFO - 19:17:54: effective_min_count=1 retains 7826 unique words (100% of original 7826, drops 0)
INFO - 19:17:54: effective_min_count=1 leaves 42967 word corpus (100% of original 42967, drops 0)
INFO - 19:17:55: deleting the raw counts dictionary of 7826 items
INFO - 19:17:55: sample=0.001 downsamples 34 most-common words
INFO - 19:17:55: downsampling leaves estimated 31810 word corpus (74.0% of prior 42967)
INFO - 19:17:55: constructing a huffman tree from 7826 words
INFO - 19:17:55: built huffman tree with maximum node depth 16
INFO - 19:17:55: estimated required memory for 7826 words, 61331 buckets and 100 dimensions: 41558608 bytes
INFO - 19:17:55: resetting layer weig

INFO - 19:18:06: EPOCH - 21 : training on 42967 raw words (31819 effective words) took 0.2s, 136423 effective words/s
INFO - 19:18:06: worker thread finished; awaiting finish of 2 more threads
INFO - 19:18:06: worker thread finished; awaiting finish of 1 more threads
INFO - 19:18:06: worker thread finished; awaiting finish of 0 more threads
INFO - 19:18:06: EPOCH - 22 : training on 42967 raw words (31735 effective words) took 0.2s, 135084 effective words/s
INFO - 19:18:06: worker thread finished; awaiting finish of 2 more threads
INFO - 19:18:06: worker thread finished; awaiting finish of 1 more threads
INFO - 19:18:06: worker thread finished; awaiting finish of 0 more threads
INFO - 19:18:06: EPOCH - 23 : training on 42967 raw words (31822 effective words) took 0.3s, 122745 effective words/s
INFO - 19:18:06: worker thread finished; awaiting finish of 2 more threads
INFO - 19:18:06: worker thread finished; awaiting finish of 1 more threads
INFO - 19:18:07: worker thread finished; await

INFO - 19:18:12: EPOCH - 45 : training on 42967 raw words (31788 effective words) took 0.2s, 135442 effective words/s
INFO - 19:18:12: worker thread finished; awaiting finish of 2 more threads
INFO - 19:18:12: worker thread finished; awaiting finish of 1 more threads
INFO - 19:18:12: worker thread finished; awaiting finish of 0 more threads
INFO - 19:18:12: EPOCH - 46 : training on 42967 raw words (31815 effective words) took 0.2s, 133008 effective words/s
INFO - 19:18:12: worker thread finished; awaiting finish of 2 more threads
INFO - 19:18:12: worker thread finished; awaiting finish of 1 more threads
INFO - 19:18:12: worker thread finished; awaiting finish of 0 more threads
INFO - 19:18:12: EPOCH - 47 : training on 42967 raw words (31847 effective words) took 0.2s, 133291 effective words/s
INFO - 19:18:12: worker thread finished; awaiting finish of 2 more threads
INFO - 19:18:13: worker thread finished; awaiting finish of 1 more threads
INFO - 19:18:13: worker thread finished; await

# Corpus QUAERO_FrenchPress

In [13]:
press_path = "./QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl"
with open(press_path) as f:
    sentences = [line.strip() for line in f.readlines()]

In [14]:
sentences = pd.Series(sentences)
sentences = sentences.apply(preprocess_text)
sentences = sentences.to_list()
sentences_press = [s.split(" ") for s in sentences]

### Skip gram

In [31]:
sg_model_press = Word2Vec(sentences=sentences_press, size=wv_dim, sg=1, hs=1, min_count=1, iter=50)
if not os.path.exists('./WE_models'):
    os.mkdir('./WE_models')
sg_model_press.save('WE_models/w2v_sg_p_100D')

INFO - 18:57:31: collecting all words and their counts
INFO - 18:57:31: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:57:31: PROGRESS: at sentence #10000, processed 278773 words, keeping 18477 word types
INFO - 18:57:31: PROGRESS: at sentence #20000, processed 531101 words, keeping 26360 word types
INFO - 18:57:31: PROGRESS: at sentence #30000, processed 809820 words, keeping 31793 word types
INFO - 18:57:31: collected 35706 word types from a corpus of 1091998 raw words and 38548 sentences
INFO - 18:57:31: Loading a fresh vocabulary
INFO - 18:57:31: effective_min_count=1 retains 35706 unique words (100% of original 35706, drops 0)
INFO - 18:57:31: effective_min_count=1 leaves 1091998 word corpus (100% of original 1091998, drops 0)
INFO - 18:57:31: deleting the raw counts dictionary of 35706 items
INFO - 18:57:31: sample=0.001 downsamples 42 most-common words
INFO - 18:57:31: downsampling leaves estimated 807950 word corpus (74.0% of prior 1091998)
INFO - 1

INFO - 18:58:17: EPOCH 11 - PROGRESS: at 50.33% examples, 183924 words/s, in_qsize 6, out_qsize 0
INFO - 18:58:18: EPOCH 11 - PROGRESS: at 76.26% examples, 185306 words/s, in_qsize 6, out_qsize 0
INFO - 18:58:19: EPOCH 11 - PROGRESS: at 96.24% examples, 186755 words/s, in_qsize 5, out_qsize 0
INFO - 18:58:19: worker thread finished; awaiting finish of 2 more threads
INFO - 18:58:19: worker thread finished; awaiting finish of 1 more threads
INFO - 18:58:20: worker thread finished; awaiting finish of 0 more threads
INFO - 18:58:20: EPOCH - 11 : training on 1091998 raw words (807904 effective words) took 4.3s, 187170 effective words/s
INFO - 18:58:21: EPOCH 12 - PROGRESS: at 22.33% examples, 182630 words/s, in_qsize 5, out_qsize 0
INFO - 18:58:22: EPOCH 12 - PROGRESS: at 53.83% examples, 194362 words/s, in_qsize 5, out_qsize 0
INFO - 18:58:23: EPOCH 12 - PROGRESS: at 78.99% examples, 198038 words/s, in_qsize 5, out_qsize 0
INFO - 18:58:24: worker thread finished; awaiting finish of 2 more

INFO - 18:59:05: EPOCH - 22 : training on 1091998 raw words (808066 effective words) took 4.1s, 197997 effective words/s
INFO - 18:59:06: EPOCH 23 - PROGRESS: at 22.33% examples, 183003 words/s, in_qsize 5, out_qsize 0
INFO - 18:59:07: EPOCH 23 - PROGRESS: at 51.57% examples, 193687 words/s, in_qsize 5, out_qsize 0
INFO - 18:59:08: EPOCH 23 - PROGRESS: at 76.26% examples, 193058 words/s, in_qsize 5, out_qsize 0
INFO - 18:59:09: EPOCH 23 - PROGRESS: at 97.24% examples, 192704 words/s, in_qsize 4, out_qsize 0
INFO - 18:59:09: worker thread finished; awaiting finish of 2 more threads
INFO - 18:59:09: worker thread finished; awaiting finish of 1 more threads
INFO - 18:59:09: worker thread finished; awaiting finish of 0 more threads
INFO - 18:59:09: EPOCH - 23 : training on 1091998 raw words (807943 effective words) took 4.2s, 194063 effective words/s
INFO - 18:59:10: EPOCH 24 - PROGRESS: at 23.65% examples, 186880 words/s, in_qsize 5, out_qsize 0
INFO - 18:59:11: EPOCH 24 - PROGRESS: at 52

INFO - 18:59:55: EPOCH 35 - PROGRESS: at 24.75% examples, 198076 words/s, in_qsize 5, out_qsize 0
INFO - 18:59:56: EPOCH 35 - PROGRESS: at 53.83% examples, 199464 words/s, in_qsize 6, out_qsize 0
INFO - 18:59:57: EPOCH 35 - PROGRESS: at 80.02% examples, 202535 words/s, in_qsize 5, out_qsize 0
INFO - 18:59:58: worker thread finished; awaiting finish of 2 more threads
INFO - 18:59:58: worker thread finished; awaiting finish of 1 more threads
INFO - 18:59:58: worker thread finished; awaiting finish of 0 more threads
INFO - 18:59:58: EPOCH - 35 : training on 1091998 raw words (807959 effective words) took 3.9s, 204606 effective words/s
INFO - 18:59:59: EPOCH 36 - PROGRESS: at 25.99% examples, 192482 words/s, in_qsize 5, out_qsize 0
INFO - 19:00:00: EPOCH 36 - PROGRESS: at 55.84% examples, 201566 words/s, in_qsize 5, out_qsize 0
INFO - 19:00:01: EPOCH 36 - PROGRESS: at 79.50% examples, 199412 words/s, in_qsize 5, out_qsize 0
INFO - 19:00:02: worker thread finished; awaiting finish of 2 more

INFO - 19:00:46: worker thread finished; awaiting finish of 0 more threads
INFO - 19:00:46: EPOCH - 47 : training on 1091998 raw words (807901 effective words) took 4.0s, 202334 effective words/s
INFO - 19:00:47: EPOCH 48 - PROGRESS: at 25.99% examples, 191527 words/s, in_qsize 5, out_qsize 0
INFO - 19:00:48: EPOCH 48 - PROGRESS: at 55.84% examples, 201929 words/s, in_qsize 5, out_qsize 0
INFO - 19:00:49: EPOCH 48 - PROGRESS: at 80.02% examples, 198231 words/s, in_qsize 5, out_qsize 0
INFO - 19:00:50: worker thread finished; awaiting finish of 2 more threads
INFO - 19:00:50: worker thread finished; awaiting finish of 1 more threads
INFO - 19:00:50: worker thread finished; awaiting finish of 0 more threads
INFO - 19:00:50: EPOCH - 48 : training on 1091998 raw words (807427 effective words) took 4.0s, 200628 effective words/s
INFO - 19:00:51: EPOCH 49 - PROGRESS: at 24.75% examples, 197868 words/s, in_qsize 5, out_qsize 0
INFO - 19:00:52: EPOCH 49 - PROGRESS: at 53.83% examples, 198171 w

### Cbow

In [32]:
cbow_model_press = Word2Vec(sentences=sentences_press, size=wv_dim, sg=0, hs=1, min_count=1, iter=50)
if not os.path.exists('./WE_models'):
    os.mkdir('./WE_models')
cbow_model_press.save('WE_models/w2v_cbow_p_100D')

INFO - 19:00:59: collecting all words and their counts
INFO - 19:00:59: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 19:00:59: PROGRESS: at sentence #10000, processed 278773 words, keeping 18477 word types
INFO - 19:00:59: PROGRESS: at sentence #20000, processed 531101 words, keeping 26360 word types
INFO - 19:00:59: PROGRESS: at sentence #30000, processed 809820 words, keeping 31793 word types
INFO - 19:00:59: collected 35706 word types from a corpus of 1091998 raw words and 38548 sentences
INFO - 19:00:59: Loading a fresh vocabulary
INFO - 19:00:59: effective_min_count=1 retains 35706 unique words (100% of original 35706, drops 0)
INFO - 19:00:59: effective_min_count=1 leaves 1091998 word corpus (100% of original 1091998, drops 0)
INFO - 19:00:59: deleting the raw counts dictionary of 35706 items
INFO - 19:00:59: sample=0.001 downsamples 42 most-common words
INFO - 19:00:59: downsampling leaves estimated 807950 word corpus (74.0% of prior 1091998)
INFO - 1

INFO - 19:01:25: worker thread finished; awaiting finish of 1 more threads
INFO - 19:01:25: worker thread finished; awaiting finish of 0 more threads
INFO - 19:01:25: EPOCH - 16 : training on 1091998 raw words (807636 effective words) took 1.2s, 681405 effective words/s
INFO - 19:01:26: EPOCH 17 - PROGRESS: at 86.66% examples, 684904 words/s, in_qsize 5, out_qsize 0
INFO - 19:01:26: worker thread finished; awaiting finish of 2 more threads
INFO - 19:01:26: worker thread finished; awaiting finish of 1 more threads
INFO - 19:01:26: worker thread finished; awaiting finish of 0 more threads
INFO - 19:01:26: EPOCH - 17 : training on 1091998 raw words (807798 effective words) took 1.2s, 694076 effective words/s
INFO - 19:01:27: EPOCH 18 - PROGRESS: at 88.34% examples, 696411 words/s, in_qsize 5, out_qsize 0
INFO - 19:01:27: worker thread finished; awaiting finish of 2 more threads
INFO - 19:01:27: worker thread finished; awaiting finish of 1 more threads
INFO - 19:01:27: worker thread finish

INFO - 19:01:47: EPOCH 35 - PROGRESS: at 81.78% examples, 639058 words/s, in_qsize 5, out_qsize 0
INFO - 19:01:48: worker thread finished; awaiting finish of 2 more threads
INFO - 19:01:48: worker thread finished; awaiting finish of 1 more threads
INFO - 19:01:48: worker thread finished; awaiting finish of 0 more threads
INFO - 19:01:48: EPOCH - 35 : training on 1091998 raw words (808042 effective words) took 1.3s, 641522 effective words/s
INFO - 19:01:49: EPOCH 36 - PROGRESS: at 78.99% examples, 610405 words/s, in_qsize 6, out_qsize 0
INFO - 19:01:49: worker thread finished; awaiting finish of 2 more threads
INFO - 19:01:49: worker thread finished; awaiting finish of 1 more threads
INFO - 19:01:49: worker thread finished; awaiting finish of 0 more threads
INFO - 19:01:49: EPOCH - 36 : training on 1091998 raw words (807788 effective words) took 1.3s, 631908 effective words/s
INFO - 19:01:50: EPOCH 37 - PROGRESS: at 78.99% examples, 610279 words/s, in_qsize 5, out_qsize 0
INFO - 19:01:5

In [15]:
fasttext_model_press = gensim.models.fasttext.FastText(sentences=sentences_med, size=wv_dim, sg=0, hs=1, min_count=1, iter=50)
if not os.path.exists('./WE_models'):
    os.mkdir('./WE_models')
fasttext_model_press.save('WE_models/w2v_ft_p_100D')

INFO - 16:32:33: resetting layer weights
INFO - 16:32:43: collecting all words and their counts
INFO - 16:32:43: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:32:43: collected 7826 word types from a corpus of 42967 raw words and 3091 sentences
INFO - 16:32:43: Loading a fresh vocabulary
INFO - 16:32:43: effective_min_count=1 retains 7826 unique words (100% of original 7826, drops 0)
INFO - 16:32:43: effective_min_count=1 leaves 42967 word corpus (100% of original 42967, drops 0)
INFO - 16:32:43: deleting the raw counts dictionary of 7826 items
INFO - 16:32:43: sample=0.001 downsamples 34 most-common words
INFO - 16:32:43: downsampling leaves estimated 31810 word corpus (74.0% of prior 42967)
INFO - 16:32:43: constructing a huffman tree from 7826 words
INFO - 16:32:43: built huffman tree with maximum node depth 16
INFO - 16:32:44: estimated required memory for 7826 words, 61331 buckets and 100 dimensions: 41558608 bytes
INFO - 16:32:44: resetting layer weig

INFO - 16:32:58: EPOCH - 21 : training on 42967 raw words (31788 effective words) took 0.3s, 94613 effective words/s
INFO - 16:32:58: worker thread finished; awaiting finish of 2 more threads
INFO - 16:32:58: worker thread finished; awaiting finish of 1 more threads
INFO - 16:32:58: worker thread finished; awaiting finish of 0 more threads
INFO - 16:32:58: EPOCH - 22 : training on 42967 raw words (31776 effective words) took 0.3s, 101202 effective words/s
INFO - 16:32:58: worker thread finished; awaiting finish of 2 more threads
INFO - 16:32:58: worker thread finished; awaiting finish of 1 more threads
INFO - 16:32:59: worker thread finished; awaiting finish of 0 more threads
INFO - 16:32:59: EPOCH - 23 : training on 42967 raw words (31866 effective words) took 0.3s, 96815 effective words/s
INFO - 16:32:59: worker thread finished; awaiting finish of 2 more threads
INFO - 16:32:59: worker thread finished; awaiting finish of 1 more threads
INFO - 16:32:59: worker thread finished; awaitin

INFO - 16:33:06: EPOCH - 45 : training on 42967 raw words (31792 effective words) took 0.4s, 85713 effective words/s
INFO - 16:33:06: worker thread finished; awaiting finish of 2 more threads
INFO - 16:33:06: worker thread finished; awaiting finish of 1 more threads
INFO - 16:33:06: worker thread finished; awaiting finish of 0 more threads
INFO - 16:33:06: EPOCH - 46 : training on 42967 raw words (31782 effective words) took 0.3s, 97657 effective words/s
INFO - 16:33:06: worker thread finished; awaiting finish of 2 more threads
INFO - 16:33:07: worker thread finished; awaiting finish of 1 more threads
INFO - 16:33:07: worker thread finished; awaiting finish of 0 more threads
INFO - 16:33:07: EPOCH - 47 : training on 42967 raw words (31802 effective words) took 0.3s, 98404 effective words/s
INFO - 16:33:07: worker thread finished; awaiting finish of 2 more threads
INFO - 16:33:07: worker thread finished; awaiting finish of 1 more threads
INFO - 16:33:07: worker thread finished; awaiting

In [None]:
# # Save keyed vectors
# outfolder_path = "../tp2/NeuroNLP2/experiments/models/we_models/"

# cbow_model.wv.save_word2vec_format(outfolder_path+"w2v_cbow_100D.kv.bin", binary=True)
# cbow_model_press.wv.save_word2vec_format(outfolder_path+"w2v_cbow_p_100D.kv")
# sg_model.wv.save_word2vec_format(outfolder_path+"w2v_sg_100D.kv")
# sg_model_press.wv.save_word2vec_format(outfolder_path+"w2v_sg_p_100D.kv")

# Load the models

INFO - 12:57:52: storing 7826x100 projection weights into ../tp2/NeuroNLP2/experiments/models/we_models/w2v_cbow_100D.kv.bin
INFO - 12:57:52: storing 35706x100 projection weights into ../tp2/NeuroNLP2/experiments/models/we_models/w2v_cbow_p_100D.kv
INFO - 12:57:55: storing 7826x100 projection weights into ../tp2/NeuroNLP2/experiments/models/we_models/w2v_sg_100D.kv
INFO - 12:57:55: storing 35706x100 projection weights into ../tp2/NeuroNLP2/experiments/models/we_models/w2v_sg_p_100D.kv


In [1]:
# embedding_path = outfolder_path+"w2v_cbow_100D.kv.gz"
# m = gensim.models.KeyedVectors.load_word2vec_format(embedding_path, binary=False)
# # word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=False)
# embedd_dim = m.vector_size
# print("w2v", m)

In [16]:
# cbow med
cbow_model = Word2Vec.load('WE_models/w2v_cbow_100D')

# cbow press
cbow_model_press = Word2Vec.load('WE_models/w2v_cbow_p_100D')

# sg med
sg_model = Word2Vec.load('WE_models/w2v_sg_100D')

# sg press
sg_model_press = Word2Vec.load('WE_models/w2v_sg_p_100D')

# fasttext med
fasttext_model = Word2Vec.load('WE_models/w2v_ft_100D')

# fasttext press
fasttext_model_press = Word2Vec.load('WE_models/w2v_ft_p_100D')



INFO - 22:10:29: loading Word2Vec object from WE_models/w2v_cbow_100D
INFO - 22:10:30: loading wv recursively from WE_models/w2v_cbow_100D.wv.* with mmap=None
INFO - 22:10:30: setting ignored attribute vectors_norm to None
INFO - 22:10:30: loading vocabulary recursively from WE_models/w2v_cbow_100D.vocabulary.* with mmap=None
INFO - 22:10:30: loading trainables recursively from WE_models/w2v_cbow_100D.trainables.* with mmap=None
INFO - 22:10:30: setting ignored attribute cum_table to None
INFO - 22:10:30: loaded WE_models/w2v_cbow_100D
INFO - 22:10:30: loading Word2Vec object from WE_models/w2v_cbow_p_100D
INFO - 22:10:31: loading wv recursively from WE_models/w2v_cbow_p_100D.wv.* with mmap=None
INFO - 22:10:31: setting ignored attribute vectors_norm to None
INFO - 22:10:31: loading vocabulary recursively from WE_models/w2v_cbow_p_100D.vocabulary.* with mmap=None
INFO - 22:10:31: loading trainables recursively from WE_models/w2v_cbow_p_100D.trainables.* with mmap=None
INFO - 22:10:31: 

In [None]:
cbow_model.save('WE_models/w2v_cbow_100D')


In [72]:
# from gensim.models import KeyedVectors
word_vectors = cbow_model.wv
word_vectors.save('w2v_ft_100D.kv')
# KeyedVectors.load('vectors.kv')

INFO - 22:36:03: saving Word2VecKeyedVectors object under w2v_ft_100D.kv, separately None
INFO - 22:36:03: not storing attribute vectors_norm
INFO - 22:36:03: saved w2v_ft_100D.kv


# Comparison

In [18]:
top_n = 10

Mots candidats: patient, traitement, maladie, solution, jaune

## Comparaison des embeddings entrainés sur le même corpus

tester l'impact des approches (skipgram, cbow, fasttext) sur le résultats

### Med Corpus

patient

In [38]:
sg_model.wv.most_similar("patient", topn=top_n)

[('montrez', 0.6341592073440552),
 ('alerte', 0.61311936378479),
 ('cette', 0.6056946516036987),
 ('aptitude', 0.6043680906295776),
 ('souffre', 0.5990329384803772),
 ('carte', 0.5902701616287231),
 ('existante', 0.5838398337364197),
 ('conserviez', 0.5823631882667542),
 ('determiner', 0.5807697176933289),
 ('speciale', 0.5757887959480286)]

In [39]:
cbow_model.wv.most_similar("patient", topn=top_n)

[('alerte', 0.5416854619979858),
 ('medicament', 0.48750820755958557),
 ('etre', 0.4820495843887329),
 ('vous', 0.47345608472824097),
 ('prialt', 0.4709300994873047),
 ('qu', 0.4424228370189667),
 ('ils', 0.44230562448501587),
 ('soient', 0.4418569803237915),
 ('recevoir', 0.43952542543411255),
 ('urinaire', 0.4368218779563904)]

In [19]:
fasttext_model.wv.most_similar("patient", topn=top_n)

INFO - 22:12:52: precomputing L2-norms of word weight vectors
INFO - 22:12:52: precomputing L2-norms of ngram weight vectors


[('patiente', 0.8910520076751709),
 ('patients', 0.8098596334457397),
 ('parvient', 0.70711350440979),
 ('soient', 0.6613249778747559),
 ('aient', 0.6562939286231995),
 ('conscient', 0.6520581245422363),
 ('maintient', 0.6402011513710022),
 ('gradient', 0.6337660551071167),
 ('recevaient', 0.630927562713623),
 ('emportent', 0.6303962469100952)]

traitement

In [40]:
sg_model.wv.most_similar("traitement", topn=top_n)

[('par', 0.6101565957069397),
 ('instaure', 0.5734601616859436),
 ('traites', 0.5320462584495544),
 ('gynecomasties', 0.5252437591552734),
 ('commencer', 0.5163739919662476),
 ('significatifs', 0.49864545464515686),
 ('habitue', 0.49060970544815063),
 ('avant', 0.48643964529037476),
 ('reevaluer', 0.48350077867507935),
 ('experimente', 0.4764772951602936)]

In [41]:
cbow_model.wv.most_similar("traitement", topn=top_n)

[('consequent', 0.48196345567703247),
 ('medecin', 0.47441399097442627),
 ('tasmar', 0.4606321454048157),
 ('cours', 0.4571623206138611),
 ('diagnostic', 0.42869091033935547),
 ('vih', 0.4021696448326111),
 ('instauration', 0.3941475749015808),
 ('chimiotherapie', 0.3862146735191345),
 ('traites', 0.38262954354286194),
 ('placebo', 0.37556999921798706)]

In [20]:
fasttext_model.wv.most_similar("traitement", topn=top_n)

[('traitment', 0.9169358015060425),
 ('taaitement', 0.9022010564804077),
 ('evitement', 0.8458164930343628),
 ('traitements', 0.8318533897399902),
 ('allaitement', 0.8137481212615967),
 ('etroitement', 0.8132373094558716),
 ('correctement', 0.8128017783164978),
 ('recrutement', 0.8127676248550415),
 ('lentement', 0.8048897981643677),
 ('hautement', 0.7931801080703735)]

maladie

In [42]:
sg_model.wv.most_similar("maladie", topn=top_n)

[('parkinson', 0.6782119274139404),
 ('crohn', 0.6094835996627808),
 ('basedow', 0.588508129119873),
 ('vraie', 0.5692818760871887),
 ('idiopathique', 0.5633712410926819),
 ('hirsprung', 0.5631192922592163),
 ('recklinghausen', 0.5493713021278381),
 ('avance', 0.5429551601409912),
 ('fluctuations', 0.5345396399497986),
 ('bignami', 0.5342870950698853)]

In [43]:
cbow_model.wv.most_similar("maladie", topn=top_n)

[('chronique', 0.47794264554977417),
 ('sida', 0.4588952660560608),
 ('affection', 0.4492846131324768),
 ('aigue', 0.44460824131965637),
 ('type', 0.44380277395248413),
 ('inhibition', 0.4374990165233612),
 ('pancreatite', 0.4326436519622803),
 ('sep', 0.429707407951355),
 ('nombre', 0.42318040132522583),
 ('douleurs', 0.4157176911830902)]

In [21]:
fasttext_model.wv.most_similar("maladie", topn=top_n)

[('malade', 0.8126126527786255),
 ('maladies', 0.7736676335334778),
 ('amantadie', 0.7351871728897095),
 ('maldi', 0.7070671319961548),
 ('maltraitance', 0.6765587329864502),
 ('malaise', 0.6686583757400513),
 ('malades', 0.5842608213424683),
 ('malt', 0.5831559896469116),
 ('malgre', 0.5820046663284302),
 ('revelateurs', 0.5727584362030029)]

solution

In [44]:
sg_model.wv.most_similar("solution", topn=top_n)

[('contient', 0.7087570428848267),
 ('diluer', 0.685350775718689),
 ('ml', 0.6653280854225159),
 ('perfusable', 0.6614526510238647),
 ('injectable', 0.6604785919189453),
 ('buvable', 0.6535659432411194),
 ('reconstituee', 0.650699257850647),
 ('dosee', 0.6494290828704834),
 ('microgrammes', 0.6386682987213135),
 ('ajoutez', 0.6250247955322266)]

In [45]:
cbow_model.wv.most_similar("solution", topn=top_n)

[('ml', 0.620438814163208),
 ('preparation', 0.6042649745941162),
 ('lepirudine', 0.5789499878883362),
 ('vitesse', 0.5654317140579224),
 ('pompe', 0.546770453453064),
 ('instructions', 0.5461287498474121),
 ('chaque', 0.5169966220855713),
 ('bolus', 0.49907243251800537),
 ('manipulation', 0.4925003945827484),
 ('flacon', 0.47363755106925964)]

In [22]:
fasttext_model.wv.most_similar("solution", topn=top_n)

[('dissolution', 0.9642202258110046),
 ('dilution', 0.8752856850624084),
 ('pollution', 0.8713375329971313),
 ('reconstitution', 0.8186519742012024),
 ('execution', 0.8122972846031189),
 ('evolution', 0.8111284375190735),
 ('constitution', 0.8108775615692139),
 ('substitution', 0.8104674816131592),
 ('institution', 0.801041841506958),
 ('microdeletion', 0.7897745370864868)]

jaune

In [46]:
sg_model.wv.most_similar("jaune", topn=top_n)

[('pale', 0.7044824361801147),
 ('orange', 0.64982008934021),
 ('anormale', 0.6196465492248535),
 ('calotermes', 0.6084887385368347),
 ('flavicollis', 0.6013085246086121),
 ('hexagonaux', 0.5832014083862305),
 ('fabr', 0.579716145992279),
 ('incolore', 0.5791885852813721),
 ('navet', 0.5746554136276245),
 ('mosaique', 0.5725802779197693)]

In [47]:
cbow_model.wv.most_similar("jaune", topn=top_n)

[('oxyde', 0.6426818370819092),
 ('131', 0.6139801740646362),
 ('jugo', 0.604779064655304),
 ('commissuro', 0.59688800573349),
 ('tachetes', 0.573742151260376),
 ('spontane', 0.5644837021827698),
 ('intestine', 0.5587036609649658),
 ('scintigraphies', 0.5523563623428345),
 ('histidine', 0.5474417209625244),
 ('physiquement', 0.5420677065849304)]

In [23]:
fasttext_model.wv.most_similar("jaune", topn=top_n)

[('lane', 0.6069298982620239),
 ('rouge', 0.5857460498809814),
 ('jaunisse', 0.5671855807304382),
 ('oxyde', 0.5602437853813171),
 ('triacetine', 0.557691216468811),
 ('hippocrate', 0.5483880043029785),
 ('rougeole', 0.5414905548095703),
 ('titane', 0.5408645868301392),
 ('dioxyde', 0.538782000541687),
 ('fiable', 0.5368438959121704)]

### Press Corpus

patient

In [48]:
sg_model_press.wv.most_similar("patient", topn=top_n)

INFO - 19:09:29: precomputing L2-norms of word weight vectors


[('cancereux', 0.5618505477905273),
 ('soignant', 0.5616238117218018),
 ('algorithmes', 0.531581699848175),
 ('hospitalise', 0.5090411901473999),
 ('statistiquement', 0.5049449801445007),
 ('malade', 0.4799725413322449),
 ('161', 0.47708266973495483),
 ('insupportables', 0.4752955436706543),
 ('ricane', 0.4677731692790985),
 ('humble', 0.4670393168926239)]

In [49]:
cbow_model_press.wv.most_similar("patient", topn=top_n)

INFO - 19:09:29: precomputing L2-norms of word weight vectors


[('soignant', 0.43600648641586304),
 ('sex', 0.4158717095851898),
 ('tourcoing', 0.390976220369339),
 ('garcon', 0.3902435898780823),
 ('mariees', 0.37666457891464233),
 ('lannes', 0.3766070008277893),
 ('lavage', 0.37340131402015686),
 ('produit', 0.3685552775859833),
 ('bouquin', 0.3608214557170868),
 ('fou', 0.3519297242164612)]

In [24]:
fasttext_model_press.wv.most_similar("patient", topn=top_n)

INFO - 22:14:26: precomputing L2-norms of word weight vectors
INFO - 22:14:26: precomputing L2-norms of ngram weight vectors


[('patiente', 0.887475311756134),
 ('patients', 0.8198944926261902),
 ('parvient', 0.7211000323295593),
 ('soient', 0.6863570809364319),
 ('aient', 0.6725202202796936),
 ('maintient', 0.6641037464141846),
 ('conscient', 0.6631748080253601),
 ('gradient', 0.6533800363540649),
 ('recevaient', 0.6456177234649658),
 ('presentaient', 0.6199854612350464)]

traitement

In [50]:
sg_model_press.wv.most_similar("traitement", topn=top_n)

[('exciter', 0.5309458374977112),
 ('protegeait', 0.5193228125572205),
 ('medicamenteux', 0.5061299800872803),
 ('sida', 0.5061064958572388),
 ('mineur', 0.49692875146865845),
 ('antidouleur', 0.4894629120826721),
 ('generateurs', 0.48877307772636414),
 ('prevention', 0.4872512221336365),
 ('decentralise', 0.4819537401199341),
 ('fondamentales', 0.4691838026046753)]

In [51]:
cbow_model_press.wv.most_similar("traitement", topn=top_n)

[('bilingues', 0.408133864402771),
 ('terrorisme', 0.40106064081192017),
 ('cout', 0.39213138818740845),
 ('soutien', 0.3916090726852417),
 ('renforcement', 0.38820207118988037),
 ('pacte', 0.37954139709472656),
 ('nom', 0.3591303825378418),
 ('sida', 0.35791391134262085),
 ('dictateur', 0.35676631331443787),
 ('minimum', 0.35663390159606934)]

In [25]:
fasttext_model.wv.most_similar("traitement", topn=top_n)

[('traitment', 0.9169358015060425),
 ('taaitement', 0.9022010564804077),
 ('evitement', 0.8458164930343628),
 ('traitements', 0.8318533897399902),
 ('allaitement', 0.8137481212615967),
 ('etroitement', 0.8132373094558716),
 ('correctement', 0.8128017783164978),
 ('recrutement', 0.8127676248550415),
 ('lentement', 0.8048897981643677),
 ('hautement', 0.7931801080703735)]

maladie

In [52]:
sg_model_press.wv.most_similar("maladie", topn=top_n)

[('pneumopathie', 0.6438268423080444),
 ('epidemie', 0.6329473257064819),
 ('virale', 0.6205798387527466),
 ('succombent', 0.5800681114196777),
 ('161', 0.5724583268165588),
 ('atypique', 0.5670432448387146),
 ('neurologique', 0.5660151243209839),
 ('grippe', 0.564203679561615),
 ('succomber', 0.5592811107635498),
 ('transmissible', 0.5576953887939453)]

In [53]:
cbow_model_press.wv.most_similar("maladie", topn=top_n)

[('responsabilite', 0.46167290210723877),
 ('improvisation', 0.4057588577270508),
 ('fonction', 0.3996735215187073),
 ('violence', 0.398216187953949),
 ('gouvernant', 0.39815330505371094),
 ('situation', 0.395330011844635),
 ('peste', 0.3952723741531372),
 ('chambre', 0.39341360330581665),
 ('recourt', 0.38351255655288696),
 ('18000', 0.38294586539268494)]

In [26]:
fasttext_model.wv.most_similar("maladie", topn=top_n)

[('malade', 0.8126126527786255),
 ('maladies', 0.7736676335334778),
 ('amantadie', 0.7351871728897095),
 ('maldi', 0.7070671319961548),
 ('maltraitance', 0.6765587329864502),
 ('malaise', 0.6686583757400513),
 ('malades', 0.5842608213424683),
 ('malt', 0.5831559896469116),
 ('malgre', 0.5820046663284302),
 ('revelateurs', 0.5727584362030029)]

solution

In [54]:
sg_model_press.wv.most_similar("solution", topn=top_n)

[('garantissant', 0.6696721911430359),
 ('pacifique', 0.5904736518859863),
 ('sodium', 0.5753224492073059),
 ('mesure', 0.5710676908493042),
 ('lancinant', 0.5706446766853333),
 ('consensuelle', 0.5557705163955688),
 ('cochonneries', 0.5396841168403625),
 ('amelioree', 0.5380747318267822),
 ('constructif', 0.5126892328262329),
 ('prealable', 0.5051695108413696)]

In [55]:
cbow_model_press.wv.most_similar("solution", topn=top_n)

[('reponse', 0.5314513444900513),
 ('opportunite', 0.4558674693107605),
 ('mesure', 0.44619232416152954),
 ('facon', 0.4405023753643036),
 ('demarche', 0.43324095010757446),
 ('alternative', 0.43216174840927124),
 ('expertise', 0.4316878020763397),
 ('visite', 0.4223451018333435),
 ('mission', 0.4215881824493408),
 ('idee', 0.4190034866333008)]

In [27]:
fasttext_model.wv.most_similar("solution", topn=top_n)

[('dissolution', 0.9642202258110046),
 ('dilution', 0.8752856850624084),
 ('pollution', 0.8713375329971313),
 ('reconstitution', 0.8186519742012024),
 ('execution', 0.8122972846031189),
 ('evolution', 0.8111284375190735),
 ('constitution', 0.8108775615692139),
 ('substitution', 0.8104674816131592),
 ('institution', 0.801041841506958),
 ('microdeletion', 0.7897745370864868)]

jaune

In [56]:
sg_model_press.wv.most_similar("jaune", topn=top_n)

[('maillot', 0.7700062990188599),
 ('pois', 0.6189182996749878),
 ('390', 0.606795072555542),
 ('lachhab', 0.5962772965431213),
 ('pena', 0.5697344541549683),
 ('decaleront', 0.5682517290115356),
 ('bradeley', 0.55889892578125),
 ('bradley', 0.5236215591430664),
 ('metal', 0.5199170112609863),
 ('endosse', 0.5177762508392334)]

In [57]:
cbow_model_press.wv.most_similar("jaune", topn=top_n)

[('ouariour', 0.49236369132995605),
 ('souad', 0.4485026001930237),
 ('lachhab', 0.4437779188156128),
 ('aquitain', 0.42470255494117737),
 ('antonio', 0.4102002680301666),
 ('vedrine', 0.40616124868392944),
 ('empare', 0.40327855944633484),
 ('pois', 0.4008446931838989),
 ('muette', 0.39541828632354736),
 ('azur', 0.3890635371208191)]

In [28]:
fasttext_model.wv.most_similar("jaune", topn=top_n)

[('lane', 0.6069298982620239),
 ('rouge', 0.5857460498809814),
 ('jaunisse', 0.5671855807304382),
 ('oxyde', 0.5602437853813171),
 ('triacetine', 0.557691216468811),
 ('hippocrate', 0.5483880043029785),
 ('rougeole', 0.5414905548095703),
 ('titane', 0.5408645868301392),
 ('dioxyde', 0.538782000541687),
 ('fiable', 0.5368438959121704)]

## Comparer des embeddings (même approche) entrainés sur de corpus différents.

tester l'impact de données (type et quantité) sur les résultats

In [58]:
len(sentences_med)

3091

In [59]:
len(sentences_press)

38548

### Skip gram

patient

In [38]:
sg_model.wv.most_similar("patient", topn=top_n)

[('montrez', 0.6341592073440552),
 ('alerte', 0.61311936378479),
 ('cette', 0.6056946516036987),
 ('aptitude', 0.6043680906295776),
 ('souffre', 0.5990329384803772),
 ('carte', 0.5902701616287231),
 ('existante', 0.5838398337364197),
 ('conserviez', 0.5823631882667542),
 ('determiner', 0.5807697176933289),
 ('speciale', 0.5757887959480286)]

In [39]:
sg_model_press.wv.most_similar("patient", topn=top_n)

[('alerte', 0.5416854619979858),
 ('medicament', 0.48750820755958557),
 ('etre', 0.4820495843887329),
 ('vous', 0.47345608472824097),
 ('prialt', 0.4709300994873047),
 ('qu', 0.4424228370189667),
 ('ils', 0.44230562448501587),
 ('soient', 0.4418569803237915),
 ('recevoir', 0.43952542543411255),
 ('urinaire', 0.4368218779563904)]

traitement

In [40]:
sg_model.wv.most_similar("traitement", topn=top_n)

[('par', 0.6101565957069397),
 ('instaure', 0.5734601616859436),
 ('traites', 0.5320462584495544),
 ('gynecomasties', 0.5252437591552734),
 ('commencer', 0.5163739919662476),
 ('significatifs', 0.49864545464515686),
 ('habitue', 0.49060970544815063),
 ('avant', 0.48643964529037476),
 ('reevaluer', 0.48350077867507935),
 ('experimente', 0.4764772951602936)]

In [41]:
sg_model_press.wv.most_similar("traitement", topn=top_n)

[('consequent', 0.48196345567703247),
 ('medecin', 0.47441399097442627),
 ('tasmar', 0.4606321454048157),
 ('cours', 0.4571623206138611),
 ('diagnostic', 0.42869091033935547),
 ('vih', 0.4021696448326111),
 ('instauration', 0.3941475749015808),
 ('chimiotherapie', 0.3862146735191345),
 ('traites', 0.38262954354286194),
 ('placebo', 0.37556999921798706)]

maladie

In [42]:
sg_model.wv.most_similar("maladie", topn=top_n)

[('parkinson', 0.6782119274139404),
 ('crohn', 0.6094835996627808),
 ('basedow', 0.588508129119873),
 ('vraie', 0.5692818760871887),
 ('idiopathique', 0.5633712410926819),
 ('hirsprung', 0.5631192922592163),
 ('recklinghausen', 0.5493713021278381),
 ('avance', 0.5429551601409912),
 ('fluctuations', 0.5345396399497986),
 ('bignami', 0.5342870950698853)]

In [43]:
sg_model_press.wv.most_similar("maladie", topn=top_n)

[('chronique', 0.47794264554977417),
 ('sida', 0.4588952660560608),
 ('affection', 0.4492846131324768),
 ('aigue', 0.44460824131965637),
 ('type', 0.44380277395248413),
 ('inhibition', 0.4374990165233612),
 ('pancreatite', 0.4326436519622803),
 ('sep', 0.429707407951355),
 ('nombre', 0.42318040132522583),
 ('douleurs', 0.4157176911830902)]

solution

In [44]:
sg_model.wv.most_similar("solution", topn=top_n)

[('contient', 0.7087570428848267),
 ('diluer', 0.685350775718689),
 ('ml', 0.6653280854225159),
 ('perfusable', 0.6614526510238647),
 ('injectable', 0.6604785919189453),
 ('buvable', 0.6535659432411194),
 ('reconstituee', 0.650699257850647),
 ('dosee', 0.6494290828704834),
 ('microgrammes', 0.6386682987213135),
 ('ajoutez', 0.6250247955322266)]

In [45]:
sg_model_press.wv.most_similar("solution", topn=top_n)

[('ml', 0.620438814163208),
 ('preparation', 0.6042649745941162),
 ('lepirudine', 0.5789499878883362),
 ('vitesse', 0.5654317140579224),
 ('pompe', 0.546770453453064),
 ('instructions', 0.5461287498474121),
 ('chaque', 0.5169966220855713),
 ('bolus', 0.49907243251800537),
 ('manipulation', 0.4925003945827484),
 ('flacon', 0.47363755106925964)]

jaune

In [46]:
sg_model.wv.most_similar("jaune", topn=top_n)

[('pale', 0.7044824361801147),
 ('orange', 0.64982008934021),
 ('anormale', 0.6196465492248535),
 ('calotermes', 0.6084887385368347),
 ('flavicollis', 0.6013085246086121),
 ('hexagonaux', 0.5832014083862305),
 ('fabr', 0.579716145992279),
 ('incolore', 0.5791885852813721),
 ('navet', 0.5746554136276245),
 ('mosaique', 0.5725802779197693)]

In [47]:
sg_model_press.wv.most_similar("jaune", topn=top_n)

[('oxyde', 0.6426818370819092),
 ('131', 0.6139801740646362),
 ('jugo', 0.604779064655304),
 ('commissuro', 0.59688800573349),
 ('tachetes', 0.573742151260376),
 ('spontane', 0.5644837021827698),
 ('intestine', 0.5587036609649658),
 ('scintigraphies', 0.5523563623428345),
 ('histidine', 0.5474417209625244),
 ('physiquement', 0.5420677065849304)]

### CBOW

patient

In [60]:
cbow_model.wv.most_similar("patient", topn=top_n)

[('alerte', 0.5416854619979858),
 ('medicament', 0.48750820755958557),
 ('etre', 0.4820495843887329),
 ('vous', 0.47345608472824097),
 ('prialt', 0.4709300994873047),
 ('qu', 0.4424228370189667),
 ('ils', 0.44230562448501587),
 ('soient', 0.4418569803237915),
 ('recevoir', 0.43952542543411255),
 ('urinaire', 0.4368218779563904)]

In [61]:
cbow_model_press.wv.most_similar("patient", topn=top_n)

[('soignant', 0.43600648641586304),
 ('sex', 0.4158717095851898),
 ('tourcoing', 0.390976220369339),
 ('garcon', 0.3902435898780823),
 ('mariees', 0.37666457891464233),
 ('lannes', 0.3766070008277893),
 ('lavage', 0.37340131402015686),
 ('produit', 0.3685552775859833),
 ('bouquin', 0.3608214557170868),
 ('fou', 0.3519297242164612)]

traitement

In [62]:
cbow_model.wv.most_similar("traitement", topn=top_n)

[('consequent', 0.48196345567703247),
 ('medecin', 0.47441399097442627),
 ('tasmar', 0.4606321454048157),
 ('cours', 0.4571623206138611),
 ('diagnostic', 0.42869091033935547),
 ('vih', 0.4021696448326111),
 ('instauration', 0.3941475749015808),
 ('chimiotherapie', 0.3862146735191345),
 ('traites', 0.38262954354286194),
 ('placebo', 0.37556999921798706)]

In [63]:
cbow_model_press.wv.most_similar("traitement", topn=top_n)

[('bilingues', 0.408133864402771),
 ('terrorisme', 0.40106064081192017),
 ('cout', 0.39213138818740845),
 ('soutien', 0.3916090726852417),
 ('renforcement', 0.38820207118988037),
 ('pacte', 0.37954139709472656),
 ('nom', 0.3591303825378418),
 ('sida', 0.35791391134262085),
 ('dictateur', 0.35676631331443787),
 ('minimum', 0.35663390159606934)]

maladie

In [64]:
cbow_model.wv.most_similar("maladie", topn=top_n)

[('chronique', 0.47794264554977417),
 ('sida', 0.4588952660560608),
 ('affection', 0.4492846131324768),
 ('aigue', 0.44460824131965637),
 ('type', 0.44380277395248413),
 ('inhibition', 0.4374990165233612),
 ('pancreatite', 0.4326436519622803),
 ('sep', 0.429707407951355),
 ('nombre', 0.42318040132522583),
 ('douleurs', 0.4157176911830902)]

In [65]:
cbow_model_press.wv.most_similar("maladie", topn=top_n)

[('responsabilite', 0.46167290210723877),
 ('improvisation', 0.4057588577270508),
 ('fonction', 0.3996735215187073),
 ('violence', 0.398216187953949),
 ('gouvernant', 0.39815330505371094),
 ('situation', 0.395330011844635),
 ('peste', 0.3952723741531372),
 ('chambre', 0.39341360330581665),
 ('recourt', 0.38351255655288696),
 ('18000', 0.38294586539268494)]

solution

In [66]:
cbow_model.wv.most_similar("solution", topn=top_n)

[('ml', 0.620438814163208),
 ('preparation', 0.6042649745941162),
 ('lepirudine', 0.5789499878883362),
 ('vitesse', 0.5654317140579224),
 ('pompe', 0.546770453453064),
 ('instructions', 0.5461287498474121),
 ('chaque', 0.5169966220855713),
 ('bolus', 0.49907243251800537),
 ('manipulation', 0.4925003945827484),
 ('flacon', 0.47363755106925964)]

In [67]:
cbow_model_press.wv.most_similar("solution", topn=top_n)

[('reponse', 0.5314513444900513),
 ('opportunite', 0.4558674693107605),
 ('mesure', 0.44619232416152954),
 ('facon', 0.4405023753643036),
 ('demarche', 0.43324095010757446),
 ('alternative', 0.43216174840927124),
 ('expertise', 0.4316878020763397),
 ('visite', 0.4223451018333435),
 ('mission', 0.4215881824493408),
 ('idee', 0.4190034866333008)]

jaune

In [68]:
cbow_model.wv.most_similar("jaune", topn=top_n)

[('oxyde', 0.6426818370819092),
 ('131', 0.6139801740646362),
 ('jugo', 0.604779064655304),
 ('commissuro', 0.59688800573349),
 ('tachetes', 0.573742151260376),
 ('spontane', 0.5644837021827698),
 ('intestine', 0.5587036609649658),
 ('scintigraphies', 0.5523563623428345),
 ('histidine', 0.5474417209625244),
 ('physiquement', 0.5420677065849304)]

In [69]:
cbow_model_press.wv.most_similar("jaune", topn=top_n)

[('ouariour', 0.49236369132995605),
 ('souad', 0.4485026001930237),
 ('lachhab', 0.4437779188156128),
 ('aquitain', 0.42470255494117737),
 ('antonio', 0.4102002680301666),
 ('vedrine', 0.40616124868392944),
 ('empare', 0.40327855944633484),
 ('pois', 0.4008446931838989),
 ('muette', 0.39541828632354736),
 ('azur', 0.3890635371208191)]

### Fasttext

patient

In [29]:
fasttext_model.wv.most_similar("patient", topn=top_n)

[('patiente', 0.8910520076751709),
 ('patients', 0.8098596334457397),
 ('parvient', 0.70711350440979),
 ('soient', 0.6613249778747559),
 ('aient', 0.6562939286231995),
 ('conscient', 0.6520581245422363),
 ('maintient', 0.6402011513710022),
 ('gradient', 0.6337660551071167),
 ('recevaient', 0.630927562713623),
 ('emportent', 0.6303962469100952)]

In [30]:
fasttext_model_press.wv.most_similar("patient", topn=top_n)

[('patiente', 0.887475311756134),
 ('patients', 0.8198944926261902),
 ('parvient', 0.7211000323295593),
 ('soient', 0.6863570809364319),
 ('aient', 0.6725202202796936),
 ('maintient', 0.6641037464141846),
 ('conscient', 0.6631748080253601),
 ('gradient', 0.6533800363540649),
 ('recevaient', 0.6456177234649658),
 ('presentaient', 0.6199854612350464)]

traitement

In [31]:
fasttext_model.wv.most_similar("traitement", topn=top_n)

[('traitment', 0.9169358015060425),
 ('taaitement', 0.9022010564804077),
 ('evitement', 0.8458164930343628),
 ('traitements', 0.8318533897399902),
 ('allaitement', 0.8137481212615967),
 ('etroitement', 0.8132373094558716),
 ('correctement', 0.8128017783164978),
 ('recrutement', 0.8127676248550415),
 ('lentement', 0.8048897981643677),
 ('hautement', 0.7931801080703735)]

In [32]:
fasttext_model_press.wv.most_similar("traitement", topn=top_n)

[('traitment', 0.9091444611549377),
 ('taaitement', 0.9002687931060791),
 ('evitement', 0.8486114144325256),
 ('traitements', 0.840034008026123),
 ('etroitement', 0.8144130706787109),
 ('lentement', 0.8014160394668579),
 ('correctement', 0.800934910774231),
 ('allaitement', 0.7987581491470337),
 ('recrutement', 0.7970902323722839),
 ('hautement', 0.7945890426635742)]

maladie

In [33]:
fasttext_model.wv.most_similar("maladie", topn=top_n)

[('malade', 0.8126126527786255),
 ('maladies', 0.7736676335334778),
 ('amantadie', 0.7351871728897095),
 ('maldi', 0.7070671319961548),
 ('maltraitance', 0.6765587329864502),
 ('malaise', 0.6686583757400513),
 ('malades', 0.5842608213424683),
 ('malt', 0.5831559896469116),
 ('malgre', 0.5820046663284302),
 ('revelateurs', 0.5727584362030029)]

In [34]:
fasttext_model_press.wv.most_similar("maladie", topn=top_n)

[('malade', 0.8307108879089355),
 ('maladies', 0.7987197637557983),
 ('amantadie', 0.753980278968811),
 ('maldi', 0.7389033436775208),
 ('maltraitance', 0.688429594039917),
 ('malaise', 0.6574214696884155),
 ('malt', 0.6551423072814941),
 ('malades', 0.6232011318206787),
 ('zie', 0.6144228577613831),
 ('malherbe', 0.5927330255508423)]

solution

In [35]:
fasttext_model.wv.most_similar("solution", topn=top_n)

[('dissolution', 0.9642202258110046),
 ('dilution', 0.8752856850624084),
 ('pollution', 0.8713375329971313),
 ('reconstitution', 0.8186519742012024),
 ('execution', 0.8122972846031189),
 ('evolution', 0.8111284375190735),
 ('constitution', 0.8108775615692139),
 ('substitution', 0.8104674816131592),
 ('institution', 0.801041841506958),
 ('microdeletion', 0.7897745370864868)]

In [36]:
fasttext_model_press.wv.most_similar("solution", topn=top_n)

[('dissolution', 0.9617725610733032),
 ('dilution', 0.8633571863174438),
 ('pollution', 0.8465920686721802),
 ('execution', 0.8038966655731201),
 ('evolution', 0.7997591495513916),
 ('constitution', 0.7981904149055481),
 ('reconstitution', 0.7939782738685608),
 ('substitution', 0.7934744954109192),
 ('preparation', 0.7868441939353943),
 ('institution', 0.7852331399917603)]

jaune

In [37]:
fasttext_model.wv.most_similar("jaune", topn=top_n)

[('lane', 0.6069298982620239),
 ('rouge', 0.5857460498809814),
 ('jaunisse', 0.5671855807304382),
 ('oxyde', 0.5602437853813171),
 ('triacetine', 0.557691216468811),
 ('hippocrate', 0.5483880043029785),
 ('rougeole', 0.5414905548095703),
 ('titane', 0.5408645868301392),
 ('dioxyde', 0.538782000541687),
 ('fiable', 0.5368438959121704)]

In [38]:
fasttext_model_press.wv.most_similar("jaune", topn=top_n)

[('oxyde', 0.5710400342941284),
 ('lane', 0.5590745210647583),
 ('triangle', 0.5448751449584961),
 ('ethylcellulose', 0.5370972156524658),
 ('auvergne', 0.5358014106750488),
 ('dioxyde', 0.532309889793396),
 ('cellulose', 0.5294350981712341),
 ('titane', 0.5276556015014648),
 ('jaunisse', 0.527066707611084),
 ('macrogol', 0.5252885222434998)]

#TODO:
Calculer une metrique moyenne a partir des mots les plus similaires

Conclusions

On remarque que les mots les plus similaires calculés par Fasttext sont ceux s'ecrivant de maniere similaire au mot d'interet. Cela est du à la maniere dont Fasttext découpe les mots. Néanmoins ces mots souvent sont parfois sématiquement proches.

On note qu'avec les methodes Skip gram et CBOW les mots apparaissant dans le contexte d'un mot d'interet sont bien predits. Cependant il y'a parfois des stops words parmi les mots les plus similaires. Ce qui suggere de faire un pre-traitement sur le texte pour de meilleurs résultats.