In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import (
    BertModel, BertTokenizer,
    XLMRobertaTokenizer, XLMRobertaModel,
)
import logging
import os

# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when several native libraries are loaded in one
# process -- confirm it is still needed in this environment.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Only show ERROR and above on the root logger so library warnings do not
# flood the notebook output.  Use setLevel() rather than assigning to
# `logger.level`: setLevel() is the supported API and also invalidates the
# logging module's cached "is enabled for level" lookups.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

In [2]:
# XLM-RoBERTa uses a subword vocabulary, so a single word may be split into
# several pieces (the next cell shows the target term mapping to four ids).
# The word-level vector is therefore obtained further down by averaging the
# hidden states of all subword pieces that make up the term.

hf_weights_name = 'xlm-roberta-base'
hf_tokenizer = XLMRobertaTokenizer.from_pretrained(hf_weights_name)
# output_hidden_states=True makes the model return the per-layer hidden states
# in addition to the final output.  `.eval()` returns the model itself; it is
# chained here to make inference mode explicit without echoing the model repr
# as the cell output.
hf_model = XLMRobertaModel.from_pretrained(
    hf_weights_name,
    output_hidden_states=True,
).eval()

In [3]:
# Sanity check: show the vocabulary ids of the subword pieces the tokenizer
# produces for the term of interest.
probe_pieces = hf_tokenizer.tokenize("communautarisme")
hf_tokenizer.convert_tokens_to_ids(probe_pieces)

[60013, 916, 867, 5071]

In [4]:
def get_subidx(x, y):
    '''
    Return the index in `x` at which the contiguous sub-sequence `y` first occurs.

    Used to get the start index of the subword sequence corresponding to the
    token of interest.

    Parameters
    ----------
    x : list
        Sequence to search in (e.g. the input ids of an encoded text window).
    y : list
        Sequence to look for (e.g. the ids of the term's subword pieces).
        Slices of `x` are compared to `y` with `==`, so both should be the
        same kind of sequence (two lists).  An empty `y` matches at index 0.

    Returns
    -------
    int
        Start index of the first match.

    Raises
    ------
    ValueError
        If `y` does not occur in `x`.
    '''
    l1, l2 = len(x), len(y)
    # A match cannot start later than l1 - l2, otherwise it would run past the
    # end of `x`; when `y` is longer than `x` the range is empty.
    for i in range(l1 - l2 + 1):
        if x[i:i+l2] == y:
            return i
    raise ValueError(f"sub-sequence {y!r} not found in sequence of length {l1}")

In [5]:
import string, re

# Extract, for every indexed document, one vector for the term `dterm`:
# the mean hidden state of the term's subword pieces inside a text window
# centred on the term's first occurrence in that document.

# communautarisme
basedir = "/Users/yukatherin/Downloads/vie-publique-discours"
index_fp = os.path.join(basedir, "filtered-by-term/communautarisme.tsv")
dterm = "communautarisme"
# Column 0 of the index holds each document's path relative to `basedir`.
# NOTE(review): the file is named .tsv but is parsed with read_csv's default
# separator -- confirm the index really has a single, comma-free column.
indexdf = pd.read_csv(index_fp, header=None)
# Number of regex-split pieces kept on each side of the term.  Because the
# split pattern captures the separators, words and separator runs alternate,
# so this is roughly half as many words.
window_size_tok = 17
# Which entry of `output.hidden_states` to read.  Entry 0 is the output of the
# embedding layer (not contextualised by the encoder), which is what the
# previous version of this cell used; the last entry is the output of the
# final encoder layer.  Vectors saved by earlier runs therefore differ.
hidden_layer = -1


# ids of the subword pieces that make up the term
dterm_converted_idx = hf_tokenizer.convert_tokens_to_ids(hf_tokenizer.tokenize(dterm))

# set up output; the two lists are kept index-aligned (one entry per kept document)
termfreq_hiddenstates = []
term_text_sequences = []

for relpath in indexdf.iloc[:, 0]:
    textpath = os.path.join(basedir, relpath)

    # 1-load the text (explicit encoding so the result does not depend on the
    # platform's default locale)
    with open(textpath, encoding="utf-8") as f:
        alltext = " ".join(f.readlines())

    # 2-find the first occurrence of the term; the text is split on runs of
    # non-word characters and the separators are kept as list items
    alltext_tok = re.split(r"(\W+)", alltext)
    try:
        dterm_ind = alltext_tok.index(dterm)
    except ValueError:
        # the term does not appear as a stand-alone piece in this document
        continue

    # 3-extract the window surrounding the term.  The start is clamped at 0:
    # a negative slice start would wrap around to the end of the list and
    # yield an empty or unrelated window for terms near the top of a document.
    window_start = max(0, dterm_ind - window_size_tok)
    text_seq = " ".join(alltext_tok[window_start: dterm_ind + window_size_tok + 1])

    # 4-encode the window and locate the term's subword pieces in it; this is
    # done before the forward pass so nothing is computed for skipped windows
    encoded_input = hf_tokenizer(text_seq, return_tensors='pt')
    try:
        term_hiddenstate_index = get_subidx(encoded_input.input_ids[0].tolist(), dterm_converted_idx)
    except ValueError:
        print(encoded_input.input_ids, dterm_converted_idx)
        print("should not happen..")
        continue
    print(term_hiddenstate_index, encoded_input.input_ids)

    # 5-run through the encoder; no gradients are needed for feature extraction
    with torch.no_grad():
        output = hf_model(
            **encoded_input,
            output_hidden_states=True,
        )

    # 6-average the hidden states of the term's subword pieces into one vector
    term_end_index = term_hiddenstate_index + len(dterm_converted_idx)
    hiddenstates_dterm = output.hidden_states[hidden_layer][0, term_hiddenstate_index: term_end_index, :]
    hiddenstates_dterm_mean = np.mean(hiddenstates_dterm.detach().numpy(), axis=0)

    # Append to both outputs together, and only for windows that produced a
    # vector, so that entry k of one list always describes entry k of the other.
    term_text_sequences.append(text_seq)
    termfreq_hiddenstates.append(hiddenstates_dterm_mean)

14 tensor([[     0,  11666,  82707,    152,   8459,  37517,   1255,     95,   5737,
            191,    920,  12282,    613,     95,  60013,    916,    867,   5071,
              6,      4,    501,    242,    437,   9896,    446,    613,  15324,
              6,      5,   3157,  62630,    674,  15324,      6,      5, 117619,
              2]])
18 tensor([[     0,    335,  76234,  60614, 113439,      7,    437,     20,    405,
             41,   3419,     31,  25331,  11666,  82707,  49878,     67,     95,
          60013,    916,    867,   5071,    705,    821, 105621,     20,   9846,
            441,  53966,    294,  60170,  23256,   9713,  11670,    108,  24855,
            452,      6,      5,    845,    108,      2]])
11 tensor([[    0,   501,   242,   437, 31423,  1103,   242,   211,   113,    10,
           115, 60013,   916,   867,  5071, 39100,   807,    21, 33590,     6,
             4,  1103,   242,   211,   113,    10,     2]])
16 tensor([[     0,  12723,   4360,      6,    

16 tensor([[     0,  38016,     19,  84319, 102103,      7,    578, 211955,     42,
            224, 105475,      7,      6,      4,   4953,     95,  60013,    916,
            867,   5071,   2045,    224, 187098,      7,   4099,  77064,  10999,
              6,      5,   8018,     96,    242,  82739,      6,      4,      2]])
13 tensor([[    0,   199,    24, 31521,     6,     4, 30639,   242, 46557,  3244,
          4372,  4385,    95, 60013,   916,   867,  5071,     8,  2168,  6276,
            18, 21460,     6,     4,  3244,  4372,  4385,   199, 13830, 46643,
         21460,     6,     4,     8, 24931,     2]])
10 tensor([[     0,    452,    773,  33590,    569,  10274,  73822,     13,    613,
             95,  60013,    916,    867,   5071,    300,    613,     21, 108821,
           1148, 105475,      7,      6,      5,    636,  94324,      2]])
13 tensor([[     0,  18727,  53068, 106126,    429,      6,      5, 128570,  11497,
           2064,  84758,     86,     95,  60013,    91

19 tensor([[     0,      6,      4,    108,   2045,    452,  31917,   4757,  19736,
              6,      5, 231282,  20880,  19412,  37437,    571,  63516,  26219,
            531,  60013,    916,    867,   5071,      6,      4, 115065,    572,
         122080, 135492,  26171,    104,      6,  17335,      6, 158857,    807,
             21, 163965,      2]])
12 tensor([[     0,      8,  43374,   2970,      6, 126391,      7,      6,  60880,
             21,  79414,    531,  60013,    916,    867,   5071,      6,      5,
            891,    437,  83384,   1103,      6,  17335,     22,  21412,     91,
              6,  17335,    211,      2]])
12 tensor([[     0,  31360,  10442,      8,     21,   4830,     13, 171149,    578,
              6,  99580,     51,  60013,    916,    867,   5071,    393, 187075,
            289,      6,      5,   1775,   3244,    104,      6,  17335,  97867,
           5325,   4099, 199200,      2]])
15 tensor([[     0,      6,  17335,  16380,  45556,      7, 

12 tensor([[     0,     20,    405,     41,    405, 117913,  21084,     40,   3537,
          11832,     56,    531,  60013,    916,    867,   5071,     82,    253,
             96,    242,      6, 199417,    705,   3775,      6,      4,    253,
             96,    242,  34685,    555,    329,      2]])
12 tensor([[     0,  29582,    253,     21, 123683,      6,      4,    104,    242,
            773,  13822,      8,  60013,    916,    867,   5071,      6,      5,
         104552,     95,  73969,     11,  15537,     20,  11789,     91,    242,
              6, 230630,    705, 112635,   1103,    242,      2]])
14 tensor([[     0,     57, 147707,      8, 108821,     42,      6,      4,     55,
         113721,      6,      4,  13993,     95,  60013,    916,    867,   5071,
              6,      4,   5736, 184160, 169112,    807,     95,   2147, 111451,
              8,     96,    242,      2]])
24 tensor([[     0,      6,      5,  84602,     62,  10528,   6387,      8,  12982,
         

15 tensor([[     0,      6,      4,     82,   8843,     21,  33590,  72358,      6,
              4,      6, 193785,     13,   1609,     95,  60013,    916,    867,
           5071,     82,     96,    242,  17578,  15032, 161838,    224,  33590,
              7,      6,      4,     51,  77064,      2]])
13 tensor([[     0,   3393,   2831,    104,    242,      6, 158356,   2533,     22,
              6,  34590,  18211,     95,  60013,    916,    867,   5071,      6,
              5,    845,     45,  12794,     41,     21,  18595,   2312,    437,
          11135,  77064,  26988,      2]])
18 tensor([[     0,    224, 188442,   1100,  20129,      7,    711,     62,  30121,
         199836,  25087, 141197,     20,  43908,  94324,   9655,      6,      4,
          60013,    916,    867,   5071,      6,      4,     98,    854,    152,
           1263,    711,    891,     10,   5325,   1492,   2312, 115963,      2]])
12 tensor([[     0,  14713,  59959,  32512,      6,      4,    291, 145520,  

11 tensor([[     0, 164492, 211955,    219,   5849,    199,  88162,    569, 128120,
            660,     95,  60013,    916,    867,   5071,      8, 115963, 185436,
           2819,    569,  50571,    660,     41,     96,    242,  17578,      2]])
11 tensor([[     0,     94,     95,  33422, 167125,    474,      8,     21, 171149,
          13993,     95,  60013,    916,    867,   5071,    340,      5,   2396,
             20,  17428,    563,  62056,  14255,      6,      5,     20,  19597,
          19488,      7,   3193,      7, 107768, 184534,     20,      2]])
12 tensor([[     0,     95,  63767,   5071,      6,      4,     82,    366,     95,
           4950,   1609,     95,  60013,    916,    867,   5071,      6,      5,
          77641,    152,  26521,  57112,      6,      4,  12349,    104,    242,
           4677,    115, 114824,  73396,    674,      6,      4,      2]])
12 tensor([[     0,   9942,   1609,    199,  88162,     44,      5,    636,  63767,
           5071,     82,  

12 tensor([[     0,    437,      6,      4,     98,    113,    437,    711, 128570,
            199,  67311,    115,  60013,    916,    867,   5071,      6,      5,
          68382,   5585,  20251,    437, 133197,    531,  26158,      6,      4,
            211,    653,    242,      2]])
17 tensor([[     0, 167125,      6,      4,     95,  29582,    224,  66320,  16681,
              6,      4,    569,     23,    590,    474,   1704,     95,  60013,
            916,    867,   5071,      6,      4,     82,    104,    242,  16380,
          77148,  28430,    674,  10442,    224, 143088,      6,      5,  26977,
            944,      2]])
10 tensor([[     0,     51,  12006,  18888, 122772,     82,    807,     95,  12006,
            115,  60013,    916,    867,   5071,      6,      4,    569,     91,
            242,  38271,    217,    807,    104,    242,  16380,  38440,      6,
              5, 183433,      2]])
16 tensor([[     0, 109769,    944,    253,     96,    242,      6,  67543, 

13 tensor([[     0,  72358, 121710,    807,    104,    242,  16380,  19859, 130523,
              7,      6,      4,     95,  60013,    916,    867,   5071,    437,
         190505,     82,   4360,  26277,    446,      6,      4,    366,  18632,
             22,      2]])
12 tensor([[     0,    242,     23,  48310,    674,  10174,    115, 156405,    613,
          39836,    628,    115,  60013,    916,    867,   5071,    705,   2022,
          78387,    219,     20,    808,     20,    211,     21,  36373,     14,
           1363,    224,  62649,      7,      8,      2]])
15 tensor([[     0,      6,      5,   2161,  17095,     51,  31164,    104,    242,
              6, 215930,   3537, 143637,   1212,   5736,  60013,    916,    867,
           5071,  50734,      6,      4,    773,      6, 229033,      8,  31164,
           4099, 128652,     13,      6,      4,   4099,      2]])
13 tensor([[     0,     45,  42182,    452,      6,      4,     41,     95,   3193,
          16867,  10999, 

12 tensor([[    0,   437, 51841,    51, 99054,    20,    36, 42182,    95,  5365,
            20,     8, 60013,   916,   867,  5071,     6,     4,     8, 46512,
         15032,   253,   456, 38648,     7,     8,    21,  9942,     6,     5,
           845,     2]])
13 tensor([[     0,  43517,     90,    253,     96,    242,      6,  67543,    578,
           2973,  42658,     56,     95,  60013,    916,    867,   5071,      6,
              5,   5813, 184534,     20,   1023,    115,  21293,  19183,      8,
          37135,      2]])
10 tensor([[     0,    366,  67193,  14834,  55475, 202350, 209508,     95,  40559,
            115,  60013,    916,    867,   5071,    705,    853,     76,   1674,
           7650,     86,   7814,     34,      8,   3453,  65985,    152,   3157,
          62630,    674,      6,      5,    313,    242,    437,      6,  68053,
           7092,   1236,      6,      5,      2]])
11 tensor([[    0,    95, 37517,  1255,  1806,  1609, 12847,   199, 54054,     7,
  

12 tensor([[     0,   1023,    158,      7,  47804,   7817,   3739,  72005,    773,
         125195,   5206,    115,  60013,    916,    867,   5071,    705,      6,
          67099,  71621,    335, 174444,   1062,  75559,  11670,  46923,   1103,
            242,    211,    113,     10,    224,      2]])
14 tensor([[     0,  15889,     91,    242,    437,   9219,     82,   1806,  43894,
              7, 119857,      6,      5,    992,  60013,    916,    867,   5071,
              6,      4,  19269,     13,     20,    580,  55519,   2970,   2393,
            365,     20, 105459,   5245,      6,      4,     91,    242,    393,
          13644,   1430,      2]])
15 tensor([[     0,     57,  51584,   3996,     86,     22,  56361,      8,     21,
           9392, 112113,      6,      4,    613,     95,  60013,    916,    867,
           5071,    705,    891,    108,   2064,     10,    452, 219595,   6868,
              6,      4,   1869,      6,      4,      2]])
10 tensor([[    0,     8,   

In [6]:
import joblib
# Persist the per-document term vectors and the text windows they were
# computed from.  Paths are derived from `basedir` and `dterm` (defined in the
# extraction cell) so they stay in sync with the data that was processed.
embeddings_dir = os.path.join(basedir, "embeddings")
# joblib.dump does not create missing directories
os.makedirs(embeddings_dir, exist_ok=True)
joblib.dump(termfreq_hiddenstates, os.path.join(embeddings_dir, f"{dterm}2.joblib"))
joblib.dump(term_text_sequences, os.path.join(embeddings_dir, f"{dterm}2_term_text_sequences.joblib"))



['/Users/yukatherin/Downloads/vie-publique-discours/embeddings/communautarisme2_term_text_sequences.joblib']

## Unmasking tokens

In [7]:

from transformers import pipeline

# Build a fill-mask pipeline and ask it which tokens it would put in place of
# <mask> in a sentence adapted from the corpus.
unmasker = pipeline(task='fill-mask', model='xlm-roberta-base')

# Source excerpt (French, quoted verbatim from the corpus) for the prompt below:
# discours_index_scraping-2//1036.txt:Et quel chemin on va mettre. Mais je tiens à dire une chose, il ne faut pas séparer, il ne faut pas mélanger les enjeux de séparatisme, de laïcité, je suis effectivement la ministre chargée d'assurer la neutralité des services publics, puisque au coeur du statut des fonctionnaires, il y a la laïcité, ça, c'est une chose. Et de l'autre, il y a un autre enjeu qui est comment dans notre pays toute notre jeunesse a accès à des métiers, qui sont ceux du service de l'intérêt général, qui ne peuvent pas être réservés à certains…
masked_sentence = "il ne faut pas séparer, il ne faut pas mélanger les enjeux de séparatisme, de <mask>, je suis effectivement la ministre chargée d'assurer la neutralité des services publics, puisque au coeur du statut des fonctionnaires"
unmasker(masked_sentence)


# Alternative prompts (currently disabled), each preceded by its source excerpt:
# discours_index_scraping-2//1045.txt:Eric DUPOND-MORETTI, le séparatisme, laïcité renforcée, projet de loi le 9 décembre, présenté par le gouvernement, l'ennemi, c'est l'islamisme ?
# unmasker("le séparatisme, <mask> renforcé, projet de loi le 9 décembre, présenté par le gouvernement, l'ennemi, c'est l'islamisme ?")

# les sénateurs, peuvent proposer des amendements pour modifier le titre, donc c'est un titre, pour la laïcité, pour renforcer les valeurs de la République
# unmasker("donc c'est un titre, pour le <mask>, pour renforcer les valeurs de la République")


[{'sequence': "il ne faut pas séparer, il ne faut pas mélanger les enjeux de séparatisme, de, je suis effectivement la ministre chargée d'assurer la neutralité des services publics, puisque au coeur du statut des fonctionnaires",
  'score': 0.08240126818418503,
  'token': 2,
  'token_str': '</s>'},
 {'sequence': "il ne faut pas séparer, il ne faut pas mélanger les enjeux de séparatisme, de collaboration, je suis effectivement la ministre chargée d'assurer la neutralité des services publics, puisque au coeur du statut des fonctionnaires",
  'score': 0.06204807385802269,
  'token': 133258,
  'token_str': 'collaboration'},
 {'sequence': "il ne faut pas séparer, il ne faut pas mélanger les enjeux de séparatisme, de démocratie, je suis effectivement la ministre chargée d'assurer la neutralité des services publics, puisque au coeur du statut des fonctionnaires",
  'score': 0.030399424955248833,
  'token': 234532,
  'token_str': 'démocratie'},
 {'sequence': "il ne faut pas séparer, il ne faut