# Gensim and Latent Dirichlet Allocation

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)

In [2]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import re, json, numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from gensim import models, corpora

# nltk.download('stopwords')

In [3]:
stop_words = set(nltk.corpus.stopwords.words('french'))
stemmer = nltk.stem.snowball.FrenchStemmer()

# Paremeters

In [4]:
# Number of topics
n_topics = 8

## Load preprocessed data

Data is already gathered in a single parquet files with selected columns about opinions (proposals):
- title and descriptions
- votes (number, positive, mitigate, negative)
- arguments (pros, cons)

In [99]:
df = pd.read_parquet('leVraiDebat-opinions.parquet')

**Take only proposals with more than 10 votes**

In [100]:
df_sel = df[df.contributions_votesCount >= 10]
df_sel.shape

(10766, 10)

In [101]:
df_sel.columns

Index(['master_tag', 'contributions_id', 'contribution_versions_title',
       'contribution_versions_bodyText', 'contributions_votesCount',
       'contributions_votesCountOk', 'contributions_votesCountMitige',
       'contributions_votesCountNok', 'contributions_argumentsCountFor',
       'contributions_argumentsCountAgainst'],
      dtype='object')

# Simple bag of word model 

Tokenize in words and compute word frequencies with Scikit-Learn

In [8]:
def compute_words_frequencies(texts):
    """ Remove punctuation, stop words, tokenize and compute frequencies """
    vect = CountVectorizer(analyzer='word', stop_words=stop_words).fit(texts)
    bag_of_words = vect.transform(texts)
    sum_words = bag_of_words.sum(axis=0) 
    freq = [(word, sum_words[0, idx]) for word, idx in vect.vocabulary_.items()]
    return bag_of_words, freq

In [31]:
bow, word_freqs = compute_words_frequencies(df_sel['contribution_versions_bodyText']) #df_sel['contribution_versions_title'])

In [32]:
sorted(word_freqs, key=lambda x: x[1], reverse=True)[:10]

[('plus', 5965),
 ('être', 3292),
 ('tous', 2524),
 ('faire', 2200),
 ('tout', 2096),
 ('france', 2039),
 ('comme', 1846),
 ('cette', 1823),
 ('sans', 1710),
 ('faut', 1709)]

In [33]:
type(bow), bow.shape

(scipy.sparse.csr.csr_matrix, (10766, 36216))

# Tokenize, lemmatize and cleanup

In [34]:
def tokenize_clean(text, stem_map):
    text = re.sub(r"[,;\.\?!:\-'\"/\(\)]+", ' ', text).lower()
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    tokens = filter(lambda w: w not in stop_words, tokens)
    # Filter short tokens (length < 3)
    tokens = filter(lambda w: len(w) > 2, tokens)
    final_tok = []
    for tok in tokens:
        stem = stemmer.stem(tok)
        # Keep the shortest word corresponding to the stem
        if stem in stem_map:
            if len(tok) < len(stem_map[stem]):
                stem_map[stem] = tok
        else:
            stem_map.update({stem: tok})
        final_tok.append(stem)
    return final_tok

In [53]:
stem_map = {}
tokens = df_sel['contribution_versions_bodyText'].apply(tokenize_clean, args=[stem_map]) # contribution_versions_title

In [54]:
tokens.iloc[0]

['vot',
 'blanc',
 'nul',
 'donnent',
 'droit',
 'élus',
 'tirag',
 'sort',
 'list',
 'électoral',
 'exigent',
 'tir',
 'sort',
 'avoir',
 'vot',
 'être',
 'accord',
 'altern',
 'femm',
 'hommecet',
 'modal',
 'appliqu',
 'tout',
 'élect']

In [55]:
dictionary = corpora.Dictionary(tokens)
corpus = tokens.apply(dictionary.doc2bow)
(type(dictionary), len(dictionary), type(corpus), corpus.shape)

(gensim.corpora.dictionary.Dictionary,
 21151,
 pandas.core.series.Series,
 (10766,))

In [56]:
corpus.iloc[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 2),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 2),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1)]

# TF-IDF

- TF-IDF = Term frequency times inverse document frequency

In [57]:
tfidf = models.TfidfModel(list(corpus))

In [58]:
corpus_tfidf = tfidf[corpus]

In [59]:
next(corpus_tfidf.__iter__())

[(0, 0.1685044479789146),
 (1, 0.21631486497120403),
 (2, 0.15842368976675686),
 (3, 0.11787815534573075),
 (4, 0.18490425325506987),
 (5, 0.2684397000973235),
 (6, 0.11518919872132773),
 (7, 0.25537145611030976),
 (8, 0.17811087419595098),
 (9, 0.432845753035672),
 (10, 0.1945145531807684),
 (11, 0.2523625398498655),
 (12, 0.23681448721712134),
 (13, 0.2748202668186016),
 (14, 0.18536358722256044),
 (15, 0.2382273838448666),
 (16, 0.06722287043004423),
 (17, 0.2494090732486327),
 (18, 0.15304766510804282),
 (19, 0.20774001062600111),
 (20, 0.13586809834482402),
 (21, 0.07748457223941105)]

#  LSI

- LSI = latent semantic indexing

In [60]:
model_lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)  # initialize an LSI transformation
corpus_lsi = model_lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

In [61]:
model_lsi.print_topics(n_topics)

[(0,
  '0.167*"plus" + 0.153*"retrait" + 0.142*"franc" + 0.136*"tout" + 0.130*"être" + 0.119*"tous" + 0.106*"citoyen" + 0.105*"salair" + 0.100*"impôt" + 0.098*"vot"'),
 (1,
  '-0.674*"retrait" + -0.258*"index" + -0.224*"csg" + -0.194*"inflat" + 0.178*"vot" + -0.153*"pension" + -0.145*"augment" + -0.129*"salair" + 0.104*"élect" + 0.095*"citoyen"'),
 (2,
  '-0.381*"vot" + -0.280*"retrait" + -0.253*"élect" + -0.179*"déput" + 0.179*"impôt" + 0.178*"tax" + -0.177*"blanc" + -0.167*"mandat" + 0.142*"fiscal" + -0.139*"assembl"'),
 (3,
  '0.302*"impôt" + 0.274*"tax" + 0.254*"fiscal" + -0.236*"enfant" + 0.209*"tva" + 0.199*"produit" + 0.189*"vot" + 0.135*"élect" + 0.130*"suppress" + -0.128*"écol"'),
 (4,
  '0.411*"salair" + -0.358*"vot" + 0.198*"président" + 0.192*"déput" + -0.185*"blanc" + 0.181*"fonctionnair" + 0.180*"élus" + 0.177*"mandat" + 0.175*"avantag" + -0.165*"retrait"'),
 (5,
  '-0.342*"vot" + -0.276*"salair" + -0.212*"blanc" + 0.202*"franc" + -0.179*"enfant" + 0.168*"europ" + 0.162*"

In [62]:
next(corpus_lsi.__iter__())

[(0, 0.1348633016600394),
 (1, 0.12795015157825454),
 (2, -0.2349694901724251),
 (3, 0.07786074507526446),
 (4, -0.13428832890831666),
 (5, -0.10558403834894382),
 (6, -0.01393762408604308),
 (7, 0.0421819250643696)]

## LDA

- LDA = Latent Dirichlet Allocation

It is an alternative to LSI

In [63]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 10
iterations = 400
eval_every = 10  # None # Don't evaluate model perplexity, takes too much time.

model_lda = models.LdaModel(
    corpus=corpus_tfidf,
    # Make a index to word dictionary.
    id2word=dictionary.id2token,
    chunksize=2000,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=n_topics,
    passes=passes,
    eval_every=eval_every
)

In [64]:
top_topics = model_lda.top_topics(corpus_lsi) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: 0.0000.
[([(0.0010049989, 'loup'),
   (0.00073100405, 'suprim'),
   (0.0007097097, 'polut'),
   (0.000648844, 'accetuent'),
   (0.0006225631, 'chéqui'),
   (0.0005979181, 'gafam'),
   (0.00048847497, 'assang'),
   (0.00046044556, 'détentric'),
   (0.00044841712, 'vehicul'),
   (0.00042725384, 'exhorbit'),
   (0.0004053292, 'perso'),
   (0.0003859664, 'financebien'),
   (0.0003859664, 'calculet'),
   (0.00036922356, 'déport'),
   (0.00033569333, 'proportionnal'),
   (0.00030790886, 'trap'),
   (0.0001953721, 'dom'),
   (0.00016925643, 'tom'),
   (0.00016423094, 'publicitair'),
   (0.00013948757, 'golf')],
  0.0),
 ([(0.008044101, 'radar'),
   (0.002212101, 'dépendr'),
   (0.0017695649, 'suspendr'),
   (0.0014891364, 'poul'),
   (0.0010300595, 'pain'),
   (0.00081653113, 'different'),
   (0.0007247895, 'cliniqu'),
   (0.0006720397, 'epargn'),
   (0.00054665306, 'monetair'),
   (0.00046617186, 'ras'),
   (0.00043776163, 'reindex'),
   (0.0004236827, 'deductibl'),


In [65]:
all_scores = np.array(list(map(lambda t: list(map(lambda tt: tt[0], t[0])), top_topics)))

### Transform stems in closest shortest word and prepare for JSON output

In [88]:
top_topics_reworked = []
font_scale = 50 / all_scores.max()
font_size_min = 9
font_size_max = 36
max_words = 10
for topic in top_topics:
    topic_reworked = [{'size': min(max(word_duplet[0] * font_scale, font_size_min), font_size_max), 'text' : stem_map[word_duplet[1]]} for word_duplet in topic[0][:max_words]]
    top_topics_reworked.append(topic_reworked)

In [89]:
pprint(top_topics_reworked)

[[{'size': 9, 'text': 'loups'},
  {'size': 9, 'text': 'suprimés'},
  {'size': 9, 'text': 'polution'},
  {'size': 9, 'text': 'accetuent'},
  {'size': 9, 'text': 'chéquier'},
  {'size': 9, 'text': 'gafam'},
  {'size': 9, 'text': 'assange'},
  {'size': 9, 'text': 'détentrice'},
  {'size': 9, 'text': 'vehicule'},
  {'size': 9, 'text': 'exhorbitant'}],
 [{'size': 34.3787352342942, 'text': 'radar'},
  {'size': 9.454036960624705, 'text': 'dépendre'},
  {'size': 9, 'text': 'suspendre'},
  {'size': 9, 'text': 'poule'},
  {'size': 9, 'text': 'pain'},
  {'size': 9, 'text': 'different'},
  {'size': 9, 'text': 'clinique'},
  {'size': 9, 'text': 'epargne'},
  {'size': 9, 'text': 'monetaire'},
  {'size': 9, 'text': 'ras'}],
 [{'size': 9, 'text': 'tirons'},
  {'size': 9, 'text': 'aix'},
  {'size': 9, 'text': 'a9hicule_'},
  {'size': 9, 'text': 'a0_air_comprim'},
  {'size': 9, 'text': 'mayotte'},
  {'size': 9, 'text': 'redinamiser'},
  {'size': 9, 'text': 'alsace'},
  {'size': 9, 'text': 'moselle'},
  

In [90]:
with open('topics.json', 'w') as f:
    json.dump(top_topics_reworked, f)

In [69]:
corpus_lda = model_lda[corpus] 

In [91]:
for doc, as_text in zip(corpus_lda, df_sel['contribution_versions_title'][:10]):
    print(doc, as_text)

[(3, 0.9065392), (5, 0.07001161), (6, 0.013914187)] Tirage au sort pour les Votes blancs
[(3, 0.9277998), (5, 0.031275228), (6, 0.0317344)] PAS D'ANONYMAT DANS L'ABSTENTION
[(3, 0.9316617), (5, 0.029754499), (6, 0.037294496)] Pic-Ric --- Action pour une organisation du mouvement par vote (mise en place immédiat du 1er RIC) et création d'un espace conviviale et pacifiste
[(3, 0.9299177), (5, 0.040551785), (6, 0.017493946)] Organiser des Référendums pour toutes les questions importantes (idem nos voisins suisses) et que le résultat soit pris en compte et respecté.
[(3, 0.9264364), (5, 0.04256608), (6, 0.018362926)] Avoir une formation à la démocratie à l'école (Maternelle à l'Université) dans la vie associative et la vie courante
[(3, 0.92259043), (5, 0.044791408), (6, 0.019322945)] Avant chaque vote, le député organise un débat local sur la loi
[(3, 0.9245579), (5, 0.043653093), (6, 0.01883184)] "Sanctionner financièrement le non vote à tous les scrutins,   ( votes rendus obligatoires)"

## References and inspirations

- https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html#sphx-glr-auto-examples-core-run-topics-and-transformations-py
- https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html#sphx-glr-auto-examples-tutorials-run-lda-py
- https://towardsdatascience.com/topic-modeling-with-latent-dirichlet-allocation-by-example-3b22cd10c835


In [161]:
corpus_width = [len(x) for x in corpus_lda]
np.max(corpus_width)

In [164]:
docs_for_topics = [sum([list(map(lambda y: (t, y[0][i][1], y[1], y[2]), filter(lambda x: len(x[0]) > i and x[0][i][0] == t, zip(corpus_lda, df_sel['contribution_versions_title'], df_sel['contribution_versions_bodyText'])))) for i in range(4)], []) for t in range(n_topics)]

In [165]:
df_for_topics = pd.DataFrame(sum(docs_for_topics, []), columns=['topic', 'score', 'title', 'body'])

In [166]:
df_for_topics.head()

Unnamed: 0,topic,score,title,body
0,0,0.052258,salaires hauts fonctionaires (exemple:chantal ...,revoir les salaires exhorbitants des hauts fon...
1,0,0.031012,introduction de votes proportionnels aux légis...,Introduire une proportionnalité aux élections ...
2,0,0.033588,STOP à : MOI JE VEUX UNE AIDE POUR...,On remarque l’énorme décalage entre les demand...
3,0,0.031979,Proportionnel,Mettre une dose de proportionnalité de 20%
4,0,0.010713,Finir le travail du CNR,Appel des Résistants aux jeunes générations du...


In [168]:
df_for_topics[df_for_topics.topic == 2].sort_values('score', ascending=False).head(10)

Unnamed: 0,topic,score,title,body
190,2,0.191916,Suppression de la CSG et redistribution solidaire,Suppression de la CSG +100 mdsSuppression des ...
177,2,0.187459,Suppression des niches fiscales et De la CSG,Suppression de la CSG +100 mdsSuppression des ...
176,2,0.070888,Concordats,Abrogation des Concordats d'Alsace-Moselle et ...
175,2,0.063484,Concordats,Abrogation des concordats d'Alsace-Moselle et ...
187,2,0.060509,"« L’Etat chez lui, L’Eglise chez elle » disai...",Abrogation du Statut clérical d’exception d’Al...
192,2,0.057602,Developper la voiture a air comprimé,https://fr.wikipedia.org/wiki/V%C3%A9hicule_%C...
184,2,0.047641,Etendre le respect du principe de laïcité à to...,Il s'agit d'abroger le régime concordataire qu...
173,2,0.04378,Ajout de points de TVA sur les produits de lux...,https://www.toute-la-franchise.com/vie-de-la-f...
186,2,0.038703,Référendum,Un référendum avait été organisé à Mayotte po...
178,2,0.038333,MAYOTTE ET LES COMMORES,Le gouvernement de Mayotte refuse depuis 6 moi...
