# Gensim and Latent Dirichlet Allocation

In [40]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)

In [44]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import re, json, numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from gensim import models, corpora

# nltk.download('stopwords')

In [3]:
stop_words = set(nltk.corpus.stopwords.words('french'))
stemmer = nltk.stem.snowball.FrenchStemmer()

# Paremeters

In [4]:
# Number of topics
n_topics = 8

## Load preprocessed data

Data is already gathered in a single parquet files with selected columns about opinions (proposals):
- title and descriptions
- votes (number, positive, mitigate, negative)
- arguments (pros, cons)

In [5]:
df = pd.read_parquet('leVraiDebat-opinions.parquet')

  Numpy8 = numba.jitclass(spec8)(NumpyIO)
  Numpy32 = numba.jitclass(spec32)(NumpyIO)


**Take only proposals with more than 10 votes**

In [6]:
df_sel = df[df.contributions_votesCount >= 10]
df_sel.shape

(10766, 9)

In [7]:
df_sel.columns

Index(['master_tag', 'contribution_versions_title',
       'contribution_versions_bodyText', 'contributions_votesCount',
       'contributions_votesCountOk', 'contributions_votesCountMitige',
       'contributions_votesCountNok', 'contributions_argumentsCountFor',
       'contributions_argumentsCountAgainst'],
      dtype='object')

# Simple bag of word model 

Tokenize in words and compute word frequencies with Scikit-Learn

In [8]:
def compute_words_frequencies(texts):
    """ Remove punctuation, stop words, tokenize and compute frequencies """
    vect = CountVectorizer(analyzer='word', stop_words=stop_words).fit(texts)
    bag_of_words = vect.transform(texts)
    sum_words = bag_of_words.sum(axis=0) 
    freq = [(word, sum_words[0, idx]) for word, idx in vect.vocabulary_.items()]
    return bag_of_words, freq

In [9]:
bow, word_freqs = compute_words_frequencies(df_sel['contribution_versions_title'])

In [10]:
sorted(word_freqs, key=lambda x: x[1], reverse=True)[:10]

[('suppression', 468),
 ('plus', 341),
 ('tous', 333),
 ('retraites', 237),
 ('retraite', 235),
 ('france', 233),
 ('élus', 217),
 ('non', 215),
 ('supprimer', 214),
 ('fin', 204)]

In [11]:
type(bow), bow.shape

(scipy.sparse.csr.csr_matrix, (10766, 8551))

# Tokenize, lemmatize and cleanup

In [12]:
def tokenize_clean(text, stem_map):
    text = re.sub(r"[,;\.\?!:\-'\"/\(\)]+", ' ', text).lower()
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    tokens = filter(lambda w: w not in stop_words, tokens)
    # Filter short tokens (length < 3)
    tokens = filter(lambda w: len(w) > 2, tokens)
    final_tok = []
    for tok in tokens:
        stem = stemmer.stem(tok)
        # Keep the shortest word corresponding to the stem
        if stem in stem_map:
            if len(tok) < len(stem_map[stem]):
                stem_map[stem] = tok
        else:
            stem_map.update({stem: tok})
        final_tok.append(stem)
    return final_tok

In [13]:
stem_map = {}
tokens = df_sel['contribution_versions_title'].apply(tokenize_clean, args=[stem_map])

In [14]:
tokens.iloc[0]

['tirag', 'sort', 'vot', 'blanc']

In [15]:
dictionary = corpora.Dictionary(tokens)
corpus = tokens.apply(dictionary.doc2bow)
(type(dictionary), len(dictionary), type(corpus), corpus.shape)

(gensim.corpora.dictionary.Dictionary,
 5502,
 pandas.core.series.Series,
 (10766,))

In [16]:
corpus.iloc[0]

[(0, 1), (1, 1), (2, 1), (3, 1)]

# TF-IDF

- TF-IDF = Term frequency times inverse document frequency

In [17]:
tfidf = models.TfidfModel(list(corpus))

In [18]:
corpus_tfidf = tfidf[corpus]

In [19]:
next(corpus_tfidf.__iter__())

[(0, 0.48367984551600596),
 (1, 0.4233321886454536),
 (2, 0.6724132396813086),
 (3, 0.3670205719023794)]

#  LSI

- LSI = latent semantic indexing

In [20]:
model_lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)  # initialize an LSI transformation
corpus_lsi = model_lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

In [21]:
model_lsi.print_topics(n_topics)

[(0,
  '-0.971*"retrait" + -0.129*"index" + -0.082*"csg" + -0.082*"inflat" + -0.059*"an" + -0.056*"pension" + -0.049*"revaloris" + -0.043*"salair" + -0.038*"tous" + -0.038*"élus"'),
 (1,
  '0.382*"fiscal" + 0.374*"suppress" + 0.288*"vot" + 0.226*"salair" + 0.165*"tax" + 0.155*"justic" + 0.153*"immigr" + 0.145*"tous" + 0.143*"élus" + 0.137*"impôt"'),
 (2,
  '0.913*"immigr" + 0.212*"vot" + -0.203*"fiscal" + -0.129*"suppress" + 0.085*"blanc" + 0.079*"obligatoir" + -0.075*"justic" + -0.059*"salair" + -0.054*"tax" + -0.048*"fraud"'),
 (3,
  '0.716*"vot" + -0.353*"immigr" + 0.292*"blanc" + -0.279*"fiscal" + 0.279*"obligatoir" + -0.121*"suppress" + 0.102*"droit" + -0.099*"justic" + 0.098*"compt" + 0.082*"pris"'),
 (4,
  '0.577*"fiscal" + -0.523*"salair" + -0.186*"élus" + 0.181*"vot" + 0.178*"justic" + -0.171*"augment" + 0.138*"fraud" + -0.122*"vi" + 0.120*"nich" + 0.119*"évas"'),
 (5,
  '-0.566*"suppress" + 0.480*"salair" + -0.320*"tax" + 0.245*"fiscal" + 0.173*"justic" + 0.158*"europ" + -0.1

In [22]:
next(corpus_lsi.__iter__())

[(0, -0.003000854374886836),
 (1, 0.1800590547562096),
 (2, 0.12091227374977637),
 (3, 0.40913069092361465),
 (4, 0.10551468892350148),
 (5, 0.053888461376503244),
 (6, -0.017683437239955975),
 (7, -0.053811371534191826)]

## LDA

- LDA = Latent Dirichlet Allocation

It is an alternative to LSI

In [23]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 10
iterations = 400
eval_every = 10  # None # Don't evaluate model perplexity, takes too much time.

model_lda = models.LdaModel(
    corpus=corpus_tfidf,
    # Make a index to word dictionary.
    id2word=dictionary.id2token,
    chunksize=2000,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=n_topics,
    passes=passes,
    eval_every=eval_every
)

In [27]:
top_topics = model_lda.top_topics(corpus_lsi) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -0.5817.
[([(0.020474954, 'franc'),
   (0.016864885, 'publiqu'),
   (0.016292073, 'animal'),
   (0.015947426, 'gratuit'),
   (0.014196418, 'europ'),
   (0.011098131, 'vi'),
   (0.010521832, 'fair'),
   (0.010080845, 'handicap'),
   (0.009545414, 'état'),
   (0.009489485, 'retour'),
   (0.009081237, 'carbur'),
   (0.009035756, 'financ'),
   (0.009024999, 'nouvel'),
   (0.008571795, 'fonction'),
   (0.008537779, 'mettr'),
   (0.008234618, 'train'),
   (0.00818418, 'plac'),
   (0.008182746, 'isf'),
   (0.008101468, 'non'),
   (0.0076976814, 'person')],
  0.0),
 ([(0.02426842, 'retrait'),
   (0.021148501, 'justic'),
   (0.019564627, 'chass'),
   (0.016613975, 'an'),
   (0.015638296, 'contrôl'),
   (0.012035295, 'alloc'),
   (0.011106583, 'familial'),
   (0.010849664, 'techniqu'),
   (0.010209721, 'bas'),
   (0.009569257, 'jaun'),
   (0.0094048465, 'subvent'),
   (0.009135148, 'gilet'),
   (0.008898271, 'linky'),
   (0.008711955, 'fraud'),
   (0.008226878, 'réform')

In [59]:
all_scores = np.array(list(map(lambda t: list(map(lambda tt: tt[0], t[0])), top_topics)))

In [62]:
top_topics_reworked = []
scale = 50 / all_scores.max()
for topic in top_topics:
    topic_reworked = [{'size': word_duplet[0] * scale, 'text' : stem_map[word_duplet[1]]} for word_duplet in topic[0]]
    top_topics_reworked.append(topic_reworked)

In [63]:
pprint(top_topics_reworked)

[[{'size': 18.156777106393918, 'text': 'franc'},
  {'size': 14.955440571296242, 'text': 'publique'},
  {'size': 14.447482574797306, 'text': 'animal'},
  {'size': 14.141856461720053, 'text': 'gratuit'},
  {'size': 12.58909821045423, 'text': 'europe'},
  {'size': 9.841599052024046, 'text': 'vie'},
  {'size': 9.330548967800993, 'text': 'faire'},
  {'size': 8.939490717294978, 'text': 'handicap'},
  {'size': 8.464680991357682, 'text': 'état'},
  {'size': 8.415084531930265, 'text': 'retour'},
  {'size': 8.05305837537879, 'text': 'carburant'},
  {'size': 8.012726616829877, 'text': 'finance'},
  {'size': 8.003187724411735, 'text': 'nouvel'},
  {'size': 7.601295604577479, 'text': 'fonction'},
  {'size': 7.571130405826604, 'text': 'mettre'},
  {'size': 7.302293859992514, 'text': 'train'},
  {'size': 7.2575659521499425, 'text': 'place'},
  {'size': 7.256294925705655, 'text': 'isf'},
  {'size': 7.184218889418553, 'text': 'non'},
  {'size': 6.8261491020557425, 'text': 'personne'}],
 [{'size': 21.52

In [64]:
with open('topics.json', 'w') as f:
    json.dump(top_topics_reworked, f)

In [25]:
corpus_lda = model_lda[corpus] 

In [26]:
for doc, as_text in zip(corpus_lda, df_sel['contribution_versions_title'][:5]):
    print(doc, as_text)

[(0, 0.22027753), (1, 0.043466527), (2, 0.08856979), (3, 0.30207163), (4, 0.079111174), (5, 0.06400101), (6, 0.045604676), (7, 0.1568977)] Tirage au sort pour les Votes blancs
[(0, 0.1011175), (1, 0.058582455), (2, 0.11937088), (3, 0.40691215), (4, 0.105146036), (5, 0.08625802), (6, 0.061464164), (7, 0.061148833)] PAS D'ANONYMAT DANS L'ABSTENTION
[(0, 0.43552265), (1, 0.018058907), (2, 0.16328576), (3, 0.12573983), (4, 0.08609264), (5, 0.1334559), (6, 0.018985784), (7, 0.0188585)] Pic-Ric --- Action pour une organisation du mouvement par vote (mise en place immédiat du 1er RIC) et création d'un espace conviviale et pacifiste
[(0, 0.16035853), (1, 0.021416375), (2, 0.27556804), (3, 0.14890605), (4, 0.108738974), (5, 0.10521894), (6, 0.08611774), (7, 0.09367536)] Organiser des Référendums pour toutes les questions importantes (idem nos voisins suisses) et que le résultat soit pris en compte et respecté.
[(0, 0.18998747), (1, 0.037034117), (2, 0.18478711), (3, 0.2571339), (4, 0.13097402),

## References and inspirations

- https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html#sphx-glr-auto-examples-core-run-topics-and-transformations-py
- https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html#sphx-glr-auto-examples-tutorials-run-lda-py
- https://towardsdatascience.com/topic-modeling-with-latent-dirichlet-allocation-by-example-3b22cd10c835
