# Testing the Topic Modelling using the Gensim Library

Usual imports come first.

In [1]:
import pandas as pd
import glob
import os
import numpy as np
from time import time
import logging
import gensim
import bz2



# 1. Load the data from the Transcript files
At the moment, we only consider the entries for which the field `LanguageOfText` is `FR`, namely the ones in French. We will consider the text in German later on. We show below the first two examples of the texts we consider.

In [2]:
# Collect the text of every French transcript entry found in the CSV exports.
dataset = []

path = 'datas/Transcript/'
# glob returns files in arbitrary, OS-dependent order; sorting makes dataset[i]
# refer to the same entry on every run and on every machine.
allFiles = sorted(glob.glob(os.path.join(path, '*.csv')))

for file_ in allFiles:
    data = pd.read_csv(file_)
    # Keep the rows that actually have a text (not NaN) and are flagged as French.
    is_french_text = data['Text'].notna() & (data['LanguageOfText'] == 'FR')
    # extend() appends in place instead of rebuilding the whole list for each file.
    dataset.extend(data.loc[is_french_text, 'Text'].values)

print('Length of the dataset', len(dataset))
# Show (up to) the first two entries. Slicing avoids an IndexError when fewer
# than two texts were loaded; with two entries the output is unchanged.
print(*dataset[:2], sep=' \n ')

Length of the dataset 15353
La délégation vous propose de prendre acte de son rapport écrit.
[VS]
Vom Bericht wird Kenntnis genommen
Il est pris acte du rapport
 
 La première chose que j'aimerais rappeler à cette assemblée, c'est que l'initiative populaire dont nous débattons aujourd'hui est soutenue par le Conseil fédéral qui représente tout de même l'ensemble des colorations politiques de ce Parlement - (Brouhaha) je dis bien l'ensemble puisque, effectivement, nous avons deux conseillers fédéraux UDC qui, forcément, soutiennent cette initiative. La majorité des parlementaires, la majorité des cantons également soutiennent le principe d'une limitation du droit de recours des organisations.
Même si l'initiative est acceptée, ce droit de recours des organisations contre les décisions administratives continuera d'exister. Lors des décisions prises par le peuple et/ou par les représentants du peuple que sont les parlements, les organisations sont certes privées du droit de recours, mais 

The length of the transcripts varies widely from one entry to another, but they reflect exactly what is discussed at the federal parliament. Processing them correctly will allow us to grasp the topics which are discussed at the parliament.

# 2. Format the data in order to use LDA with Gensim
First of all, we load the `stop_words`, a list of the most common French words, which we must not take into account when doing the topic modelling, as they do not convey any useful information. The pipeline we follow is the following:
 1. Load the `stop_words`
 2. Remove those common words and tokenize our dataset (break it down into words) 
 3. We count the frequency of the words and remove the ones that appear only once in total.
 4. Implement the *Stemming* of the data (cf. [a French stemming algorithm](http://snowball.tartarus.org/algorithms/french/stemmer.html)). (Done with the [nltk](http://www.nltk.org/api/nltk.stem.html) library)
 5.  Remove all the words of length <= 2.
 
 **N.B. THIS ALGORITHM IS VERY SLOW !!!!**

In [17]:
## First of all we load the stop_words list
import re
from stop_words import get_stop_words

# German stop words because there are some German words even in the French transcripts
stop_words_de = get_stop_words('de')

# Loading the custom French stop-words list (one comma-separated file).
# NOTE(review): the file is read with the platform default encoding; pass
# encoding=... explicitly once the encoding of the file is confirmed.
with open("French_stop_words_changed.txt", "r") as myfile:
    stop_words = myfile.read()
# Strip the whitespace/newlines around each entry: an entry such as ' le' or
# 'les\n' could otherwise never match a token. Empty entries are dropped.
stop_words = [entry.strip() for entry in stop_words.split(',') if entry.strip()]

stop_words = stop_words_de + stop_words
# Membership in a list is a linear scan performed for every single token;
# a frozenset gives constant-time lookups with the same result.
stop_words_set = frozenset(stop_words)

## Secondly we remove the common words in our document corpus and tokenize
# We split at any of the following characters:
# ' ' - ''' (apostrophe) - '\n' - '(' - ')' - '[' - ']' - ',' - ';' - ':' - '.'
# (raw string + character class: same separators as before, without the
# invalid '\(' / '\.' string escapes).
TOKEN_SEPARATORS = re.compile(r"[ '\n()\[\],;:.]")
# Words shorter than 3 letters are very unlikely to provide any information.
MIN_TOKEN_LENGTH = 3

tokenized_texts = [[word for word in TOKEN_SEPARATORS.split(document.lower())
                    if len(word) >= MIN_TOKEN_LENGTH and word not in stop_words_set]
                   for document in dataset]

# Thirdly we stem every token and remove the stems that appear only once in
# the whole corpus (the count is global, not per document).
from collections import defaultdict
from nltk.stem.snowball import FrenchStemmer
FS = FrenchStemmer()

# Stemming is the expensive step: stem each distinct token once and reuse the
# result, instead of re-stemming every occurrence several times.
stem_of = {}
stemmed_texts = []
for text in tokenized_texts:
    stems = []
    for token in text:
        if token not in stem_of:
            stem_of[token] = FS.stem(token)
        stems.append(stem_of[token])
    stemmed_texts.append(stems)

# Corpus-wide number of occurrences of each stem.
frequency = defaultdict(int)
for stems in stemmed_texts:
    for stem in stems:
        frequency[stem] += 1

texts = [[stem for stem in stems if frequency[stem] > 1]
         for stems in stemmed_texts]

print(texts[0:2])

[['déleg', 'acte', 'écrit', 'bericht', 'kenntn', 'genommen', 'acte'], ['rappel', 'assembl', 'populair', 'débatton', 'soutenu', 'représent', 'color', 'polit', 'brouhah', 'puisqu', 'effect', 'conseiller', 'fédéral', 'udc', 'forc', 'soutiennent', 'parlementair', 'soutiennent', 'princip', 'limit', 'recour', 'organis', 'accept', 'recour', 'organis', 'décis', 'administr', 'continu', 'exist', 'lor', 'décis', 'pris', 'et/ou', 'représent', 'parl', 'organis', 'cert', 'priv', 'recour', 'disposent', 'arme', 'puiss', 'référendum', 'autor', 'priv', 'continuent', 'dispos', 'recour', 'sort', 'préserv', 'maintien', 'etat', 'plein', 'garant', 'mêm', 'autor', 'disposent', 'arsenal', 'législ', 'complet', 'protect', 'environ', 'natur', 'paysag', 'aménag', 'territoir', 'eau', 'lois', 'permettent', 'regl', 'exhaust', 'problem', 'concernent', 'environ', 'autor', 'but', 'mission', 'respect', 'autor', 'respectent', 'foi', 'populair', 'rappel', 'direct', 'applic', 'garant', 'sain', 'appliqu', 'respect', 'volont'

# 3. Perform the LDA topic modelling and print the results.

Formatting the data into a dictionary and a corpus, which are the inputs required by the `LdaModel` class of Gensim.

In [18]:
# Vocabulary built from the preprocessed documents.
dictionary = gensim.corpora.Dictionary(texts)
# Bag-of-words encoding: every document is converted to a list of
# (word_id, word_frequency) 2-tuples.
corpus = list(map(dictionary.doc2bow, texts))

Note that in the algorithm below, we need to choose the number of topics, which is the number of clusters of data that we want to find. The quality of the resulting topics depends a lot on picking a good number of topics.

In [19]:
# Fit a 6-topic LDA model with a single pass over the corpus.
# LDA training is randomised: random_state pins the seed so that re-running
# the cell on the same corpus gives the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=6,
    id2word=dictionary,
    passes=1,
    random_state=42,
)
ldamodel.print_topics()

[(0,
  '0.010*"polit" + 0.007*"européen" + 0.006*"certain" + 0.005*"international" + 0.005*"etat" + 0.005*"travail" + 0.005*"concern" + 0.004*"union" + 0.004*"relat" + 0.004*"droit"'),
 (1,
  '0.006*"mesur" + 0.005*"certain" + 0.005*"médecin" + 0.004*"protect" + 0.004*"national" + 0.004*"tribunal" + 0.004*"domain" + 0.004*"pénal" + 0.004*"rejet" + 0.004*"concern"'),
 (2,
  '0.010*"fiscal" + 0.005*"impôt" + 0.005*"imposit" + 0.005*"entrepris" + 0.005*"system" + 0.005*"économ" + 0.004*"certain" + 0.004*"etat" + 0.004*"rent" + 0.004*"taux"'),
 (3,
  '0.008*"etat" + 0.007*"trait" + 0.006*"national" + 0.006*"voix" + 0.006*"concern" + 0.006*"décis" + 0.006*"adopt" + 0.005*"enfant" + 0.005*"propos" + 0.005*"disposit"'),
 (4,
  '0.020*"franc" + 0.013*"million" + 0.007*"économ" + 0.007*"augment" + 0.006*"développ" + 0.005*"financ" + 0.005*"dépens" + 0.005*"mesur" + 0.005*"budget" + 0.005*"milliard"'),
 (5,
  '0.011*"assur" + 0.006*"coût" + 0.006*"system" + 0.006*"mesur" + 0.005*"financ" + 0.005

In [22]:
# Fit a 20-topic LDA model with a single pass over the corpus (this replaces
# the previously fitted model held in `ldamodel`).
# LDA training is randomised: random_state pins the seed so that re-running
# the cell on the same corpus gives the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=20,
    id2word=dictionary,
    passes=1,
    random_state=42,
)
ldamodel.print_topics()

[(0,
  '0.027*"franc" + 0.013*"financ" + 0.013*"million" + 0.011*"milliard" + 0.009*"augment" + 0.008*"dépens" + 0.008*"financi" + 0.008*"transport" + 0.006*"mont" + 0.006*"budget"'),
 (1,
  '0.013*"pénal" + 0.011*"armé" + 0.011*"civil" + 0.011*"cod" + 0.010*"victim" + 0.009*"militair" + 0.007*"certain" + 0.007*"acte" + 0.007*"interdict" + 0.006*"infract"'),
 (2,
  '0.011*"produit" + 0.011*"agricol" + 0.010*"agricultur" + 0.009*"prix" + 0.007*"déleg" + 0.006*"alimentair" + 0.006*"polit" + 0.006*"représent" + 0.006*"product" + 0.006*"import"'),
 (3,
  '0.016*"autor" + 0.013*"pénal" + 0.012*"banqu" + 0.009*"etat" + 0.008*"inform" + 0.007*"tribunal" + 0.007*"surveil" + 0.007*"financi" + 0.007*"administr" + 0.005*"system"'),
 (4,
  '0.013*"asil" + 0.012*"travail" + 0.012*"mesur" + 0.008*"étranger" + 0.007*"integr" + 0.007*"dur" + 0.007*"certain" + 0.005*"social" + 0.005*"révis" + 0.005*"économ"'),
 (5,
  '0.016*"social" + 0.012*"travail" + 0.012*"salair" + 0.010*"assur" + 0.009*"rent" + 0.