# 01. Topic Modelling using the Gensim Library

Usual imports come first.

In [None]:
import pandas as pd
import glob
import os
import numpy as np
from time import time
import logging
import gensim
import bz2

## 1. Load the data from the Transcript files
At the moment, we only consider the entries for which the field `LanguageOfText` is `FR`, namely the ones in French. We will consider the text in German later on. We show below one example of the text we consider.

In [None]:
# Gather the non-missing 'Text' entries of every transcript file matching
# 'FR*.csv' into one flat list of documents.
dataset = []

path = '../datas/treated_data/Transcript/'
# glob returns files in an unspecified order; sorting pins the document order
# so that successive runs build the same corpus.
allFiles = sorted(glob.glob(os.path.join(path, 'FR*.csv')))

for file_ in allFiles:
    data = pd.read_csv(file_)
    # dropna() skips the rows without a transcript; extend() appends in place
    # instead of rebuilding the whole list for every file.
    dataset.extend(data['Text'].dropna().tolist())

print('Length of the dataset', len(dataset))

The length of the transcripts varies widely from one entry to another, but it reflects exactly what is discussed at the federal parliament. Processing them correctly will allow us to grasp the topics that are discussed at the parliament.

## 2. Format the data in order to use LDA with Gensim
First of all, we load the `stop_words`, a list of the common French words that we must not take into account when doing the topic modelling, as they do not convey any useful information. The pipeline we follow is the following:
 1. Load the `stop_words`
 2. Remove those common words and tokenize our dataset (break it down into words) 
 3. We count the frequency of the words and remove the ones that appear only once in total.
 4. (Implement the *Stemming* of the data (cf. [a French stemming algorithm](http://snowball.tartarus.org/algorithms/french/stemmer.html)). (Done with the [nltk](http://www.nltk.org/api/nltk.stem.html) library) ) -> Not implemented at the moment
 5.  Remove all the words of length <= 4 (the tokenizer below only keeps words with `len(word) > 4`).
 
 **N.B. THIS ALGORITHM IS VERY SLOW !!!!**

In [None]:
## Build the stop-word list, tokenize the corpus and drop the rare tokens.
import re
from collections import defaultdict

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import FrenchStemmer
from stop_words import get_stop_words

# German stop words because there are some german words even in the french transcript
stop_words_de = get_stop_words('de')

# Loading the custom french stop-words list (comma-separated entries).
# NOTE(review): the file is read with the platform default encoding; confirm
# the file's encoding and pass `encoding=` explicitly if it differs.
with open("stop_dictionaries/French_stop_words_changed.txt", "r") as myfile:
    raw_stop_words = myfile.read()

# Strip each entry: tokens never contain spaces or newlines, so an entry such
# as " mot" or "mot\n" could otherwise never match anything.
stop_words_fr = [entry.strip() for entry in raw_stop_words.split(',') if entry.strip()]

stop_words = stop_words_de + stop_words_fr

# Membership is tested once per token of the corpus: use a set so each test is
# O(1) instead of a scan of the whole stop-word list.
stop_word_set = frozenset(stop_words)

## Secondly we tokenize the documents and remove the common words.
# Documents are split at every one of these characters:
# space, apostrophes (' and ’), newline, '(', ')', '[', ']', ',', ';', ':', '.'
# Raw string + character class: same delimiters as before, without the invalid
# escape sequences of the former non-raw pattern.
TOKEN_SEPARATORS = re.compile(r"[ '\n()\[\],;:.’]")

# Only tokens of at least this many characters are kept; shorter ones are
# considered unlikely to carry any topical information.
MIN_TOKEN_LENGTH = 5

texts = [[word for word in TOKEN_SEPARATORS.split(document.lower())
          if len(word) >= MIN_TOKEN_LENGTH and word not in stop_word_set]
         for document in dataset]

# Thirdly we remove the tokens that appear only once in the whole corpus.
# NOTE: the stemmer is instantiated but stemming is NOT applied at the moment.
FS = FrenchStemmer()

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]



## 3. Perform the LDA topic modelling and print the results.

Formatting the data into a dictionary and a corpus, the two inputs required by the `LdaModel` function of Gensim.

In [None]:
# Gensim dictionary built from the tokenized documents.
dictionary = gensim.corpora.Dictionary(texts)
# Bag-of-words representation of every document: a list of
# (word_id, word_frequency) 2-tuples per document.
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Make sure the output directory for the LDA artefacts exists. exist_ok=True
# replaces the separate existence check (no check-then-create race) and makes
# re-running the cell harmless.
os.makedirs("../datas/lda", exist_ok=True)

In [None]:
# Persist the dictionary next to the model so both can be reloaded together.
dictionary_path = '../datas/lda/ldaDictionary'
dictionary.save(dictionary_path)

Note that in the algorithm below, we need to choose the number of topics, which is the number of clusters of data that we want to find. Note that the accuracy of our algorithm depends a lot on picking a good number of topics.

In [None]:
# Number of topics (clusters) the LDA model is asked to find.
NUM_TOPICS = 11

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
)
# Alternative left by the author (not run):
#   gensim.models.hdpmodel.HdpModel(corpus, id2word=dictionary)


In [None]:
# Show every topic found by the model, described by its 8 top words.
topic_summaries = ldamodel.print_topics(num_words=8)
for cluster_index, topic in enumerate(topic_summaries):
    print("============")
    print("Cluster ", cluster_index, ": ", topic)

In [None]:
# Persist the trained model in the LDA output directory.
model_path = '../datas/lda/ldamodel'
ldamodel.save(model_path)