# Testing the Topic Modelling using the Gensim Library

Usual imports come first.

In [None]:
import pandas as pd
import glob
import os
import numpy as np
from time import time
import logging
import gensim
import bz2

# 1. Load the data from the Transcript files
At the moment, we only consider the entries for which the field `LanguageOfText` is `FR`, namely the ones in French. We will consider the texts in German later on. Below, we print the number of texts kept, followed by the first three of them as examples.

In [None]:
# Collect the text of every French transcript entry found in the CSV files
# stored under `path`.
dataset = []

path = 'datas/Transcript/'
# glob returns files in an arbitrary, filesystem-dependent order; sorting
# keeps the order of `dataset` (and the examples printed below) reproducible.
allFiles = sorted(glob.glob(os.path.join(path, '*.csv')))

for file_ in allFiles:
    data = pd.read_csv(file_)
    # Keep the rows that actually have a text (not NaN) and are marked 'FR'.
    is_french_text = data['Text'].notna() & (data['LanguageOfText'] == 'FR')
    # extend() grows the list in place; `dataset = dataset + [...]` copied
    # the whole accumulated list again for every file.
    dataset.extend(data.loc[is_french_text, 'Text'].values)

print('Length of the dataset', len(dataset))
# Show up to the first three texts. Slicing avoids an IndexError when fewer
# than three entries were loaded (e.g. the data directory is missing).
print(*dataset[:3], sep=' \n ')

The length of the transcripts varies greatly from one entry to another, but it reflects exactly what is discussed at the federal parliament. Processing them correctly will allow us to grasp the topics which are discussed at the parliament.

# 2. Format the data in order to use LDA with Gensim
First of all, we load the `stop_words`, a list of the most common French words, which we must not take into account when doing the topic modelling, as they do not convey any useful information. The pipeline we follow is the following:
 1. Load the `stop_words`
 2. Remove those common words and tokenize our dataset (break it down into words) 
 3. We count the frequency of the words and remove the ones that appear only once in total.
 4. **TODO** - Implement the *stemming* of the data (cf. [a French stemming algorithm](http://snowball.tartarus.org/algorithms/french/stemmer.html)).

In [None]:
# Fetch the French ('fr') stop-word list provided by the `stop_words` package
# and print it for inspection.
# NOTE: the name `stop_words` is rebound in the next code cell, which loads a
# list from "French_stop_words_changed.txt" instead.
from stop_words import get_stop_words
stop_words = get_stop_words('fr')
print(stop_words)

In [None]:
## First of all we load the stop_words list (a comma-separated text file)
import re
from collections import Counter

# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm it matches the encoding of the stop-word file.
with open("French_stop_words_changed.txt", "r") as myfile:
    # Strip surrounding whitespace/newlines so that entries such as "mot\n"
    # or " mot" still match the tokens produced below.
    stop_words = [entry.strip() for entry in myfile.read().split(',')]
# Set copy for O(1) membership tests; `stop_words` itself remains a list.
stop_word_set = set(stop_words)

## Secondly we remove the common words in our document corpus and tokenize
# Tokens are separated by any of: space, apostrophe, newline, '(', ')',
# ',', ';', ':' and '.'. The pattern is a raw string so that the regex
# escape '\n' is not an invalid/ambiguous Python string escape, and it is
# compiled once instead of being looked up for every document.
token_separator = re.compile(r"[ '\n(),;:.]")
# Consecutive separators make re.split() yield empty strings; `if word`
# drops them so that '' does not end up as a token in the vocabulary.
texts = [[word for word in token_separator.split(document.lower())
          if word and word not in stop_word_set]
         for document in dataset]

## Thirdly we remove the words that appear only once in the whole corpus
frequency = Counter(token for text in texts for token in text)

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]
print(texts[0:5])

In [None]:
# Display entries 1 to 19 of the stop-word list as a quick sanity check.
stop_words[1:20]

# 3. Perform the LDA topic modelling and print the results.

We format the data into a dictionary and a corpus, which are the inputs required by the `LdaModel` class of Gensim.

In [None]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(texts)
# Turn every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Train an LDA model with 10 topics, making a single pass over the corpus.
ldamodel = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=1,
)

In [None]:
# Show the topics found by the trained LDA model.
ldamodel.print_topics()