# Original static topics

In [33]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import re, json, numpy as np

import nltk
from gensim import models, corpora

## Load preprocessed data

The data has already been gathered into a single Parquet file, with selected columns about opinions (proposals):
- title and descriptions
- votes (total, positive, mixed, negative)
- arguments (pros, cons)

In [24]:
# Load the pre-processed opinions table; the path is relative to the working directory.
df = pd.read_parquet('leVraiDebat-opinions.parquet')

In [25]:
# Show the available columns (the selection code below refers to them by name).
df.columns

Index(['master_tag', 'contributions_id', 'contribution_versions_title',
       'contribution_versions_bodyText', 'contributions_votesCount',
       'contributions_votesCountOk', 'contributions_votesCountMitige',
       'contributions_votesCountNok', 'contributions_argumentsCountFor',
       'contributions_argumentsCountAgainst'],
      dtype='object')

# Selection of opinions

For each topic, save the 20 proposals with the most positive votes (`contributions_votesCountOk`).

In [26]:
#df.set_index('contributions_id', inplace=True)

In [27]:
# Distinct master_tag values; a topic's position in this array is used to number its export file.
topics = df.master_tag.unique()
topics

array(['democratie-institutions-referendum-dinitiative-citoyenne',
       'economie-finances-travail-compte-public',
       'education-jeunesse-enseignement-superieur-recherche-et-innovation',
       'europe-affaires-etrangeres-outre-mer', 'justice-police-armee',
       'sante-solidarite-handicap', 'sport-culture', 'expression-libre',
       'transition-ecologique-solidaire-agriculture-alimentation'],
      dtype=object)

In [30]:
# Columns exported for each proposal (every column of df except 'master_tag').
saved_cols = [
    'contributions_id',
    'contribution_versions_title',
    'contribution_versions_bodyText',
    'contributions_votesCount',
    'contributions_votesCountOk',
    'contributions_votesCountMitige',
    'contributions_votesCountNok',
    'contributions_argumentsCountFor',
    'contributions_argumentsCountAgainst',
]
# Write one JSON file per topic, numbered by the topic's position in `topics`,
# holding the 20 rows with the highest positive-vote count.
for i, topic in enumerate(topics):
    main_contributions = (
        df.loc[df.master_tag == topic]
        .sort_values('contributions_votesCountOk', ascending=False)
        .head(20)
    )
    main_contributions.loc[:, saved_cols].to_json('topic_%d_main_contributions.json' % i, orient='records')

In [31]:
# Root level: same export, but ranked over all topics at once (no master_tag filter).
main_contributions = df.sort_values('contributions_votesCountOk', ascending=False).head(20)
main_contributions.loc[:, saved_cols].to_json('topic_root_main_contributions.json', orient='records')

## TF-IDF on each topic

### Cleanup, stemming and tokenization

In [34]:
# French stop-word set and French Snowball stemmer; both are read as globals by tokenize_clean.
# NOTE: the NLTK 'stopwords' corpus must be available locally (e.g. via nltk.download('stopwords')).
stop_words = set(nltk.corpus.stopwords.words('french'))
stemmer = nltk.stem.snowball.FrenchStemmer()

In [35]:
def tokenize_clean(text, stem_map):
    """Turn a raw text into a list of French stems and record a readable word per stem.

    The text is stripped of common punctuation, lower-cased and tokenized with
    NLTK. Stop words (module-level ``stop_words``) and tokens shorter than
    three characters are dropped, and every remaining token is stemmed with the
    module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text of one contribution. Any non-string value (e.g. ``None`` or
        the ``NaN`` pandas uses for a missing body) is treated as empty.
    stem_map : dict
        Updated in place: maps each stem to the shortest original token that
        produced it so far, so stems can later be shown as real words.

    Returns
    -------
    list of str
        The stems, in order of appearance (duplicates kept).
    """
    # A missing body would otherwise make re.sub raise a TypeError.
    if not isinstance(text, str):
        return []
    # Replace runs of punctuation with a space before tokenizing.
    text = re.sub(r"[,;\.\?!:\-'\"/\(\)]+", ' ', text).lower()
    final_tok = []
    for tok in nltk.word_tokenize(text):
        # Skip stop words and short tokens (length < 3).
        if tok in stop_words or len(tok) <= 2:
            continue
        stem = stemmer.stem(tok)
        # Keep the shortest word corresponding to the stem.
        if stem not in stem_map or len(tok) < len(stem_map[stem]):
            stem_map[stem] = tok
        final_tok.append(stem)
    return final_tok

In [43]:
# Work on the first topic only. `topic` is rebound here (it was also the loop variable of the export loop).
topic = topics[0]
df_sel = df[df.master_tag == topic]
df_sel.shape

(4063, 10)

In [37]:
# One stem -> shortest-word map shared by all rows; tokenize_clean fills it in as a side effect.
stem_map = dict()
tokens = df_sel['contribution_versions_bodyText'].apply(
    tokenize_clean,
    args=(stem_map,),
)  # presumably an alternative input column: contribution_versions_title

In [44]:
# One entry per selected row; the dtype is object because each entry is a Python list of stems.
tokens.shape, tokens.dtypes

((4063,), dtype('O'))

### Bag of words

To be submitted to the TF-IDF model

In [38]:
# Build the vocabulary from all tokenized documents, then convert each document to bag-of-words form.
dictionary = corpora.Dictionary(tokens)
corpus = tokens.apply(dictionary.doc2bow)
# Sanity check: types, vocabulary size and number of documents.
(type(dictionary), len(dictionary), type(corpus), corpus.shape)

(gensim.corpora.dictionary.Dictionary,
 12166,
 pandas.core.series.Series,
 (4063,))

### TF-IDF

In [39]:
# Fit the TF-IDF model on the bag-of-words corpus (the pandas Series is materialized as a plain list).
tfidf = models.TfidfModel(list(corpus))

Apply TF-IDF on corpus

In [45]:
# Apply the fitted model to the whole corpus; len() should equal the number of documents.
corpus_tfidf = tfidf[corpus]
len(corpus_tfidf)

4063

On doc #0 (limited to 10 words):

In [52]:
# Create an iterator over the transformed corpus; the first next() returns its first document.
tfidf_corpus_iter = iter(corpus_tfidf)
next(tfidf_corpus_iter)[:10]

[(0, 0.08214360962119363),
 (1, 0.10157241824588523),
 (2, 0.08312896364881604),
 (3, 0.09284609445027353),
 (4, 0.07289347596852896),
 (5, 0.16492382410125303),
 (6, 0.14116672975382005),
 (7, 0.2578995055662561),
 (8, 0.19750615073436856),
 (9, 0.14264491602257198)]

On doc #1 (limited to 10 words):

In [53]:
# Advances the iterator created above: each run of this cell returns the next document,
# so the output depends on how many times next() has already been called.
next(tfidf_corpus_iter)[:10]

[(0, 0.027587732813359644),
 (2, 0.027918661582693897),
 (12, 0.024347633972334515),
 (16, 0.09447991585059799),
 (18, 0.0426333906125081),
 (20, 0.020882217670552847),
 (26, 0.02751559840296238),
 (27, 0.03242901216322029),
 (37, 0.03293481490743726),
 (41, 0.06983914172174212)]

In [None]:
tfidf.