# Original static topics

In [149]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import re, json, numpy as np

import nltk
from gensim import models, corpora
import fasttext
import gc

## Load preprocessed data

Data is already gathered in a single parquet files with selected columns about opinions (proposals):
- title and descriptions
- votes (number, positive, mitigate, negative)
- arguments (pros, cons)

In [2]:
df = pd.read_parquet('leVraiDebat-opinions.parquet')

  Numpy8 = numba.jitclass(spec8)(NumpyIO)
  Numpy32 = numba.jitclass(spec32)(NumpyIO)


In [3]:
df.columns

Index(['master_tag', 'contributions_id', 'contribution_versions_title',
       'contribution_versions_bodyText', 'contributions_votesCount',
       'contributions_votesCountOk', 'contributions_votesCountMitige',
       'contributions_votesCountNok', 'contributions_argumentsCountFor',
       'contributions_argumentsCountAgainst'],
      dtype='object')

# Selection of opinions

For each topic save the 20 best proposals

In [4]:
#df.set_index('contributions_id', inplace=True)

In [5]:
topics = df.master_tag.unique()
topics

array(['democratie-institutions-referendum-dinitiative-citoyenne',
       'economie-finances-travail-compte-public',
       'education-jeunesse-enseignement-superieur-recherche-et-innovation',
       'europe-affaires-etrangeres-outre-mer', 'justice-police-armee',
       'sante-solidarite-handicap', 'sport-culture', 'expression-libre',
       'transition-ecologique-solidaire-agriculture-alimentation'],
      dtype=object)

In [6]:
saved_cols = ['contributions_id', 'contribution_versions_title',
       'contribution_versions_bodyText', 'contributions_votesCount',
       'contributions_votesCountOk', 'contributions_votesCountMitige',
       'contributions_votesCountNok', 'contributions_argumentsCountFor',
       'contributions_argumentsCountAgainst']
for i, topic in enumerate(topics):
    main_contributions = df[df.master_tag == topic].sort_values('contributions_votesCountOk', ascending=False)[:20]
    main_contributions[saved_cols].to_json('topic_%d_main_contributions.json' % i, orient='records')

In [7]:
# Without topic filter => root
main_contributions = df.sort_values('contributions_votesCountOk', ascending=False)[:20]
main_contributions[saved_cols].to_json('topic_root_main_contributions.json', orient='records')

## TF-IDF on each topic

TF-IDF in order to get the top N words and from them the 8 keywords (Step2)

In [20]:
#topic = topics[0]
#df_sel = df[df.master_tag == topic]
#df_sel.shape

(4063, 10)

### Cleanup, stemming and tokenization

In [8]:
stop_words = set(nltk.corpus.stopwords.words('french'))
stemmer = nltk.stem.snowball.FrenchStemmer()

In [55]:
def tokenize_clean_one(text, stem_map):
    text = re.sub(r"[,;\.\?!:…\-'\"/\(\)]+", ' ', text).lower()
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    tokens = filter(lambda w: w not in stop_words, tokens)
    # Filter short tokens (length < 3)
    tokens = filter(lambda w: len(w) > 2, tokens)
    final_tok = []
    for tok in tokens:
        stem = stemmer.stem(tok)
        # Keep the shortest word corresponding to the stem
        if stem in stem_map:
            if len(tok) < len(stem_map[stem]):
                stem_map[stem] = tok
        else:
            stem_map.update({stem: tok})
        final_tok.append(stem)
    return final_tok

In [57]:
def tokenize_corpus(corpus):
    stem_map = {}
    tokens = corpus.apply(tokenize_clean, args=[stem_map])
    return tokens, stem_map
#tokens, stem_map = tokenize_corpus(df_sel['contribution_versions_bodyText'])
#tokens.shape

(4063,)

### Bag of words

To be submitted to the TF-IDF model

In [54]:
def get_bag_of_words(tokens):
    dictionary = corpora.Dictionary(tokens)
    #print("Voc length before filtering extremes:", len(dictionary))
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = tokens.apply(dictionary.doc2bow)
    return dictionary, corpus
#dictionary, corpus_bow = get_bag_of_words(tokens)
#print("Voc length:", len(dictionary))

Voc length: 3316


### TF-IDF

In [61]:
def get_tfidf_model(corpus):
    tokens, stem_map = tokenize_corpus(corpus)
    dictionary, corpus_bow = get_bag_of_words(tokens)
    tfidf = models.TfidfModel(list(corpus_bow))
    return tfidf, dictionary, stem_map
#tfidf, dico, stem_map = get_tfidf_model(df_sel['contribution_versions_bodyText'])

In [60]:
#corpus_tfidf = tfidf[corpus]
#len(corpus_tfidf)
#tfidf_corpus_iter = corpus_tfidf.__iter__()
#next(tfidf_corpus_iter)[:10]
#next(tfidf_corpus_iter)[:10]

## Get top TF-IDF => auto select keywords

In [67]:
def get_top_tfidf(tfidf, stem_map, n=30):
    top_tfidf = sorted(tfidf.idfs.items(), key=lambda i: i[1], reverse=True)[:n]
    top_stems = map(lambda i: dictionary[i[0]], top_tfidf)
    top_words = map(lambda w: stem_map[w], filter(lambda s: s in stem_map, top_stems))
    return list(top_words)

In [68]:
top30_words = {}
for topic in topics:
    df_sel = df[df.master_tag == topic]
    print("Topic %s, %d documents" % (topic, len(df_sel)))
    tfidf, dico, stem_map = get_tfidf_model(df_sel['contribution_versions_bodyText']) #!! Not using contribution_versions_title
    print("\tVoc size (stems): %d/%d" % (len(dico), len(stem_map)))
    top30_words[topic] = get_top_tfidf(tfidf, stem_map, 30)

Topic democratie-institutions-referendum-dinitiative-citoyenne, 4063 documents
	Voc size (stems): 3316/12016
Topic economie-finances-travail-compte-public, 6410 documents
	Voc size (stems): 4007/14718
Topic education-jeunesse-enseignement-superieur-recherche-et-innovation, 1581 documents
	Voc size (stems): 1827/7219
Topic europe-affaires-etrangeres-outre-mer, 963 documents
	Voc size (stems): 1344/5605
Topic justice-police-armee, 1400 documents
	Voc size (stems): 1768/6997
Topic sante-solidarite-handicap, 1900 documents
	Voc size (stems): 2084/8058
Topic sport-culture, 593 documents
	Voc size (stems): 794/4094
Topic expression-libre, 3769 documents
	Voc size (stems): 3996/14643
Topic transition-ecologique-solidaire-agriculture-alimentation, 2600 documents
	Voc size (stems): 2772/10107


## Manual curation to avoid adjectives and wrong spelling or syntax

In [95]:
selected_keywords = {}

In [116]:
print("Topic 0:", topics[0])
top30_words[topics[0]]

Topic 0: democratie-institutions-referendum-dinitiative-citoyenne


['101',
 'aléatoire',
 'catimini',
 'dissimulées',
 'faim',
 'mourir',
 'sdf',
 'affranchis',
 'estiment',
 'socle',
 '120',
 'google',
 'gré',
 'précaire',
 'sus',
 'tpe',
 'fierté',
 'claude',
 'ladite',
 'micro',
 'organisationnels',
 'pécuniaire',
 'découverte',
 'connus',
 'opaque',
 'gris',
 'pole',
 'réferendum',
 'racket',
 'pourvus']

In [183]:
selected_keywords[topics[0]] = ['faim', 'mourir', 'sdf', 'Google', 'précaire', 'entreprise', 'réferendum', 'racket']

In [184]:
print("Topic 1:", topics[1])
top30_words[topics[1]]

Topic 1: economie-finances-travail-compte-public


['associé',
 'fin',
 'voiture',
 'cadre',
 'public',
 'double',
 'discipline',
 'développe',
 'mené',
 'permet',
 'précaire',
 'simple',
 'équivalent',
 'parallèle',
 'parle',
 'chois',
 'alimentaire',
 'apprécié',
 'blanc',
 'bonus',
 'chiffre',
 'déterminé',
 'tôt',
 'convient',
 'découvert',
 'haut',
 'personnage',
 'préférence',
 'retourne',
 'patrimoine']

In [185]:
selected_keywords[topics[1]] = ['discipline', 'public', 'développement', 'précaire', 'choix', 'découvert', 'patrimoine', 'préférence']

In [186]:
print("Topic 2:", topics[2])
top30_words[topics[2]]

Topic 2: education-jeunesse-enseignement-superieur-recherche-et-innovation


['constitutionnel',
 'dessous',
 'etat',
 'enfant',
 'fin',
 'cependant',
 'afin',
 'défendre',
 'mélangé',
 'nouveau',
 'effectif',
 'entendu',
 'jamais',
 'justice',
 'moindre',
 'passe',
 'pratique',
 'renforce',
 'souhaite',
 'souvent',
 'vocation',
 'devoir',
 'toujours',
 '600',
 'privé',
 'choisi']

In [187]:
selected_keywords[topics[2]] = ['constitutionnel', 'etat', 'enfant', 'effectif', 'justice', 'pratique', 'vocation', 'privé']

In [188]:
print("Topic 3:", topics[3])
top30_words[topics[3]]

Topic 3: europe-affaires-etrangeres-outre-mer


['doit',
 'peut',
 'avantage',
 'différent',
 'territoire',
 'vérités',
 'écart',
 'écrit',
 'révoqué',
 'car',
 'économie',
 'construction',
 'dépend',
 'impose',
 'mal',
 'constate',
 'défendre',
 'entreprise',
 'manque',
 'adopté',
 'alourdira',
 'ancien',
 'budget',
 'citoyenneté',
 'concentrant',
 'concernent']

In [189]:
selected_keywords[topics[3]] = ['avantage', 'économie', 'construction', 'entreprise', 'budget', 'citoyenneté', 'budget', 'révoquer']

In [190]:
print("Topic 4:", topics[4])
top30_words[topics[4]]

Topic 4: justice-police-armee


['ainsi',
 'résultat',
 'corps',
 'motivé',
 'nombre',
 'signature',
 'total',
 'indigne',
 'mange',
 'président',
 'dialogue',
 'dispose',
 'dépend',
 'eau',
 'outre',
 'amène',
 'chômeurs',
 'concernent',
 'criminel',
 'déontologie',
 'développe',
 'effectif',
 'environ',
 'fait',
 'global',
 'intermédiaires',
 'magistrature',
 'malgré',
 'modalité',
 'pérennes']

In [191]:
selected_keywords[topics[4]] = ['résultat', 'corps', 'nombre', 'président', 'chômeurs', 'criminel', 'déontologie', 'intermédiaires']

In [192]:
print("Topic 5:", topics[5])
top30_words[topics[5]]

Topic 5: sante-solidarite-handicap


['commande',
 'avis',
 'divers',
 'mettre',
 'partage',
 'pluralité',
 'réel',
 'non',
 'vie',
 'article',
 'car',
 'économie',
 'mauvais',
 'baissé',
 'outre',
 'alors',
 'aujourd',
 'cahier',
 'création',
 'dit',
 'domaine',
 'favorise',
 'groups',
 'innovants',
 'observatoire',
 'prestation',
 'revoir',
 'surveiller',
 'tard']

In [193]:
selected_keywords[topics[5]] = ['avis', 'pluralité', 'réel', 'économie', 'création', 'domaine', 'observatoire', 'prestation']

In [194]:
print("Topic 6:", topics[6])
top30_words[topics[6]]

Topic 6: sport-culture


['avant',
 'personne',
 'professionnel',
 'avis',
 'cela',
 'charge',
 'contrôle',
 'corps',
 'dessous',
 'difficile',
 'exprimé',
 'idée',
 'moyen',
 'nombreux',
 'permanence',
 'populisme',
 'présence',
 'quel',
 'réduit',
 'vision',
 'mandats',
 'mis',
 'réflexion',
 'cet',
 'suppression',
 'charte']

In [211]:
selected_keywords[topics[6]] = ['professionnel', 'salaires', 'contrôle', 'chasse', 'populisme', 'télévision', 'mandats', 'charte']

In [212]:
print("Topic 7:", topics[7])
top30_words[topics[7]]

Topic 7: expression-libre


['organisé',
 'plus',
 'propos',
 'ressort',
 'réalité',
 'siège',
 'tour',
 'élément',
 'adulte',
 'réforme',
 'tenté',
 'peux',
 'salarié',
 '2017',
 'minime',
 'minimal',
 'sérieux',
 'évolue',
 'évoqué',
 'probable',
 'demain',
 'fondamental',
 'liberté',
 'manipulé',
 'million',
 'mêmes',
 'scientifique',
 'sénat',
 'garanti',
 'carte']

In [213]:
selected_keywords[topics[7]] = ['réalité', 'siège', 'tour', 'adulte', 'réforme', '2017', 'fondamental', 'liberté']

In [214]:
print("Topic 8:", topics[8])
top30_words[topics[8]]

Topic 8: transition-ecologique-solidaire-agriculture-alimentation


['nom',
 'professionnel',
 'société',
 'tous',
 'consulté',
 'deux',
 'journal',
 'leurs',
 'réalité',
 'total',
 'très',
 'élément',
 'dur',
 'agissent',
 'argent',
 'chaque',
 'digne',
 'indigne',
 'monétaire',
 'après',
 'universel',
 'utile',
 'lieu',
 'étonne',
 'apparente',
 'attribué',
 'automatique',
 'citoyenneté',
 'civil',
 'deviennent']

In [215]:
selected_keywords[topics[8]] = ['professionnel', 'société', 'consultation', 'journal', 'réalité', 'élément', 'argent', 'universel']

In [225]:
for topic in topics:
    print(topic, json.dumps([{"id": j, "name": keyword} for (j, keyword) in enumerate(selected_keywords[topic])]))

democratie-institutions-referendum-dinitiative-citoyenne [{"id": 0, "name": "faim"}, {"id": 1, "name": "mourir"}, {"id": 2, "name": "sdf"}, {"id": 3, "name": "Google"}, {"id": 4, "name": "pr\u00e9caire"}, {"id": 5, "name": "entreprise"}, {"id": 6, "name": "r\u00e9ferendum"}, {"id": 7, "name": "racket"}]
economie-finances-travail-compte-public [{"id": 0, "name": "discipline"}, {"id": 1, "name": "public"}, {"id": 2, "name": "d\u00e9veloppement"}, {"id": 3, "name": "pr\u00e9caire"}, {"id": 4, "name": "choix"}, {"id": 5, "name": "d\u00e9couvert"}, {"id": 6, "name": "patrimoine"}, {"id": 7, "name": "pr\u00e9f\u00e9rence"}]
education-jeunesse-enseignement-superieur-recherche-et-innovation [{"id": 0, "name": "constitutionnel"}, {"id": 1, "name": "etat"}, {"id": 2, "name": "enfant"}, {"id": 3, "name": "effectif"}, {"id": 4, "name": "justice"}, {"id": 5, "name": "pratique"}, {"id": 6, "name": "vocation"}, {"id": 7, "name": "priv\u00e9"}]
europe-affaires-etrangeres-outre-mer [{"id": 0, "name": "

# Compute related documents

## Step 1 : train fastText model on vocabulary

In [217]:
def clean_text(text):
    return re.sub(r'[;,\.!\?\(\)]', ' ', text).replace('\n', ' ').replace('[\s+]', ' ')

In [218]:
with open('cleaned_contributions.txt', 'w') as f:
    cleaned_corpus = df.contribution_versions_bodyText.apply(clean_text)
    f.writelines(rec + '\n' for rec in cleaned_corpus)

In [219]:
corpus_model = fasttext.train_unsupervised('cleaned_contributions.txt', 'skipgram')

## Step 3 : compute embedding of each document, distance to each (topic $\times$ keyword) and select top 20 for each keyword

In [220]:
df['document_embeddings'] = [corpus_model.get_sentence_vector(doc) for doc in cleaned_corpus]

In [221]:
def cosine_similarity(a, b):
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return np.dot(a, b) / (norm_a * norm_b)

def get_strongest(weights, topn):
    """ Get the top n strongest """
    return np.argpartition(weights, -topn, axis=0)[-topn:]

In [222]:
for t, topic in enumerate(topics):
    df_sel = df[df.master_tag == topic]
    for k, keyword in enumerate(selected_keywords[topic]):
        keyword_embedding = corpus_model.get_word_vector(keyword)
        similarities = df_sel['document_embeddings'].apply(lambda de: cosine_similarity(de, keyword_embedding))
        scores = similarities * np.log(df_sel.contributions_votesCountOk + 1) # Add weight for document votes
        strongest = get_strongest(scores, 20)
        df_sel.iloc[strongest][saved_cols].to_json('topic_%d_keyword_%d_main_contributions.json' % (t, k), orient='records')
        