# Imports

For the following notebook to work in google COLAB, you must create a folder called 1_shs_code in your google drive directory and extract the files contained in the repository there. In other words, the inside of 1_shs_code must have the same structure and files as the repository. (if you want to do it in another folder, you must modify the corresponding line in the next cell) (Furthermore, the directory quote_extractor is not necessary for this notebook so you do not have to upload it to your google drive)

In [2]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    PATH = '../' 
    PATH_DATA = '../Data/'
    from google.colab import drive
    drive.mount('/content/drive')

    # if you are using a folder other than 1_shs_code in your drive folder you must change '1_shs_code' in the following line of code : 
    %cd /content/drive/MyDrive/1_shs_code/Code/

    %pip install bertopic
    # install the french pipeline if don't already have it
    !python -m spacy download fr_core_news_sm
else:
    PATH = '../'
    PATH_DATA = '../Data/'
    # install the french pipeline if don't already have it
    # !python -m spacy download fr_core_news_sm

In [3]:
import pandas as pd
import numpy as np

import os
import fnmatch
import json
import spacy

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [4]:
from bertopic import BERTopic
import torch

In [16]:
import transformers

In [17]:
transformers.__version__

'4.11.3'

In [14]:
from transformers import CamembertModel, CamembertTokenizer
from sklearn.cluster import KMeans

In [None]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
camembert = CamembertModel.from_pretrained("camembert-base")

camembert.eval()

In [None]:
# activate graphics card if available
# recommended as it loweres training time from 3h to 3min approx on the news articles dataset
if torch.cuda.is_available():      
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
device

# Loading data & Preprocessing

## Quick load

In [10]:
df_citations = pd.read_csv(PATH_DATA+'citations.csv', index_col=0)

with open(PATH_DATA + 'corpus_sentences_no_sw.pkl', 'rb') as f:
    corpus_sentences_no_sw = pickle.load(f)

df_articles = pd.read_csv('../Data/articles_data.tsv', sep='\t') 

with open(PATH_DATA + 'article_list_no_sw.pkl', 'rb') as f:
    article_list_no_sw = pickle.load(f)

## Detailed

### Citations

In [None]:
df_citations = pd.read_csv(PATH_DATA+'citations.csv', index_col=0)

In [8]:
corpus_sentences = df_citations['quote'].tolist()

In [10]:
# stopwords from https://github.com/stopwords-iso/stopwords-fr
def load_fr_stopwords(file_name = './stopwords-fr.txt' ):
    with open(file_name, "r",encoding='utf8') as f:
        stopwords = f.read() #.readlines()
        stopwords = stopwords.split('\n')
        
    return stopwords 

In [15]:
stopwords_list = load_fr_stopwords(file_name = PATH_DATA+ 'stopwords-fr.txt' )
stopwords_list = stopwords_list + ['qu’']
stopwords_list = set(stopwords_list) 

In [12]:
# install the spacy french pipeline if this cell returns an error
# !python -m spacy download fr_core_news_sm
# if in google COLAB, you must restart the runtime before running this cell again
# to restart runtime : Runtime> Restart runtime

nlp = spacy.load("fr_core_news_sm")

In [13]:
def remove_sw_punct(quote, nlp, stopwords_list):
    doc = nlp(quote)
    # tokenize and remove punctuation
    tokens = [token.lemma_ for token in doc if not token.is_punct]
    # remove stopwords
    words = [token for token in tokens if token not in stopwords_list]
    untokenize = ' '.join(words)
    return untokenize

In [16]:
# example
quote = corpus_sentences[10]
print(quote)
remove_sw_punct(quote, nlp, stopwords_list)

que, pour pouvoir aller au concert(!), il faudrait présenter un certificat "vaccinal" prouvant qu’on a été vacciné!


'pouvoir aller concert faudrait présenter certificat vaccinal prouver vacciner'

In [None]:
len(corpus_sentences)

In [None]:
# apply on all quotes : ( take ~15 minutes to run)
corpus_sentences_no_sw = [remove_sw_punct(quote, nlp, stopwords_list) for quote in corpus_sentences]

In [None]:
len(corpus_sentences_no_sw)

72259

In [None]:
# with open(PATH_DATA + 'corpus_sentences_no_sw.pkl', 'wb') as f:
#     pickle.dump(corpus_sentences_no_sw, f)

Load me: (you do not need to run the above code as it takes a long time)

In [27]:
with open(PATH_DATA + 'corpus_sentences_no_sw.pkl', 'rb') as f:
    corpus_sentences_no_sw = pickle.load(f)

In [28]:
len(corpus_sentences_no_sw)

72259

### articles

In [46]:
df_articles = pd.read_csv(PATH_DATA + 'articles_data.tsv', sep='\t') 

In [47]:
article_list = df_articles['text'].to_list()

This cell takes quite some time to run, so you can skip and just load the the pickle file two cells below

In [None]:
article_list_no_sw = [remove_sw_punct(article, nlp, stopwords_list) for article in article_list]

In [None]:
# with open(PATH_DATA + 'article_list_no_sw.pkl', 'wb') as f:
#     pickle.dump(article_list_no_sw, f)

Load me : 

In [22]:
with open(PATH_DATA + 'article_list_no_sw.pkl', 'rb') as f:
    article_list_no_sw = pickle.load(f)

In [23]:
# sanity check
print(len(article_list))
print(len(article_list_no_sw))

8095

# Topic modelling : Citations

### Default embeddings

#### with preprocessing

In [25]:
# loading the model
topic_model_p = BERTopic(nr_topics="25", verbose=True, language="french")  # multilingual

In [29]:
# obtaining the topics by fitting the model to our corpus
topics_p, probs_p = topic_model_p.fit_transform(corpus_sentences_no_sw)

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Batches:   0%|          | 0/2259 [00:00<?, ?it/s]

2022-05-21 16:55:11,930 - BERTopic - Transformed documents to Embeddings
2022-05-21 16:57:35,854 - BERTopic - Reduced dimensionality
2022-05-21 16:57:47,555 - BERTopic - Clustered reduced embeddings
2022-05-21 16:58:06,679 - BERTopic - Reduced number of topics from 1563 to 181


In [32]:
# if you want to save and load models from your session use the following code :

# save model
# topic_model_p.save("../Data/models/BERTopic_nosw")
# load model
# topic_model_p = BERTopic.load("../Data/models/BERTopic_nosw") 

# saved models weren't provided in the repo due to their large size

In [33]:
topic_model_p.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,39395,0_faire_pouvoir_pays_lundi
1,-1,24171,-1_pouvoir_faire_an_devoir
2,1,377,1_twitter___
3,2,249,2_police_policier_officier_patrouille
4,3,226,3_vote_élection_voter_électoral
...,...,...,...
174,175,11,175_mensonge_manipulation_truffer_céder
173,176,11,176_ramper_inflexion_atterrir_déterminer
172,177,11,177_adoption_adopter_publication_suspendre
171,178,11,178_province_incendie_forêt_colombie


In [34]:
topic_model_p.get_topic(0)

[('faire', 0.004178447012841117),
 ('pouvoir', 0.004018900208358022),
 ('pays', 0.003676931965672919),
 ('lundi', 0.0036416873466050014),
 ('devoir', 0.003627881840761775),
 ('an', 0.003606964250429584),
 ('président', 0.0035034045741253773),
 ('américain', 0.003247329970786104),
 ('aller', 0.0031776464709594976),
 ('déclarer', 0.003058824234831245)]

In [35]:
topic_model_p.visualize_topics()

In [36]:
topic_model_p.visualize_barchart() 

In [37]:
topic_model_p.visualize_heatmap()

In [None]:
# to reduce the number of topics we can use : 
# topics_red, probs_red = topic_model_p.reduce_topics(corpus_sentences_no_sw, topics_p, probs_p, nr_topics=50)

#### without preprocessing

In [38]:
# loading the model
topic_model = BERTopic(nr_topics="25", verbose=True, language="french")  # multilingual

# obtaining the topics by fitting the model to our corpus
topics, probs = topic_model.fit_transform(corpus_sentences)

Batches:   0%|          | 0/2259 [00:00<?, ?it/s]

2022-05-21 17:03:18,009 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:05:11,640 - BERTopic - Reduced dimensionality
2022-05-21 17:05:24,123 - BERTopic - Clustered reduced embeddings
2022-05-21 17:05:44,300 - BERTopic - Reduced number of topics from 1514 to 295


In [39]:
# topic_model.save("../Data/models/BERTopic")
# topic_model = BERTopic.load(PATH_DATA + "models/BERTopic") 

In [40]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,29723,0_de_la_le_un
1,-1,28063,-1_de_la_et_le
2,1,739,1_femmes_hommes_égalité_harcèlement
3,2,498,2_suisse_suisses_zurich_romande
4,3,496,3_santé_soins_médecin_patients
...,...,...,...
287,289,11,289_cortisol_stress_dérégler_taux
286,290,11,290_souveraineté_piètre_volontairement_excellence
285,291,11,291_poids_pédale_obèse_booster
284,292,11,292_superflues_nlrb_disproportionnées_illégales


In [41]:
topic_model.get_topic(0)

[('de', 0.0033483292439836342),
 ('la', 0.0032654415446630235),
 ('le', 0.0032150899221602015),
 ('un', 0.003097170766967727),
 ('des', 0.003065948928007974),
 ('en', 0.003047999230848345),
 ('et', 0.003044221972691781),
 ('dans', 0.0029910542644329416),
 ('les', 0.002962469263707288),
 ('du', 0.0029464471257962055)]

### camemBERT embeddings

#### without preproc

In [42]:
# loading the model
topic_model_camem = BERTopic(nr_topics="100", verbose=True, language="french", embedding_model=camembert)  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_camem, probs_camem = topic_model_camem.fit_transform(corpus_sentences)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/2259 [00:00<?, ?it/s]

2022-05-21 17:07:07,578 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:09:06,686 - BERTopic - Reduced dimensionality
2022-05-21 17:09:18,907 - BERTopic - Clustered reduced embeddings
2022-05-21 17:09:46,357 - BERTopic - Reduced number of topics from 1452 to 395


In [44]:
# topic_model_camem.save("../Data/models/BERTopic_camem")
# topic_model = BERTopic.load("../Data/models/BERTopic_camem") 

In [45]:
topic_model_camem.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,28949,0_de_le_la_et
1,-1,23070,-1_de_le_les_en
2,1,727,1_vaccination_vacciner_vaccinés_vaccins
3,2,453,2_patients_patient_médecin_médecins
4,3,443,3_canton_cantons_cantonal_vaud
...,...,...,...
377,387,11,387_anticipé_débâcle_vite_avance
376,386,11,386_toilettes_wc_type_inventrice
375,385,11,385_like_that_just_sometime
374,388,11,388_faillite_faillites_existences_insolvabilité


#### with preprocessing

In [50]:
# loading the model
topic_model_camem_nosw = BERTopic(nr_topics="25", verbose=True, language="french", embedding_model=camembert)  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_camem_nosw, probs_camem_nosw = topic_model_camem_nosw.fit_transform(corpus_sentences_no_sw)

Batches:   0%|          | 0/2259 [00:00<?, ?it/s]

2022-05-21 17:11:21,154 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:13:26,369 - BERTopic - Reduced dimensionality
2022-05-21 17:13:36,298 - BERTopic - Clustered reduced embeddings
2022-05-21 17:13:52,086 - BERTopic - Reduced number of topics from 1565 to 245


In [51]:
# topic_model_camem_nosw.save("../Data/models/BERTopic_camem_nosw")
# topic_model = BERTopic.load("../Data/models/BERTopic_camem_nosw") 

In [52]:
topic_model_camem_nosw.get_topic_info().head(20)

Unnamed: 0,Topic,Count,Name
0,0,37292,0_pouvoir_faire_devoir_lundi
1,-1,23266,-1_faire_pouvoir_an_devoir
2,1,517,1_that_horrible_just_like
3,2,487,2_sanitaire_militaire_humanitaire_crise
4,3,441,3_film_cinéma_théâtre_oscar
5,4,395,4_taliban_afghan_afghanistan_kaboul
6,5,342,5_musique_musical_musicien_instrument
7,6,295,6_élection_électoral_voter_anticiper
8,7,263,7_milliard_dollar_franc_euro
9,8,242,8_charger_charge_cahier_prise


# Topic modelling : Articles

### Default embeddings

#### without preprocessing

In [59]:
# loading the model
topic_model_p_art = BERTopic(nr_topics="25", verbose=True, language="french")  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_p_art, probs_p_art = topic_model_p_art.fit_transform(article_list)

Batches:   0%|          | 0/253 [00:00<?, ?it/s]

2022-05-21 17:15:53,498 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:16:14,752 - BERTopic - Reduced dimensionality
2022-05-21 17:16:15,198 - BERTopic - Clustered reduced embeddings
2022-05-21 17:16:28,259 - BERTopic - Reduced number of topics from 128 to 11


In [60]:
topic_model_p_art.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,4543,0_de_la_le_les
1,-1,3196,-1_de_la_le_les
2,1,138,1_de_le_la_les
3,2,52,2_de_les_des_la
4,3,39,3_de_israël_des_palestiniens
5,4,32,4_de_le_chômage_les
6,5,26,5_de_avalanche_la_skieur
7,6,25,6_facebook_de_le_la
8,7,19,7_apple_de_la_le
9,8,13,8_de_les_la_tabac


#### with preprocessing

In [63]:
# loading the model
topic_model_p_art_no_sw = BERTopic(nr_topics="25", verbose=True, language="french")  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_p_art_no_sw, probs_p_art_no_sw = topic_model_p_art_no_sw.fit_transform(article_list_no_sw)

Batches:   0%|          | 0/253 [00:00<?, ?it/s]

2022-05-21 17:17:04,833 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:17:26,753 - BERTopic - Reduced dimensionality
2022-05-21 17:17:27,155 - BERTopic - Clustered reduced embeddings
2022-05-21 17:17:35,405 - BERTopic - Reduced number of topics from 124 to 74


In [64]:
topic_model_p_art_no_sw.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,3275,-1_faire_pouvoir_an_devoir
1,0,1629,0_pouvoir_faire_devoir_président
2,1,519,1_vaccin_vaccination_vacciner_dose
3,2,167,2_police_victime_homme_an
4,3,147,3_navire_mort_pluie_eau
...,...,...,...
69,68,11,68_catalan_indépendantiste_catalogne_bartomeu
70,69,11,69_plastique_recyclage_unilever_réutilisable
71,70,11,70_manifestant_ottawa_canada_canadien
72,71,11,71_google_actualités_smartphone_stadia


### camemBERT embeddings

#### with preprocessing

In [65]:
# loading the model
topic_model_p_art_no_sw_camem = BERTopic(nr_topics="25", verbose=True, language="french", embedding_model=camembert)  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_p_art_no_sw_camem, probs_p_art_no_sw_camem = topic_model_p_art_no_sw_camem.fit_transform(article_list_no_sw)

Batches:   0%|          | 0/253 [00:00<?, ?it/s]

2022-05-21 17:19:34,637 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:20:02,155 - BERTopic - Reduced dimensionality
2022-05-21 17:20:02,590 - BERTopic - Clustered reduced embeddings
2022-05-21 17:20:10,768 - BERTopic - Reduced number of topics from 113 to 42


In [66]:
topic_model_p_art_no_sw_camem.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2906,-1_faire_pouvoir_an_devoir
1,0,2755,0_faire_pouvoir_an_devoir
2,1,735,1_vaccin_vaccination_covid_vacciner
3,2,204,2_musique_concert_musicien_artiste
4,3,124,3_co2_émission_loi_climatique
5,4,121,4_federer_set_finale_open
6,5,113,5_film_cinéma_acteur_meilleur
7,6,88,6_animal_espèce_loup_faune
8,7,84,7_art_artiste_musée_exposition
9,8,74,8_taliban_afghanistan_afghan_kaboul


#### without preprocessing

In [67]:
# loading the model
topic_model_p_art_camem = BERTopic(nr_topics="25", verbose=True, language="french", embedding_model=camembert)  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_p_art_camem, probs_p_art_camem = topic_model_p_art_camem.fit_transform(article_list)

Batches:   0%|          | 0/253 [00:00<?, ?it/s]

2022-05-21 17:20:47,677 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:21:11,594 - BERTopic - Reduced dimensionality
2022-05-21 17:21:12,016 - BERTopic - Clustered reduced embeddings
2022-05-21 17:21:25,286 - BERTopic - Reduced number of topics from 106 to 23


In [68]:
topic_model_p_art_camem.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,3945,0_de_la_le_les
1,-1,3627,-1_de_la_le_les
2,1,71,1_talibans_afghanistan_les_de
3,2,53,2_de_le_la_et
4,3,42,3_israël_palestiniens_jérusalem_gaza
5,4,41,4_iran_accord_nucléaire_téhéran
6,5,36,5_de_les_des_la
7,6,28,6_clubs_ligue_league_de
8,7,25,7_facebook_de_instagram_des
9,8,23,8_spatiale_station_la_espace


# Using k-means instead of HDSBscan

## Citations

In [69]:
cluster_model = KMeans(n_clusters=50)
# loading the model
topic_model_nosw_kmeans = BERTopic(nr_topics="50", verbose=True, language="french", embedding_model=camembert, hdbscan_model=cluster_model)  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_nosw_kmeans, probs_camem_nosw_kmeans = topic_model_nosw_kmeans.fit_transform(corpus_sentences_no_sw)

Batches:   0%|          | 0/2259 [00:00<?, ?it/s]

2022-05-21 17:22:10,413 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:24:15,775 - BERTopic - Reduced dimensionality
2022-05-21 17:24:25,605 - BERTopic - Clustered reduced embeddings
2022-05-21 17:24:29,503 - BERTopic - Reduced number of topics from 50 to 43


In [70]:
# # save model
# topic_model_nosw_kmeans.save("../Data/models/topic_model_nosw_kmeans")
# # load model
# topic_model_nosw_kmeans = BERTopic.load("../Data/models/topic_model_nosw_kmeans") 

In [71]:
topic_model_nosw_kmeans.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,12244,0_pouvoir_pays_européen_accord
1,1,3563,1_avocat_liberté_jeune_gauche
2,2,3299,2_police_violence_tribunal_prison
3,3,3257,3_président_élection_politique_parti
4,4,3229,4_service_personnel_faire_organisation
5,5,3090,5_émission_transport_accident_co2
6,6,3033,6_film_musique_artiste_histoire
7,7,3030,7_covid_vaccin_vaccination_vacciner
8,8,3019,8_falloir_aller_venir_moment
9,9,2689,9_ville_prix_faire_mètre


## Articles

In [72]:
cluster_model_a = KMeans(n_clusters=50)
# loading the model
topic_model_nosw_kmeans_art = BERTopic(nr_topics="50", verbose=True, language="french", hdbscan_model=cluster_model_a)  # multilingual

# obtaining the topics by fitting the model to our corpus
topics_nosw_kmeans_art, probs_camem_nosw_kmeans_art = topic_model_nosw_kmeans_art.fit_transform(article_list_no_sw)

Batches:   0%|          | 0/253 [00:00<?, ?it/s]

2022-05-21 17:24:58,829 - BERTopic - Transformed documents to Embeddings
2022-05-21 17:25:25,589 - BERTopic - Reduced dimensionality
2022-05-21 17:25:26,781 - BERTopic - Clustered reduced embeddings
2022-05-21 17:25:37,054 - BERTopic - Reduced number of topics from 50 to 38


In [73]:
topic_model_nosw_kmeans_art.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1612,0_suisse_faire_pouvoir_devoir
1,1,523,1_parti_candidat_élection_président
2,2,476,2_match_équipe_joueur_club
3,3,352,3_vaccin_vaccination_vacciner_dose
4,4,348,4_match_faire_suisse_équipe
5,5,333,5_film_faire_scène_acteur
6,6,277,6_tribunal_avocat_cour_procès
7,7,273,7_incendie_pompier_dimanche_lundi
8,8,247,8_climatique_climat_émission_co2
9,9,225,9_russe_russie_ukraine_moscou


# Saving generated topics to files

## Articles 

In [74]:
def topic_dict(topic_dict):
    topics = {}
    for key in topic_dict:

        topic_names = []
        for item in topic_dict[key]:
            topic_names.append(item[0])
         
        topics[key]=topic_names
    
    return topics

In [75]:
topic_model_nosw_kmeans_art_dict = topic_model_nosw_kmeans_art.get_topics()
topic_model_nosw_kmeans_art_dict = topic_dict(topic_model_nosw_kmeans_art_dict)

In [76]:
df_articles_w_topic_article = df_articles.copy()

In [77]:
df_articles_w_topic_article['topic_cam_kmeans_article'] = pd.DataFrame(topics_nosw_kmeans_art)
df_articles_w_topic_article['topic_cam_kmeans_article'] = df_articles_w_topic_article['topic_cam_kmeans_article'].apply(lambda x: topic_model_nosw_kmeans_art_dict[x])

In [78]:
df_articles_w_topic_article['topic_cam_kmeans_article']

0       [match, équipe, joueur, club, saison, but, fai...
1       [nucléaire, iran, iranien, accord, téhéran, co...
2       [tribunal, avocat, cour, procès, an, ancien, a...
3       [climatique, climat, émission, co2, réchauffem...
4       [suisse, faire, pouvoir, devoir, an, franc, an...
                              ...                        
8090    [vaccin, vaccination, vacciner, dose, covid, s...
8091    [produit, suisse, animal, eau, initiative, pou...
8092    [facebook, informatique, apple, entreprise, ré...
8093    [match, faire, suisse, équipe, aller, jeu, oly...
8094    [école, enfant, élève, jeune, étudiant, format...
Name: topic_cam_kmeans_article, Length: 8095, dtype: object

In [80]:
topic_model_nosw_kmeans_art.get_topic_info().to_pickle(PATH_DATA + 'topic_model_nosw_kmeans_art.pkl', compression='infer', protocol=4, storage_options=None)
df_articles_w_topic_article.to_pickle(PATH_DATA + 'df_articles_wtopic_per_article.pkl', compression='infer', protocol=4, storage_options=None)

## Citations

In [81]:
def topic_dict(topic_dict):
    topics = {}
    for key in topic_dict:

        topic_names = []

        for item in topic_dict[key]:
            topic_names.append(item[0])
        
        topics[key]=topic_names
    
    return topics

In [82]:
df_citations_wtopic = df_citations.copy()

In [83]:
topic_model_nosw_kmeans_dict = topic_model_nosw_kmeans.get_topics()
topic_model_nosw_kmeans_dict = topic_dict(topic_model_nosw_kmeans_dict)

In [84]:
df_citations_wtopic['topic_cam_kme'] = pd.DataFrame(topics_nosw_kmeans)
df_citations_wtopic['topic_cam_kme_kw'] = df_citations_wtopic['topic_cam_kme'].apply(lambda x: topic_model_nosw_kmeans_dict[x])

In [85]:
topic_model_nosw_kmeans.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,12244,0_pouvoir_pays_européen_accord
1,1,3563,1_avocat_liberté_jeune_gauche
2,2,3299,2_police_violence_tribunal_prison
3,3,3257,3_président_élection_politique_parti
4,4,3229,4_service_personnel_faire_organisation
5,5,3090,5_émission_transport_accident_co2
6,6,3033,6_film_musique_artiste_histoire
7,7,3030,7_covid_vaccin_vaccination_vacciner
8,8,3019,8_falloir_aller_venir_moment
9,9,2689,9_ville_prix_faire_mètre


In [86]:
df_citations_wtopic.to_pickle(PATH_DATA + 'df_citations_wtopic.pkl', compression='infer', protocol=4, storage_options=None)
topic_model_nosw_kmeans.get_topic_info().to_pickle(PATH_DATA + 'topic_model_nosw_kmeans.pkl', compression='infer', protocol=4, storage_options=None)

# other