# Code for the topic modelling

Tutorial followed: https://www.youtube.com/watch?v=arMTVXEbV9Q

## Import packages

In [4]:
# nltk
import nltk
# gensim
import gensim
import gensim.corpora as corpora
# spacy
import spacy
#vis
import pyLDAvis
import pyLDAvis.gensim

In [17]:
import pandas as pd

## Import data

In [None]:
# Load the screening export: semicolon-separated CSV read with ISO-8859-1 encoding.
# NOTE(review): absolute, machine-specific path — as written the notebook only runs on this workstation.
db_init = pd.read_csv(
    "C:/Users/valentin.stuhlfauth/OneDrive - univ-lyon2.fr/2_2025-DUT systematic review/2_Topic modelling/0_code/2025_Pol4PED_systematic_review/2025-02-27_Screening_result.csv",
    sep=';',
    encoding='ISO-8859-1',
)

In [14]:
# List the distinct values of the "Approach" column (NaN included) to check the categories.
db_init["Approach"].unique()

array(['Conceptual', 'Methodological', 'Qualitative', nan, 'Reviews',
       'Quantitative', 'Modelling', 'Ingineering'], dtype=object)

In [72]:
# Expose the row position as an "index" column (read later as the document id).
# Guarded so that re-running this cell does not insert a second index column.
# NOTE(review): assumes the CSV itself has no column named "index" — confirm.
if "index" not in db_init.columns:
    db_init = db_init.reset_index()

In [42]:
# Build one text per record: article title, a space, then the abstract.
# NOTE(review): this cell previously read from `data`, which is not defined anywhere
# in the visible notebook; it is assumed to be the `db_init` frame — confirm.
# Missing titles/abstracts are replaced by "" so the concatenation never yields NaN.
df_text = (
    db_init["Article Title"].fillna("") + " " + db_init["Abstract"].fillna("")
).rename("text_topic")

## Pretreatment

### Lemmatisation, word generation (tokenisation) and bigrams

In [44]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatise each text, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of str
        Raw documents.
    allowed_postags : list of str
        Values of spaCy's ``token.pos_`` to retain.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.
    """
    # The "parser" and "ner" components are disabled when loading the model.
    pipeline = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return [
        " ".join(tok.lemma_ for tok in pipeline(raw) if tok.pos_ in allowed_postags)
        for raw in texts
    ]

In [46]:
def gen_words(texts):
    """Tokenise each text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

In [47]:
# Lemmatise the raw texts, then split each lemmatised string into a list of tokens.
data_words = gen_words(lemmatization(df_text))

In [49]:
# BIGRAMS AND TRIGRAMS
# threshold (float, optional): score threshold for forming phrases; a higher value
# means fewer phrases. (init = 100 — presumably the value used initially; confirm.)
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=10)
# The trigram model is trained on the documents after the bigram model has been applied to them.
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=10)

# Phraser wrappers around the trained models; these are what make_bigrams / make_trigrams apply.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to every tokenised document."""
    phrased = []
    for tokens in texts:
        phrased.append(bigram[tokens])
    return phrased

def make_trigrams(texts):
    """Apply the ``bigram`` phraser, then the ``trigram`` phraser, to every document."""
    phrased = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        phrased.append(trigram[with_bigrams])
    return phrased

# Merge frequent word pairs, then run the trigram pass on the bigram output.
# NOTE(review): make_trigrams applies `bigram` again to documents that already went through make_bigrams.
data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

# Inspect the first processed document.
print (data_bigrams_trigrams[0])

['theory', 'paper_analyze', 'relationship', 'low_carbon_city', 'climate_change', 'background', 'global_warming', 'low_carbon_city', 'construction', 'important', 'measure', 'mitigate_climate_change', 'crisis', 'low_carbon_urban', 'planning', 'key', 'technology', 'low_carbon_urban', 'construction', 'research', 'aspect', 'low_carbon_urban', 'planning', 'theory', 'concept', 'practice', 'low', 'zero', 'carbon', 'community', 'low_carbon_city', 'development', 'future', 'sum', 'low_carbon_urban', 'planning', 'research', 'progress', 'domestic', 'overseas', 'put', 'future', 'trend', 'low_carbon_urban', 'planning', 'research', 'aspect', 'demarcation', 'path', 'low_carbon_city', 'low_carbon_urban', 'planning', 'construction', 'popularity', 'low_carbon_technology', 'enforcement', 'mechanism', 'innovation', 'low_carbon_city', 'construction']


### TF-IDF filtering (tune the `low_value` threshold)

In [None]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = [id2word.doc2bow(text) for text in texts]
# print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

# Terms whose TF-IDF weight within a document is below this threshold are removed from that document.
low_value = 0.02
words = []  # name of every dropped term, in drop order (duplicates kept)
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    weights = tfidf[bow]  # evaluated once per document
    scored_ids = {term_id for term_id, _ in weights}
    low_value_words = [term_id for term_id, weight in weights if weight < low_value]
    # Terms present in the bag of words but absent from the TF-IDF output (score 0).
    # Computed BEFORE it is used, so the drop list refers to the current document
    # instead of the previous one.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in scored_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set membership keeps the filter linear in the document length.
    dropped = set(drops)
    corpus[i] = [b for b in bow if b[0] not in dropped]

# NOTE(review): the "Dictionary" section further down rebuilds `corpus` from
# data_bigrams_trigrams, which discards this filtering before the LDA is fitted.

In [None]:
# Dropped terms with duplicates removed, keeping first-seen order.
list(dict.fromkeys(words))

### Manual Thesaurus

In [None]:
# Stop words
# Kept as a flat list of strings: appending ['CO2'] as a list nested it inside
# stop_words, so no token (a str) could ever compare equal to an entry.
# NOTE(review): tokens are produced by gensim's simple_preprocess, which lowercases
# them — confirm the exact token form that should be listed here.
stop_words = ['CO2']

print(stop_words)

In [None]:
# Stop words suppression: drop every token that appears in stop_words.
# NOTE(review): in the visible part of the notebook the dictionary/corpus cells
# read data_bigrams_trigrams, not this filtered copy — confirm.
data_bigrams_trigrams_stop = [
    [mot for mot in doc if mot not in stop_words]
    for doc in data_bigrams_trigrams
]

## Topic modelling global

### Dictionary

In [50]:
from gensim.corpora import Dictionary
# Dictionary built from the processed (bigram/trigram) documents.
dico = Dictionary(data_bigrams_trigrams)
# Printing shows the number of unique tokens and the first few entries.
print(dico)

Dictionary<8958 unique tokens: ['aspect', 'background', 'carbon', 'climate_change', 'community']...>


In [51]:
# Token <-> id mapping for the processed documents.
id2word = corpora.Dictionary(data_bigrams_trigrams)

# Bag-of-words corpus: one doc2bow result per document.
# NOTE(review): this reassigns `corpus`, replacing whatever the TF-IDF cell stored in it.
corpus = [id2word.doc2bow(text) for text in data_bigrams_trigrams]

print (corpus[0][0:20])

# Token stored under id 0.
word = id2word[0]
print (word)

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 4), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 5), (19, 1)]
aspect


### LDA (init passes=10)

In [62]:
# alpha smooths topic membership: values close to 0 give crisp (near-binary)
# membership, values above ~10 smooth the results; the default is 1/num_topics.
# "auto" is passed here.
# random_state is fixed so that re-running the notebook reproduces the same topics.
# NOTE(review): the model is fitted on corpus[:-1], i.e. the last document is left out — confirm this is intended.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
                                           id2word=id2word,
                                           num_topics=50,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=5,
                                           alpha="auto")

In [59]:
# Show the first two entries returned by print_topics().
lda_model.print_topics()[0:2]

[(5,
  '0.000*"outgrow" + 0.000*"superlinear" + 0.000*"definite" + 0.000*"implant" + 0.000*"implantation" + 0.000*"protective" + 0.000*"allometric" + 0.000*"explored" + 0.000*"restrain" + 0.000*"commissioning"'),
 (13,
  '0.000*"outgrow" + 0.000*"superlinear" + 0.000*"definite" + 0.000*"implant" + 0.000*"implantation" + 0.000*"protective" + 0.000*"allometric" + 0.000*"explored" + 0.000*"restrain" + 0.000*"commissioning"')]

In [60]:
# Terms of topic 0 with their weights.
lda_model.show_topic(0)

[('approach', 0.054836888),
 ('project', 0.052121513),
 ('transition', 0.046698086),
 ('process', 0.044073105),
 ('sector', 0.042188846),
 ('community', 0.042035833),
 ('framework', 0.027259815),
 ('urban_agglomeration', 0.026877375),
 ('sustainable', 0.024459409),
 ('implementation', 0.023843277)]

In [56]:
# Search for a term in the topics: list the topics associated with the word "sharing".
lda_model.get_term_topics("sharing", minimum_probability=0)

[(10, 0.0028584623)]

### Visualizing the data

In [63]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# NOTE(review): the full `corpus` is passed here although the model was fitted on corpus[:-1] — confirm.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

## Topic modelling "Scale and Resources"

## Data analysis

In [64]:
# Description of the topics' space: per-document topic weights for the documents
# the model was fitted on (corpus[:-1]).
# NOTE(review): no minimum_probability is passed, so low-probability topics are
# presumably omitted per document (they show up as zeros once densified) — confirm.
doc_topics = lda_model.get_document_topics(corpus[:-1])
# Prints only the repr of the lazy corpus object, not the weights themselves.
print(doc_topics)

<gensim.interfaces.TransformedCorpus object at 0x0000020BF42976B0>


In [65]:
# Sparse matrix transformation: rows are topics, columns are documents.
from gensim.matutils import corpus2csc
# num_terms pins the row count to the number of topics. Without it the count is
# inferred from the highest topic id that actually occurs, so trailing topics that
# never appear in any document would silently be missing from the matrix.
mat_sparce = corpus2csc(doc_topics, num_terms=lda_model.num_topics)

print(mat_sparce)

  (5, 0)	0.026408571749925613
  (6, 0)	0.017804166302084923
  (10, 0)	0.08102951943874359
  (15, 0)	0.028569821268320084
  (17, 0)	0.014475645497441292
  (21, 0)	0.013427573256194592
  (26, 0)	0.16010437905788422
  (29, 0)	0.018302064388990402
  (32, 0)	0.015604554675519466
  (33, 0)	0.05355236306786537
  (34, 0)	0.05800873786211014
  (39, 0)	0.03783108666539192
  (42, 0)	0.08864591270685196
  (46, 0)	0.19876927137374878
  (47, 0)	0.017536601051688194
  (49, 0)	0.10699625313282013
  (5, 1)	0.04468901455402374
  (6, 1)	0.041244301944971085
  (13, 1)	0.021645143628120422
  (15, 1)	0.023561712354421616
  (18, 1)	0.013892151415348053
  (21, 1)	0.013131385669112206
  (24, 1)	0.04393310099840164
  (26, 1)	0.06432025879621506
  (29, 1)	0.03627924993634224
  :	:
  (20, 2025)	0.015533776953816414
  (24, 2025)	0.011222349479794502
  (27, 2025)	0.014651557430624962
  (29, 2025)	0.12063784152269363
  (33, 2025)	0.20882399380207062
  (34, 2025)	0.023475809022784233
  (38, 2025)	0.02848108299076557


In [66]:
# Dense document-topic array: transpose the sparse matrix, then densify it.
mat_dt = mat_sparce.T.toarray()
mat_dt

array([[0.        , 0.        , 0.        , ..., 0.0175366 , 0.        ,
        0.10699625],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.08628862],
       [0.        , 0.        , 0.02039475, ..., 0.        , 0.        ,
        0.20102744],
       ...,
       [0.02747214, 0.01926056, 0.        , ..., 0.01365835, 0.        ,
        0.21855922],
       [0.02381448, 0.        , 0.        , ..., 0.05243335, 0.        ,
        0.14668371],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.15873881]])

In [74]:
# Data frame of the dense matrix: one "T<k>" column per column of mat_dt.
dfTopic = pd.DataFrame(
    mat_dt,
    columns=[f"T{k}" for k in range(mat_dt.shape[1])],
)
# Same object as dfTopic (no copy): the "id" column added below appears under both names.
df_Topic_treatment = dfTopic
# Attach the document id; pandas aligns db_init['index'] on the row labels.
# NOTE(review): this presumes the rows of mat_dt follow the row order of db_init — confirm.
df_Topic_treatment['id'] = db_init['index']

print(df_Topic_treatment.head())

    T0        T1        T2   T3   T4        T5        T6   T7        T8   T9  \
0  0.0  0.000000  0.000000  0.0  0.0  0.026409  0.017804  0.0  0.000000  0.0   
1  0.0  0.000000  0.000000  0.0  0.0  0.044689  0.041244  0.0  0.000000  0.0   
2  0.0  0.000000  0.020395  0.0  0.0  0.028749  0.000000  0.0  0.097484  0.0   
3  0.0  0.000000  0.000000  0.0  0.0  0.010393  0.000000  0.0  0.000000  0.0   
4  0.0  0.067151  0.000000  0.0  0.0  0.000000  0.000000  0.0  0.000000  0.0   

   ...  T41       T42  T43  T44       T45       T46       T47  T48       T49  \
0  ...  0.0  0.088646  0.0  0.0  0.000000  0.198769  0.017537  0.0  0.106996   
1  ...  0.0  0.054272  0.0  0.0  0.017556  0.205746  0.000000  0.0  0.086289   
2  ...  0.0  0.016196  0.0  0.0  0.000000  0.262996  0.000000  0.0  0.201027   
3  ...  0.0  0.015630  0.0  0.0  0.000000  0.135810  0.000000  0.0  0.265583   
4  ...  0.0  0.000000  0.0  0.0  0.000000  0.125462  0.000000  0.0  0.047965   

   id  
0   0  
1   1  
2   2  
3   3 