# Notebook Exploration

## Functions

In [1]:
import gensim
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import spacy
import time

def evaluate_and_log_metrics(experiment_name, lda_model_title, corpus_test_title, lda_transformed_title_test):
    
    # Calculer la log-perplexité
    log_perplexity_title = lda_model_title.log_perplexity(lda_transformed_title_test)
    
    # Calculer la perplexité à partir de la log-perplexité
    perplexity_title = np.exp(-log_perplexity_title / len(corpus_test_title))
    
    print(f"Log-Perplexity: {log_perplexity_title}")
    print(f"Perplexity: {perplexity_title}")

    with mlflow.start_run():
        mlflow.set_experiment(experiment_name)
        mlflow.log_metric("Log-Perplexity", log_perplexity_title)
        mlflow.log_metric("Perplexity", perplexity_title)

    return log_perplexity_title, perplexity_title

## Split dataset

In [2]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("dataset_cleaned.csv")
train_data, test_data = train_test_split(data, test_size=0.2)

## LDA (Latent Dirichlet allocation)

### Préparer LDA sur les titres (pour un modèle simple)

In [3]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# train
text_data_train_title = [text.split() for text in train_data['title_lemmatized']]
dictionary_train_title = Dictionary(text_data_train_title)
corpus_train_title = [dictionary_train_title.doc2bow(word_list) for word_list in text_data_train_title]

# test
text_data_test_title = [text.split() for text in test_data['title_lemmatized']]
dictionary_test_title = Dictionary(text_data_test_title)
corpus_test_title = [dictionary_test_title.doc2bow(word_list) for word_list in text_data_test_title]

# Entraînement du modèle LDA sur l'ensemble d'entraînement
lda_model_title = LdaModel(corpus_train_title, num_topics=5, id2word=dictionary_train_title, passes=10)

# Afficher les mots associés à chaque topic pour le titre
print("Mots associés à chaque topic pour le titre:")
for topic_idx, topic in lda_model_title.print_topics():
    print(f"Topic {topic_idx + 1}: {topic}")

# LDA pour transformer les données de test sur l'ensemble de test
lda_transformed_title_test = lda_model_title[corpus_test_title]

# Calculer la log-perplexité
log_perplexity_title = lda_model_title.log_perplexity(lda_transformed_title_test)

# Calculer la perplexité à partir de la log-perplexité
perplexity_title = np.exp(-log_perplexity_title / len(corpus_test_title))

print(f"Log-Perplexity: {log_perplexity_title}")
print(f"Perplexity: {perplexity_title}")

Mots associés à chaque topic pour le titre:
Topic 1: 0.005*"multiple" + 0.005*"data" + 0.005*"app" + 0.005*"using" + 0.005*"['use" + 0.004*"function" + 0.004*"error" + 0.004*"python" + 0.003*"file" + 0.003*"['django"
Topic 2: 0.007*"image" + 0.006*"['create" + 0.006*"using" + 0.006*"['using" + 0.005*"vs" + 0.005*"error" + 0.004*"['make" + 0.004*"custom" + 0.004*"application" + 0.004*"data"
Topic 3: 0.010*"using" + 0.009*"['get" + 0.006*"files" + 0.006*"file" + 0.006*"way" + 0.005*"code" + 0.004*"error" + 0.004*"cannot" + 0.004*"server" + 0.003*"studio"
Topic 4: 0.020*"using" + 0.007*"array" + 0.006*"file" + 0.006*"string" + 0.006*"c" + 0.005*"c']" + 0.004*"server" + 0.004*"object" + 0.004*"json" + 0.003*"['convert"
Topic 5: 0.007*"android" + 0.004*"type" + 0.004*"studio" + 0.004*"multiple" + 0.004*"file']" + 0.003*"database" + 0.003*"error" + 0.003*"run" + 0.003*"function" + 0.003*"vs"
Log-Perplexity: -25.75107479480978
Perplexity: 1.0129587840270604


### Trouver le nombre de topics avec la Perplexité (Avec title seulement)

In [4]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import numpy as np


text_data_train_title = [text.split() for text in train_data['title_lemmatized']]
text_data_test_title = [text.split() for text in test_data['title_lemmatized']]

dictionary_train_title = Dictionary(text_data_train_title)
dictionary_test_title = Dictionary(text_data_test_title)

dictionary_train_title.filter_extremes(no_below=5, no_above=0.6)
dictionary_test_title.filter_extremes(no_below=5, no_above=0.6)

corpus_train_title = [dictionary_train_title.doc2bow(word_list) for word_list in text_data_train_title]
corpus_test_title = [dictionary_test_title.doc2bow(word_list) for word_list in text_data_test_title]

# Liste des nombres de topics à tester
num_topics_list = [1, 2, 3, 4, 5, 6, 7, 8]

# Calculer la perplexité pour différents nombres de topics
perplexity_scores = []

for num_topics in num_topics_list:
    lda_model_title = LdaModel(corpus_train_title, num_topics=num_topics, id2word=dictionary_train_title, passes=10)
    log_perplexity_title = lda_model_title.log_perplexity(corpus_test_title)
    perplexity_title = np.exp(-log_perplexity_title / len(corpus_test_title))
    print(perplexity_title)
    perplexity_scores.append(perplexity_title)

# nombre de topics avec la perplexité la plus basse
best_num_topics = num_topics_list[np.argmin(perplexity_scores)]

print(f"Le nombre optimal de topics est : {best_num_topics}")


1.0039885535836495
1.0042789358906539
1.0044769844880357
1.004619701838748
1.0047316753698625
1.0048189650468928
1.0048756610214797
1.0049355958484012
Le nombre optimal de topics est : 1


### Trouver le nombre de topics avec la Perplexité (Avec title et body)

In [5]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import numpy as np

text_data_train_combined = [text.split() for text in train_data['title_lemmatized'] + train_data['body_lemmatized']]
text_data_test_combined = [text.split() for text in test_data['title_lemmatized'] + test_data['body_lemmatized']]


dictionary_train_combined = Dictionary(text_data_train_combined)
dictionary_test_combined = Dictionary(text_data_test_combined)

print(len(dictionary_train_combined))
print(len(dictionary_test_combined))

dictionary_train_combined.filter_extremes(no_below=5, no_above=0.6)
dictionary_test_combined.filter_extremes(no_below=5, no_above=0.6)

corpus_train_combined = [dictionary_train_combined.doc2bow(word_list) for word_list in text_data_train_combined]
corpus_test_combined = [dictionary_test_combined.doc2bow(word_list) for word_list in text_data_test_combined]

print(len(dictionary_train_combined))
print(len(dictionary_test_combined))

num_topics_list = [1, 2, 3, 4, 5, 6, 7, 8]
perplexity_scores_combined = []

for num_topics in num_topics_list:    
    lda_model_combined = LdaModel(corpus_train_combined, num_topics=num_topics, id2word=dictionary_train_combined, passes=10)

    log_perplexity_combined = lda_model_combined.log_perplexity(corpus_test_combined)
    perplexity_combined = np.exp(-log_perplexity_combined / len(corpus_test_combined))
    print(perplexity_combined)
    perplexity_scores_combined.append(perplexity_combined)

best_num_topics_combined = num_topics_list[np.argmin(perplexity_scores_combined)]
print(f"Le nombre optimal de topics pour les mots combinés est : {best_num_topics_combined}")

topics = lda_model_combined.show_topics(num_topics=num_topics, formatted=False)
for topic in topics:
    print(topic)


165750
54976
11601
4259
1.0045971360282582
1.0046617386295658
1.0047139941609777
1.0047445885498818
1.004786085999776
1.004816658726013
1.0048456938822585
1.0048652216406742
Le nombre optimal de topics pour les mots combinés est : 1
(0, [('line', 0.024931056), ('file', 0.018267043), ('def', 0.01282137), ('test', 0.01086217), ('method', 0.009485462), ('return', 0.009122204), ('call', 0.008327333), ('end', 0.007862711), ('event', 0.0074588945), ('function', 0.0067801434)])
(1, [('like', 0.013032735), ('would', 0.011303859), ('use', 0.010486804), ('using', 0.0099492455), ('way', 0.007725866), ('im', 0.007699554), ('one', 0.0067669563), ('code', 0.0063188914), ('need', 0.0048392485), ('know', 0.0045391014)])
(2, [('1', 0.04431643), ('0', 0.029987337), ('2', 0.024028843), ('3', 0.016465899), ('x', 0.011571607), ('data', 0.010405485), ('4', 0.009918827), ('import', 0.00852962), ('array', 0.007948), ('using', 0.007145466)])
(3, [('int', 0.02178521), ('c', 0.018224802), ('function', 0.01644537

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

# Définir le nombre de topics
num_topics = best_num_topics

# LDA sur le titre
lda_title = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_title.fit(X_title_train)

# LDA sur le body
lda_body = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_body.fit(X_body_train)

M_topics_words = lda_title.components_
M_quest_topics_train = lda_title.transform(X_title_train)

# Afficher les mots associés à chaque topic pour le titre
print("Mots associés à chaque topic pour le titre:")
for topic_idx, topic in enumerate(lda_title.components_):
    top_words_idx = topic.argsort()[:-10 - 1:-1]  # Sélectionner les 10 meilleurs mots pour chaque topic
    top_words = [feature_names_title[i] for i in top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

# Afficher les mots associés à chaque topic pour le corps
#print("\nMots associés à chaque topic pour le corps:")
#for topic_idx, topic in enumerate(lda_body.components_):
 #   top_words_idx = topic.argsort()[:-10 - 1:-1]  # Sélectionner les 10 meilleurs mots pour #chaque topic
   # top_words = [feature_names_body[i] for i in top_words_idx]
    #print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

# Transformez les données de test en distributions de topics
X_title_test_topics = lda_title.transform(X_title_test)
X_body_test_topics = lda_body.transform(X_body_test)


NameError: name 'X_title_train' is not defined

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.lda_model

# Visualisation pour le titre
pyLDAvis.enable_notebook()
vis_title = pyLDAvis.lda_model.prepare(lda_title, X_title_test, vectorizer_title, mds='tsne')
pyLDAvis.display(vis_title)

# Visualisation pour le body
# vis_body = pyLDAvis.sklearn.prepare(lda_body, X_body_test, vectorizer_body, mds='tsne')
# pyLDAvis.display(vis_body)


In [None]:
num_topics = M_topics_words.shape[0]
num_top_words = 10

for topic_idx in range(num_topics):
    top_words_idx = M_topics_words[topic_idx].argsort()[:-num_top_words - 1:-1]
    top_words = [feature_names_title[i] for i in top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")


In [None]:
num_questions = M_quest_topics_train.shape[0]
num_top_topics = 3

for question_idx in range(num_questions):
    top_topics_idx = M_quest_topics_train[question_idx].argsort()[:-num_top_topics - 1:-1]
    print(f"Question {question_idx + 1} - Topics Principaux : {', '.join(map(str, top_topics_idx))}")
