In [None]:
import pandas as pd

# Path of the CSV file to analyse; replace it with the real location of your file.
chemin_fichier = 'flickr_data2.csv'

# Read the CSV file. A failure is reported and then re-raised: swallowing it
# would leave `data` undefined and make every later cell fail with a NameError
# that hides the real cause.
try:
    data = pd.read_csv(chemin_fichier, sep=',')
    print("Fichier importé avec succès !")
except Exception as e:
    print(f"Une erreur est survenue : {e}")
    raise

In [None]:
# Overview of the raw dataset: structure, size, missing values, duplicates, first rows.
print("### Informations générales sur le dataset ###")
print(data.info())

# Total number of rows and columns
n_rows, n_cols = data.shape
print("\nNombre total de lignes et colonnes :")
print(f"Lignes : {n_rows}, Colonnes : {n_cols}")

# Descriptive statistics (only the header is printed; the describe() call is disabled)
print("\n### Statistiques descriptives ###")
#print(data.describe(include='all'))  # Inclut toutes les colonnes

# Missing values per column, as counts and then as a percentage of the rows
null_counts = data.isnull().sum()
print("\n### Nombre de valeurs nulles par colonne ###")
print(null_counts)

print("\n### Pourcentage de valeurs nulles par colonne ###")
print((null_counts / n_rows) * 100)

# Number of fully duplicated rows
print("\n### Nombre de doublons ###")
print(data.duplicated().sum())

# First rows
print("\n### Aperçu des premières lignes ###")
print(data.head())


In [None]:

# Repair the rows whose values spilled into the three trailing columns.
# The column moves below are order-dependent: each assignment reads a column
# that has not been overwritten yet, so the statements must stay in this order.
try:
    # Drop rows that are exact duplicates across all columns
    data_sans_doublons = data.drop_duplicates()
    print("Les doublons ont été supprimés du dataset.")

    # Names of the last three columns
    dernieres_colonnes = data_sans_doublons.columns[-3:]
    print("### Les 3 dernières colonnes identifiées sont : ###")
    print(dernieres_colonnes)

    # Boolean mask: rows where at least one of the last three columns is non-null
    lignes_problemes = data_sans_doublons[dernieres_colonnes].notnull().any(axis=1)

    # Explicit copy: the assignments below must modify an independent frame,
    # not a slice of `data_sans_doublons` (avoids SettingWithCopyWarning).
    lignes_problemes_df = data_sans_doublons[lignes_problemes].copy()

    # Rebuild the title from ' title' and ' date_upload_minute'
    # (presumably the title was split across both columns — TODO confirm)
    lignes_problemes_df['title_date_concat'] = (
        lignes_problemes_df[' title'].astype(str) + '_:' + lignes_problemes_df[' date_upload_minute'].astype(str)
    )

    # Shift the upload-date columns one position to the left
    lignes_problemes_df.loc[:, ' date_upload_minute'] = lignes_problemes_df[' date_upload_hour']
    lignes_problemes_df.loc[:, ' date_upload_hour'] = lignes_problemes_df[' date_upload_day']
    lignes_problemes_df.loc[:, ' date_upload_day'] = lignes_problemes_df[' date_upload_month']
    lignes_problemes_df.loc[:, ' date_upload_month'] = lignes_problemes_df[' date_upload_year']
    lignes_problemes_df.loc[:, ' date_upload_year'] = lignes_problemes_df['Unnamed: 16']
    lignes_problemes_df.loc[:, ' title'] = lignes_problemes_df['title_date_concat']

    # Rotate the taken-date columns: each takes the value of the next one and
    # ' date_taken_year' receives the original ' date_taken_minute' (kept in 'annee')
    lignes_problemes_df['annee'] = lignes_problemes_df[' date_taken_minute']
    lignes_problemes_df.loc[:, ' date_taken_minute'] = lignes_problemes_df[' date_taken_hour']
    lignes_problemes_df.loc[:, ' date_taken_hour'] = lignes_problemes_df[' date_taken_day']
    lignes_problemes_df.loc[:, ' date_taken_day'] = lignes_problemes_df[' date_taken_month']
    lignes_problemes_df.loc[:, ' date_taken_month'] = lignes_problemes_df[' date_taken_year']
    lignes_problemes_df.loc[:, ' date_taken_year'] = lignes_problemes_df['annee']

    # Remove the helper columns and the three overflow columns
    lignes_problemes_df = lignes_problemes_df.drop(
        columns=['annee', 'title_date_concat', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18']
    )

    # IDs to discard from the repaired rows
    ids_a_supprimer = [8744184885, 8715425964]

    # Keep only the repaired rows whose ID is not in that list
    lignes_problemes_df = lignes_problemes_df[~lignes_problemes_df['id'].isin(ids_a_supprimer)]

except Exception as e:
    # Report and re-raise: continuing would leave the frames above missing or
    # half-modified for the following cells.
    print(f"Une erreur est survenue : {e}")
    raise


In [None]:
# Show the column index of the repaired rows to check which columns remain after the clean-up.
print(lignes_problemes_df.columns)

In [None]:
# Rebuild the base dataset: untouched rows + repaired rows.
ids_problemes = lignes_problemes_df['id'].tolist()  # IDs of the repaired rows

# The IDs in `ids_a_supprimer` were filtered out of `lignes_problemes_df`, so
# they are absent from `ids_problemes`. Exclude them explicitly, otherwise
# their unrepaired rows would remain in `df_base`.
ids_exclus = set(ids_problemes) | set(ids_a_supprimer)

# Non-inplace drop: the filtered frame is a slice of `data_sans_doublons`
df_base = (
    data_sans_doublons[~data_sans_doublons['id'].isin(ids_exclus)]
    .drop(columns=['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'])
)

# Append the repaired rows to the base DataFrame
df_base = pd.concat([df_base, lignes_problemes_df], ignore_index=True)


In [None]:
# Overview of the cleaned dataset: structure, size, missing values, duplicates, first rows.
print("### Informations générales sur le dataset ###")
print(df_base.info())

# Total number of rows and columns
n_rows, n_cols = df_base.shape
print("\nNombre total de lignes et colonnes :")
print(f"Lignes : {n_rows}, Colonnes : {n_cols}")

# Descriptive statistics (only the header is printed; the describe() call is disabled)
print("\n### Statistiques descriptives ###")
#print(data.describe(include='all'))  # Inclut toutes les colonnes

# Missing values per column, as counts and then as a percentage of the rows
null_counts = df_base.isnull().sum()
print("\n### Nombre de valeurs nulles par colonne ###")
print(null_counts)

print("\n### Pourcentage de valeurs nulles par colonne ###")
print((null_counts / n_rows) * 100)

# Number of fully duplicated rows
print("\n### Nombre de doublons ###")
print(df_base.duplicated().sum())

# First rows
print("\n### Aperçu des premières lignes ###")
print(df_base.head())

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

# Run DBSCAN on the coordinates
def perform_dbscan_analysis(data, eps, min_samples):
    """Cluster the rows of `data` with DBSCAN on the " lat" and " long" columns.

    The labels are written into a "cluster" column of `data` itself (the frame
    is modified in place), the number of clusters and of noise points (label
    -1) is printed, and the same frame is returned.
    """
    coordinates = data[[" lat", " long"]]
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coordinates)
    data["cluster"] = labels

    # Label -1 marks noise and is not counted as a cluster
    n_clusters = len(set(labels) - {-1})
    n_noise = int((labels == -1).sum())

    print(f"Nombre de clusters détectés : {n_clusters}")
    print(f"Nombre de points isolés (bruit) : {n_noise}")

    return data


# Subdivide the largest cluster
def subdivide_largest_cluster(data, eps, min_samples):
    """Re-cluster the largest cluster of `data` with a finer DBSCAN pass.

    Parameters:
        data: DataFrame with " lat", " long" and "cluster" columns.
        eps, min_samples: DBSCAN parameters used for the sub-clustering.

    Returns:
        A new DataFrame with the same rows as `data`. The rows of the largest
        cluster carry fresh cluster IDs (offset past the current maximum so
        they cannot collide with existing IDs), or -1 when the finer pass
        marks them as noise. The input frame is not modified. When `data`
        holds no cluster other than noise, it is returned unchanged.
    """
    cluster_sizes = data.loc[data["cluster"] != -1, "cluster"].value_counts()
    if cluster_sizes.empty:
        print("No cluster to subdivide (only noise points).")
        return data

    largest_cluster = cluster_sizes.idxmax()
    print(f"Subdivising cluster {largest_cluster}...")

    # Positional mask, so the write-back below is safe even with a non-unique index
    in_largest = (data["cluster"] == largest_cluster).to_numpy()

    # Apply DBSCAN to the largest cluster only (on a copy: the helper writes its labels in place)
    refined = perform_dbscan_analysis(
        data[in_largest].copy(), eps=eps, min_samples=min_samples
    )

    # Shift the new IDs past the current maximum; noise stays -1
    offset = data["cluster"].max() + 1
    refined_labels = refined["cluster"]
    new_labels = refined_labels.where(refined_labels == -1, refined_labels + offset)

    # Relabel the rows in place of the old cluster instead of appending them:
    # appending would keep each of these rows twice (once as noise, once refined).
    result = data.copy()
    result.loc[in_largest, "cluster"] = new_labels.to_numpy()

    return result


# Scatter plot of the clusters, one colour per cluster
def plot_clusters_with_matplotlib(data):
    """Draw a longitude/latitude scatter plot with one colour per cluster.

    Cluster IDs are sorted and mapped onto the 20-colour "tab20" palette
    (colours repeat beyond 20 IDs); noise points (cluster -1) are drawn in
    black and labelled "Noise".
    """
    plt.figure(figsize=(10, 8))

    palette = plt.get_cmap("tab20")
    cluster_ids = sorted(data["cluster"].unique())

    for position, cluster_id in enumerate(cluster_ids):
        is_noise = cluster_id == -1
        members = data[data["cluster"] == cluster_id]

        plt.scatter(
            members[" long"],
            members[" lat"],
            s=20,
            c=["black" if is_noise else palette(position % 20)],
            label="Noise" if is_noise else f"Cluster {cluster_id}",
        )

    plt.title("Visualisation des clusters DBSCAN")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.legend(loc="best", fontsize="small", bbox_to_anchor=(1.05, 1))
    plt.grid()
    plt.show()


# Drop the noise points
def remove_noise(data):
    """Return the rows of `data` whose "cluster" label is not -1 and print how many remain."""
    is_clustered = data["cluster"].ne(-1)
    filtered_data = data.loc[is_clustered]
    print(f"Data after removing noise: {len(filtered_data)} rows.")
    return filtered_data


In [None]:

# Number of rows drawn for clustering. It is capped at the dataset size because
# DataFrame.sample raises when asked for more rows than exist (without
# replacement). The seed makes the sample identical from one run to the next.
SAMPLE_SIZE = 100000
SAMPLE_SEED = 42

df_sample = df_base.sample(n=min(SAMPLE_SIZE, len(df_base)), random_state=SAMPLE_SEED)


if not df_sample.empty:
    # Perform initial DBSCAN (this also adds the "cluster" column to df_sample itself)
    clustered_data = perform_dbscan_analysis(df_sample, eps=0.0061, min_samples=4)

    # Plot initial clusters
    plot_clusters_with_matplotlib(clustered_data)

    # Subdivide the largest cluster twice, with a smaller eps each time
    clustered_data = subdivide_largest_cluster(
        clustered_data, eps=0.001, min_samples=4
    )
    clustered_data = subdivide_largest_cluster(
        clustered_data, eps=0.0006, min_samples=4
    )

    # Plot clusters after subdivision
    plot_clusters_with_matplotlib(clustered_data)

    # # Save clustered data
    # clustered_data.to_csv(output_file_path_clusters, index=False)
    # print(f"Clustered data saved to {output_file_path_clusters}")

    clustered_data_no_noise = remove_noise(clustered_data)

    # # Remove noise and save
    # clustered_data_no_noise.to_csv(output_file_path_no_noise, index=False)
    # print(f"Clustered data without noise saved to {output_file_path_no_noise}")

    # Plot clusters without noise
    plot_clusters_with_matplotlib(clustered_data_no_noise)


In [None]:
# Cluster statistics: number of rows per cluster label, largest first.
# NOTE(review): this reads the labels stored on `df_sample`, not `clustered_data` — confirm which one is intended.
print("\n### Statistiques des clusters : ###")
cluster_sizes = df_sample['cluster'].value_counts()
clusters_statistiques = cluster_sizes.sort_values(ascending=False)

# One formatted line per cluster
for cluster_id, count in zip(clusters_statistiques.index, clusters_statistiques.values):
    print(f"Cluster {cluster_id} a {count} lignes.")


In [None]:
from collections import Counter
import re
# Words to exclude from the counts (stopwords and dataset-specific terms)
stopwords = {
    'de', 'chaos', '[lyon', 'france]', ',', 'biennale', 'paper', 'abode',
    'pasted', '2011', 'of', 'the', 'by', 'thierry', 'ehrmann', 'france',
    'la', 'du', 'et', 'des', 'le', 'les', 'à', 'en', 'un', 'une', 'pour',
    'avec', 'dans', 'sur', 'par', 'au', 'aux', 'ce', 'cet', 'cette', 'son',
    'sa', 'se', 'ou', 'lyon',
}

# Tokenise a text column after removing apostrophes and a few symbols

def tokenize_column(column, excluded=None):
    """Split a text column into lowercase words, dropping excluded words.

    All non-missing values of `column` are joined, commas become spaces, and
    the characters ' ’ + % - / : are deleted before splitting on whitespace.

    Parameters:
        column: pandas Series of strings.
        excluded: collection of lowercase words to drop; defaults to the
            module-level `stopwords` set.

    Returns:
        list of lowercase words, in order of appearance.
    """
    if excluded is None:
        excluded = stopwords
    text = column.str.cat(sep=', ').replace(',', ' ')
    # The hyphen is escaped on purpose: written as `%-/` inside the class it
    # would be a character range that also removes & ( ) * , and .
    text = re.sub(r"['’+%\-/:]", "", text)
    lowered_words = (word.lower() for word in text.split())
    return [word for word in lowered_words if word not in excluded]

# Tokenise the tags and the titles of the sample
tags_words, title_words = (
    tokenize_column(df_sample[col_name]) for col_name in (' tags', ' title')
)

# Word frequencies once the excluded words have been removed
tags_counter, title_counter = Counter(tags_words), Counter(title_words)

# Bar chart of the most frequent words
def plot_word_frequencies(counter, column_name):
    """Plot the 10 most frequent words of `counter` as a bar chart.

    Parameters:
        counter: collections.Counter mapping words to frequencies.
        column_name: name of the source column, used in the chart title.

    Nothing is drawn when `counter` is empty (unpacking `zip(*[])` would
    otherwise raise a ValueError).
    """
    most_common = counter.most_common(10)
    if not most_common:
        return
    words, counts = zip(*most_common)
    plt.figure(figsize=(10, 5))
    plt.bar(words, counts)
    plt.title(f"Top 10 mots les plus fréquents dans {column_name}")
    plt.xlabel("Mots")
    plt.ylabel("Fréquence")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Draw one bar chart per counter: first the tag words, then the title words.
plot_word_frequencies(tags_counter, 'tags')
plot_word_frequencies(title_counter, 'title')

In [None]:
# Tokeniser that strips apostrophes, hyphens, slashes and colons.
# NOTE: `tokenize_column` is defined again further down in this cell; that
# later definition is the one in effect once the whole cell has run.
def tokenize_column(column):
    """Join the column's values, delete ' ’ - / : and return the lowercase words not in `stopwords`."""
    joined = column.str.cat(sep=', ').replace(',', ' ')
    cleaned = re.sub(r"['’\-/:]", "", joined)
    return [token.lower() for token in cleaned.split() if token.lower() not in stopwords]

# Words to exclude from the counts (stopwords and dataset-specific terms)
stopwords = {
    'de', 'chaos', '[lyon', 'france]', ',', 'biennale', 'paper', 'abode',
    'pasted', '2011', 'of', 'the', 'by', 'france', 'la', 'du', 'et', 'des',
    'le', 'les', 'à', 'en', 'un', 'une', 'pour', 'avec', 'dans', 'sur',
    'par', 'au', 'aux', 'ce', 'cet', 'cette', 'son', 'sa', 'se', 'ou',
    'lyon',
}

# Tokeniser that also drops one-character words and words containing a digit
def tokenize_column(column):
    """Return the lowercase words of a text column after filtering.

    The column's values are joined, commas become spaces and the characters
    ' ’ - + / : are deleted. A word is kept only if its lowercase form is not
    in `stopwords`, it is longer than one character and it contains no digit.
    """
    text = column.str.cat(sep=', ').replace(',', ' ')
    text = re.sub(r"['’\-+/:]", "", text)

    kept = []
    for word in text.split():
        lowered = word.lower()
        if lowered in stopwords:
            continue
        if len(word) <= 1:
            continue
        if any(char.isdigit() for char in word):
            continue
        kept.append(lowered)
    return kept

# Most frequent words of each cluster
def get_top_words_by_cluster(df, cluster_col, text_col, top_n=10):
    """Map each cluster label to its `top_n` most common words.

    For every distinct value of `df[cluster_col]`, the `text_col` values of
    that cluster are tokenised with `tokenize_column` and counted. Returns a
    dict {label: [(word, count), ...]}.
    """
    labels = df[cluster_col]
    return {
        label: Counter(tokenize_column(df.loc[labels == label, text_col])).most_common(top_n)
        for label in labels.unique()
    }

# Average frequency of the top words over all clusters
def get_average_top_words(cluster_top_words):
    """Sum each word's frequency over the clusters' top lists and divide by the number of clusters.

    `cluster_top_words` maps a cluster label to a list of (word, frequency)
    pairs. A word only contributes for the clusters where it is listed, while
    the divisor is always the total number of clusters. Returns {word: average}.
    """
    cluster_count = len(cluster_top_words)

    totals = Counter()
    all_pairs = (pair for top_words in cluster_top_words.values() for pair in top_words)
    for word, freq in all_pairs:
        totals[word] += freq

    return {word: total / cluster_count for word, total in totals.items()}

# Bar chart of the most frequent words of one cluster
def plot_word_frequencies(counter, cluster_name):
    """Plot the 10 most common entries of `counter` as a bar chart titled with `cluster_name`.

    Nothing is drawn when the counter is empty.
    """
    top_entries = counter.most_common(10)
    if not top_entries:
        return

    labels = tuple(word for word, _ in top_entries)
    heights = tuple(count for _, count in top_entries)

    plt.figure(figsize=(10, 5))
    plt.bar(labels, heights)
    plt.title(f"Top 10 mots les plus fréquents dans le cluster {cluster_name}")
    plt.xlabel("Mots")
    plt.ylabel("Fréquence")
    plt.xticks(rotation=45)
    plt.show()

def plot_average_word_frequencies(average_top_words):
    """Plot the 10 words with the highest average frequency as a bar chart.

    Parameters:
        average_top_words: dict mapping a word to its average frequency.

    Nothing is drawn when the mapping is empty (unpacking `zip(*[])` would
    otherwise raise a ValueError).
    """
    if not average_top_words:
        return
    sorted_avg = sorted(average_top_words.items(), key=lambda x: x[1], reverse=True)
    words, counts = zip(*sorted_avg[:10])
    plt.figure(figsize=(10, 5))
    plt.bar(words, counts)
    plt.title("Moyenne des fréquences des mots les plus communs sur tous les clusters")
    plt.xlabel("Mots")
    plt.ylabel("Fréquence moyenne")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Top title words per cluster of the sample, and their average over all clusters
cluster_top_words = get_top_words_by_cluster(df_sample, 'cluster', ' title')
average_top_words = get_average_top_words(cluster_top_words)

# Print and plot the top words of each cluster
print("\nTop mots par cluster:")
for cluster, words in cluster_top_words.items():
    print(f"Cluster {cluster}: {words}")
    # Rebuild a Counter from the (word, count) pairs so the plot helper can rank them
    plot_word_frequencies(Counter(dict(words)), cluster)

plot_average_word_frequencies(average_top_words)


In [None]:
# Importation de spaCy
import spacy

# Load spaCy's small French pipeline
nlp = spacy.load("fr_core_news_sm")

# Raise the pipeline's text-length limit so long texts are not rejected
nlp.max_length = 3_000_000

# Tokenise a text column with spaCy, processing the text chunk by chunk

def spacy_tokenize_column(column, chunk_size=500000):
    """Return the lemmas of a text column, computed with the spaCy pipeline `nlp`.

    The non-missing values of `column` are joined with spaces and processed in
    chunks of at most `chunk_size` characters. Each chunk ends on a space when
    one exists, so a word is not cut in two at a chunk boundary. Stop words,
    punctuation, number-like tokens and whitespace tokens are discarded.

    Parameters:
        column: pandas Series of strings.
        chunk_size: maximum number of characters per chunk (must be positive).

    Returns:
        list of lemma strings.

    Raises:
        ValueError: if `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text = column.str.cat(sep=' ')
    tokens = []
    start, text_len = 0, len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Back up to the last space so the chunk ends between two words
            last_space = text.rfind(' ', start, end)
            if last_space > start:
                end = last_space
        doc = nlp(text[start:end])
        tokens.extend(
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
        )
        start = end
    return tokens

# Most frequent lemmas of each cluster, using spaCy
def get_top_words_by_cluster_spacy(df, cluster_col, text_col, top_n=10):
    """Map each cluster label to its `top_n` most common spaCy tokens.

    For every distinct value of `df[cluster_col]`, the `text_col` values of
    that cluster are passed to `spacy_tokenize_column` and counted. Returns a
    dict {label: [(token, count), ...]}.
    """
    labels = df[cluster_col]
    return {
        label: Counter(spacy_tokenize_column(df.loc[labels == label, text_col])).most_common(top_n)
        for label in labels.unique()
    }

# Average frequency of the top words over all clusters
def get_average_top_words_spacy(cluster_top_words):
    """Sum each word's frequency over the clusters' top lists and divide by the number of clusters.

    `cluster_top_words` maps a cluster label to a list of (word, frequency)
    pairs. A word only contributes for the clusters where it is listed, while
    the divisor is always the total number of clusters. Returns {word: average}.
    """
    cluster_count = len(cluster_top_words)

    totals = Counter()
    all_pairs = (pair for top_words in cluster_top_words.values() for pair in top_words)
    for word, freq in all_pairs:
        totals[word] += freq

    return {word: total / cluster_count for word, total in totals.items()}

# Top title lemmas per cluster of the sample, and their average over all clusters
cluster_top_words_spacy = get_top_words_by_cluster_spacy(df_sample, 'cluster', ' title')
average_top_words_spacy = get_average_top_words_spacy(cluster_top_words_spacy)

# Print the top words of each cluster
print("\nTop mots par cluster (spaCy):")
for cluster, words in cluster_top_words_spacy.items():
    print(f"Cluster {cluster}: {words}")

# Print the averages, highest first
print("\nMoyenne des fréquences des mots les plus communs sur tous les clusters (spaCy):")
ranked_averages = sorted(average_top_words_spacy.items(), key=lambda item: item[1], reverse=True)
for word, avg_freq in ranked_averages:
    print(f"{word}: {avg_freq:.2f}")


In [None]:

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Heatmap of the word frequencies per cluster
def plot_word_heatmap(cluster_top_words):
    """Draw a heatmap with one row per word, one column per cluster and the frequency as value.

    `cluster_top_words` maps a cluster label to a list of (word, frequency)
    pairs; a word missing from a cluster's list is shown as 0.
    """
    records = [
        [cluster, word, freq]
        for cluster, words in cluster_top_words.items()
        for word, freq in words
    ]

    df_heatmap = pd.DataFrame(records, columns=['Cluster', 'Mot', 'Fréquence'])
    df_pivot = df_heatmap.pivot(index='Mot', columns='Cluster', values='Fréquence')
    df_pivot = df_pivot.fillna(0)

    plt.figure(figsize=(12, 8))
    sns.heatmap(df_pivot, cmap="Blues", annot=True, fmt=".1f")
    plt.title("Heatmap des mots les plus fréquents par cluster")
    plt.xlabel("Cluster")
    plt.ylabel("Mot")
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.show()

# Top title lemmas per cluster of the sample, and their average over all clusters
cluster_top_words_spacy = get_top_words_by_cluster_spacy(df_sample, 'cluster', ' title')
average_top_words_spacy = get_average_top_words_spacy(cluster_top_words_spacy)

# Print the top words of each cluster
print("\nTop mots par cluster (spaCy):")
for cluster, words in cluster_top_words_spacy.items():
    print(f"Cluster {cluster}: {words}")

# Print the averages, highest first
print("\nMoyenne des fréquences des mots les plus communs sur tous les clusters (spaCy):")
ranked_averages = sorted(average_top_words_spacy.items(), key=lambda item: item[1], reverse=True)
for word, avg_freq in ranked_averages:
    print(f"{word}: {avg_freq:.2f}")

# Heatmap of the words per cluster
plot_word_heatmap(cluster_top_words_spacy)


In [None]:
# Importation de spaCy et des bibliothèques de visualisation
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

# Load spaCy's small French pipeline
nlp = spacy.load("fr_core_news_sm")

# Raise the pipeline's text-length limit so long texts are not rejected
nlp.max_length = 3_000_000

# Tokenise a text column with spaCy, processing the text chunk by chunk
def spacy_tokenize_column(column, chunk_size=500000):
    """Return the lemmas of a text column, computed with the spaCy pipeline `nlp`.

    The non-missing values of `column` are joined with spaces and processed in
    chunks of at most `chunk_size` characters. Each chunk ends on a space when
    one exists, so a word is not cut in two at a chunk boundary. Stop words,
    punctuation, number-like tokens and whitespace tokens are discarded.

    Parameters:
        column: pandas Series of strings.
        chunk_size: maximum number of characters per chunk (must be positive).

    Returns:
        list of lemma strings.

    Raises:
        ValueError: if `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text = column.str.cat(sep=' ')
    tokens = []
    start, text_len = 0, len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Back up to the last space so the chunk ends between two words
            last_space = text.rfind(' ', start, end)
            if last_space > start:
                end = last_space
        doc = nlp(text[start:end])
        tokens.extend(
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
        )
        start = end
    return tokens

# Most frequent lemmas of each cluster, using spaCy
def get_top_words_by_cluster_spacy(df, cluster_col, text_col, top_n=10):
    """Map each cluster label to its `top_n` most common spaCy tokens.

    For every distinct value of `df[cluster_col]`, the `text_col` values of
    that cluster are passed to `spacy_tokenize_column` and counted. Returns a
    dict {label: [(token, count), ...]}.
    """
    labels = df[cluster_col]
    return {
        label: Counter(spacy_tokenize_column(df.loc[labels == label, text_col])).most_common(top_n)
        for label in labels.unique()
    }

# Average frequency of the top words over all clusters
def get_average_top_words_spacy(cluster_top_words):
    """Sum each word's frequency over the clusters' top lists and divide by the number of clusters.

    `cluster_top_words` maps a cluster label to a list of (word, frequency)
    pairs. A word only contributes for the clusters where it is listed, while
    the divisor is always the total number of clusters. Returns {word: average}.
    """
    cluster_count = len(cluster_top_words)

    totals = Counter()
    all_pairs = (pair for top_words in cluster_top_words.values() for pair in top_words)
    for word, freq in all_pairs:
        totals[word] += freq

    return {word: total / cluster_count for word, total in totals.items()}

# Word cloud of the most frequent words
def plot_word_cloud(average_top_words):
    """Render a word cloud where each word is weighted by its frequency.

    Parameters:
        average_top_words: dict mapping a word to its (average) frequency.

    Nothing is drawn when the mapping is empty: building a word cloud from
    zero words raises an error, so that case is reported and skipped.
    """
    if not average_top_words:
        print("Aucun mot à afficher dans le word cloud.")
        return
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(average_top_words)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Word Cloud des mots les plus fréquents")
    plt.show()

# Top title lemmas per cluster of the sample, and their average over all clusters
cluster_top_words_spacy = get_top_words_by_cluster_spacy(df_sample, 'cluster', ' title')
average_top_words_spacy = get_average_top_words_spacy(cluster_top_words_spacy)

# Print the top words of each cluster
print("\nTop mots par cluster (spaCy):")
for cluster, words in cluster_top_words_spacy.items():
    print(f"Cluster {cluster}: {words}")

# Print the averages, highest first
print("\nMoyenne des fréquences des mots les plus communs sur tous les clusters (spaCy):")
ranked_averages = sorted(average_top_words_spacy.items(), key=lambda item: item[1], reverse=True)
for word, avg_freq in ranked_averages:
    print(f"{word}: {avg_freq:.2f}")

# Word cloud of the averaged frequencies
plot_word_cloud(average_top_words_spacy)


In [None]:
# Importation de spaCy et des bibliothèques de visualisation
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud
from collections import Counter
from itertools import islice

# Load spaCy's small French pipeline
nlp = spacy.load("fr_core_news_sm")

# Raise the pipeline's text-length limit so long texts are not rejected
nlp.max_length = 3_000_000

# Extract word groups (n-grams) from a text
def extract_ngrams(text, n=2):
    """Return the n-grams of `text` as space-joined lemma strings.

    The text is processed with the spaCy pipeline `nlp`. Stop words,
    punctuation, number-like tokens and whitespace tokens are removed first,
    then every run of `n` consecutive remaining lemmas forms one n-gram.
    Because filtering happens before grouping, an n-gram can join two words
    that were separated by a removed token in the original text.

    Parameters:
        text: string to analyse.
        n: number of lemmas per group.

    Returns:
        list of strings, one per n-gram, in order of appearance.
    """
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
    ]
    # Zip the token list against itself shifted by 1..n-1 positions
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

# Extract the n-grams of a text column, processing the text chunk by chunk
def spacy_tokenize_column(column, chunk_size=500000, ngram_size=2):
    """Return the n-grams found in a text column.

    The non-missing values of `column` are joined with spaces and passed to
    `extract_ngrams` in chunks of at most `chunk_size` characters. Each chunk
    ends on a space when one exists, so a word is not cut in two at a chunk
    boundary. Since the values are joined into one text, an n-gram can span
    the end of one value and the start of the next.

    Parameters:
        column: pandas Series of strings.
        chunk_size: maximum number of characters per chunk (must be positive).
        ngram_size: number of words per group.

    Returns:
        list of n-gram strings.

    Raises:
        ValueError: if `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text = column.str.cat(sep=' ')
    ngrams = []
    start, text_len = 0, len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Back up to the last space so the chunk ends between two words
            last_space = text.rfind(' ', start, end)
            if last_space > start:
                end = last_space
        ngrams.extend(extract_ngrams(text[start:end], ngram_size))
        start = end
    return ngrams

# Most frequent word groups of each cluster
def get_top_ngrams_by_cluster(df, cluster_col, text_col, ngram_size=2, top_n=10):
    """Map each cluster label to its `top_n` most common n-grams.

    For every distinct value of `df[cluster_col]`, the `text_col` values of
    that cluster are passed to `spacy_tokenize_column` with the requested
    `ngram_size` and counted. Returns a dict {label: [(ngram, count), ...]}.
    """
    labels = df[cluster_col]
    top_by_cluster = {}
    for label in labels.unique():
        cluster_texts = df.loc[labels == label, text_col]
        counts = Counter(spacy_tokenize_column(cluster_texts, ngram_size=ngram_size))
        top_by_cluster[label] = counts.most_common(top_n)
    return top_by_cluster

# Average frequency of the top word groups over all clusters
def get_average_top_ngrams(cluster_top_ngrams):
    """Sum each n-gram's frequency over the clusters' top lists and divide by the number of clusters.

    `cluster_top_ngrams` maps a cluster label to a list of (ngram, frequency)
    pairs. An n-gram only contributes for the clusters where it is listed,
    while the divisor is always the total number of clusters.
    Returns {ngram: average}.
    """
    cluster_count = len(cluster_top_ngrams)

    totals = Counter()
    all_pairs = (pair for top_ngrams in cluster_top_ngrams.values() for pair in top_ngrams)
    for ngram, freq in all_pairs:
        totals[ngram] += freq

    return {ngram: total / cluster_count for ngram, total in totals.items()}

# Word cloud of the most frequent word groups
def plot_word_cloud(average_top_ngrams):
    """Render a word cloud where each n-gram is weighted by its frequency.

    Parameters:
        average_top_ngrams: dict mapping an n-gram to its (average) frequency.

    Nothing is drawn when the mapping is empty: building a word cloud from
    zero words raises an error, so that case is reported and skipped.
    """
    if not average_top_ngrams:
        print("Aucun groupe de mots à afficher dans le word cloud.")
        return
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(average_top_ngrams)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Word Cloud des groupes de mots les plus fréquents")
    plt.show()

# Top title bigrams per cluster of the sample, and their average over all clusters
cluster_top_ngrams_spacy = get_top_ngrams_by_cluster(df_sample, 'cluster', ' title', ngram_size=2)
average_top_ngrams_spacy = get_average_top_ngrams(cluster_top_ngrams_spacy)

# Print the top word groups of each cluster
print("\nTop groupes de mots par cluster (spaCy):")
for cluster, ngrams in cluster_top_ngrams_spacy.items():
    print(f"Cluster {cluster}: {ngrams}")

# Print the averages, highest first
print("\nMoyenne des fréquences des groupes de mots les plus communs sur tous les clusters (spaCy):")
ranked_averages = sorted(average_top_ngrams_spacy.items(), key=lambda item: item[1], reverse=True)
for ngram, avg_freq in ranked_averages:
    print(f"{ngram}: {avg_freq:.2f}")

# Word cloud of the averaged frequencies
plot_word_cloud(average_top_ngrams_spacy)


In [None]:
# Rebuild the base dataset: untouched rows + repaired rows.
ids_problemes = lignes_problemes_df['id'].tolist()  # IDs of the repaired rows

# The IDs in `ids_a_supprimer` were filtered out of `lignes_problemes_df`, so
# they are absent from `ids_problemes`. Exclude them explicitly, otherwise
# their unrepaired rows would remain in `df_base`.
ids_exclus = set(ids_problemes) | set(ids_a_supprimer)

# Non-inplace drop: the filtered frame is a slice of `data_sans_doublons`
df_base = (
    data_sans_doublons[~data_sans_doublons['id'].isin(ids_exclus)]
    .drop(columns=['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'])
)

# Append the repaired rows to the base DataFrame
df_base = pd.concat([df_base, lignes_problemes_df], ignore_index=True)


In [None]:
# Importation de spaCy et des bibliothèques de visualisation
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud
from collections import Counter
from itertools import islice

# Load spaCy's small French pipeline
nlp = spacy.load("fr_core_news_sm")

# Raise the pipeline's text-length limit so long texts are not rejected
nlp.max_length = 3_000_000

# Discard the n-grams dominated by banned words
def filter_ngrams(ngrams, banned_words={"lyon", "france", "paper"}, threshold=0.5):
    """Keep the n-grams whose share of banned words is below `threshold`.

    Each n-gram is split on whitespace and its words are compared, lowercased,
    with `banned_words`. An n-gram is kept when it contains no banned word, or
    when (banned words / total words) is strictly less than `threshold`.
    The relative order of the kept n-grams is preserved.
    """
    def _is_kept(ngram):
        parts = ngram.split()
        hits = len([part for part in parts if part.lower() in banned_words])
        # `hits == 0` is tested first, which also avoids dividing by zero for an empty n-gram
        return hits == 0 or hits / len(parts) < threshold

    return [ngram for ngram in ngrams if _is_kept(ngram)]

# Extract word groups (n-grams) from a text and apply the banned-word filter
def extract_ngrams(text, n=2):
    """Return the filtered n-grams of `text` as space-joined lemma strings.

    The text is processed with the spaCy pipeline `nlp`. Stop words,
    punctuation, number-like tokens and whitespace tokens are removed first,
    then every run of `n` consecutive remaining lemmas forms one n-gram.
    Because filtering happens before grouping, an n-gram can join two words
    that were separated by a removed token in the original text. The result
    is finally passed through `filter_ngrams` with its default settings.

    Parameters:
        text: string to analyse.
        n: number of lemmas per group.

    Returns:
        list of strings, one per retained n-gram, in order of appearance.
    """
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
    ]
    # Zip the token list against itself shifted by 1..n-1 positions
    ngrams = zip(*[tokens[i:] for i in range(n)])
    extracted_ngrams = [" ".join(ngram) for ngram in ngrams]

    return filter_ngrams(extracted_ngrams)

# Extract the n-grams of a text column, processing the text chunk by chunk
def spacy_tokenize_column(column, chunk_size=500000, ngram_size=2):
    """Return the n-grams found in a text column.

    The non-missing values of `column` are joined with spaces and passed to
    `extract_ngrams` in chunks of at most `chunk_size` characters. Each chunk
    ends on a space when one exists, so a word is not cut in two at a chunk
    boundary. Since the values are joined into one text, an n-gram can span
    the end of one value and the start of the next.

    Parameters:
        column: pandas Series of strings.
        chunk_size: maximum number of characters per chunk (must be positive).
        ngram_size: number of words per group.

    Returns:
        list of n-gram strings.

    Raises:
        ValueError: if `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text = column.str.cat(sep=' ')
    ngrams = []
    start, text_len = 0, len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Back up to the last space so the chunk ends between two words
            last_space = text.rfind(' ', start, end)
            if last_space > start:
                end = last_space
        ngrams.extend(extract_ngrams(text[start:end], ngram_size))
        start = end
    return ngrams

# Most frequent word groups per cluster and per month
def get_top_ngrams_by_cluster_month(df, cluster_col, text_col, ngram_size=2, time_col='date_taken', top_n=10):
    """Compute the most common n-grams for every (cluster, month) pair.

    Only the rows whose `time_col` year lies between 2003 and 2019 (inclusive)
    are used. The input frame is not modified.

    Parameters:
        df: DataFrame holding the cluster labels, the texts and the dates.
        cluster_col: name of the column with the cluster labels.
        text_col: name of the text column to analyse.
        ngram_size: number of words per group.
        time_col: name of a datetime column (accessed through `.dt`).
        top_n: number of n-grams kept per (cluster, month).

    Returns:
        dict {cluster: {month number: [(ngram, count), ...]}}.
    """
    # Filter on `time_col` (not a hard-coded column name) so the parameter is honoured
    years = df[time_col].dt.year
    in_range = (years >= 2003) & (years <= 2019)
    # `assign` builds a new frame, so no column is written onto a slice of the caller's data
    df = df.loc[in_range].assign(month=lambda frame: frame[time_col].dt.month)

    cluster_month_ngrams = {}
    for cluster in df[cluster_col].unique():
        cluster_data = df[df[cluster_col] == cluster]
        month_ngrams = {}

        for month, month_data in cluster_data.groupby('month'):
            tokens = spacy_tokenize_column(month_data[text_col], ngram_size=ngram_size)
            month_ngrams[month] = Counter(tokens).most_common(top_n)

        cluster_month_ngrams[cluster] = month_ngrams

    return cluster_month_ngrams

# Month-by-month evolution of the most frequent word groups of one cluster
def plot_ngram_trends_by_month(cluster_month_ngrams, cluster=None, top_n=5):
    """Plot, on a single figure, the monthly frequency of a cluster's top n-grams.

    Parameters:
        cluster_month_ngrams: dict {cluster: {month: [(ngram, count), ...]}}.
        cluster: cluster to plot; when None or unknown, the first cluster of
            the dict is used and its label is printed.
        top_n: number of n-grams (ranked by total count over all months) to draw.

    Nothing is drawn when `cluster_month_ngrams` is empty.
    """
    if not cluster_month_ngrams:
        print("Aucun cluster disponible.")
        return

    if cluster is None or cluster not in cluster_month_ngrams:
        cluster = next(iter(cluster_month_ngrams))  # fall back to the first available cluster
        print(f"Cluster par défaut sélectionné : {cluster}")

    monthly_ngrams = cluster_month_ngrams[cluster]

    # Rank the n-grams by their total count over all months
    all_ngrams = Counter()
    for ngrams in monthly_ngrams.values():
        for ngram, freq in ngrams:
            all_ngrams[ngram] += freq

    most_common_ngrams = [ngram for ngram, _ in all_ngrams.most_common(top_n)]

    # One series per n-gram, aligned on the sorted months (0 when absent that month)
    months = sorted(monthly_ngrams.keys())
    time_series = {ngram: [] for ngram in most_common_ngrams}
    for month in months:
        ngram_counts = dict(monthly_ngrams[month])
        for ngram in most_common_ngrams:
            time_series[ngram].append(ngram_counts.get(ngram, 0))

    plt.figure(figsize=(12, 6))
    for ngram, counts in time_series.items():
        plt.plot(months, counts, marker='o', label=ngram)

    # Decorate and show once, after every curve is drawn: doing it inside the
    # loop would display the figure after the first curve only.
    plt.xlabel("Mois")
    plt.ylabel("Fréquence")
    plt.title(f"Évolution des groupes de mots les plus fréquents par mois - Cluster {cluster}")
    if time_series:
        plt.legend()
    plt.xticks(ticks=range(1, 13), labels=['Jan', 'Fév', 'Mar', 'Avr', 'Mai', 'Juin', 'Juil', 'Août', 'Sep', 'Oct', 'Nov', 'Déc'])
    plt.show()

# Top bigrams per cluster and per month.
# NOTE(review): `df_base2` is not defined in any cell shown here, and the
# column names 'title' / 'date_taken' differ from the ' title' naming used by
# the other cells — confirm where this frame is built before running.
cluster_month_ngrams_spacy = get_top_ngrams_by_cluster_month(
    df_base2, 'cluster', 'title', ngram_size=2, time_col='date_taken'
)

# Print the results, cluster by cluster and month by month
print("\nTop groupes de mots par cluster et par mois (spaCy):")
print("Clusters disponibles :", cluster_month_ngrams_spacy.keys())
for cluster, month_data in cluster_month_ngrams_spacy.items():
    print(f"Cluster {cluster}:")
    for month, ngrams in month_data.items():
        print(f"  Mois {month}: {ngrams}")

# Plot the n-gram evolution; with cluster=None the function picks the first available cluster
plot_ngram_trends_by_month(cluster_month_ngrams_spacy, cluster=None, top_n=5)


In [None]:
# List the cluster labels for which monthly n-grams were computed.
print("\nClusters disponibles :", cluster_month_ngrams_spacy.keys())


In [None]:
def apply_dbscan(data, eps, min_samples):
    """
    Cluster the rows of `data` with DBSCAN and return the labelled frame.

    The labels are stored in a "cluster" column of `data` itself (the frame is
    modified in place) and a one-line summary is printed.

    Parameters:
        data (pd.DataFrame): The dataset containing the " lat" and " long" columns.
        eps (float): The epsilon value for DBSCAN.
        min_samples (int): The minimum number of samples in a neighborhood.

    Returns:
        pd.DataFrame: The same frame, with the "cluster" column set.
    """
    coordinates = data[[" lat", " long"]].values

    # Fit DBSCAN and attach one label per row
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coordinates)
    data["cluster"] = labels

    # Label -1 marks noise and is not counted as a cluster
    n_clusters = len(set(labels) - {-1})
    n_noise = int((labels == -1).sum())

    print(f"DBSCAN results: {n_clusters} clusters, {n_noise} noise points")

    return data

# Example usage: cluster the sample with the chosen DBSCAN parameters
eps = 0.0011  # Optimal eps from the k-distance plot
min_samples = 4  # Optimal min_samples from experimentation
clustered_data = apply_dbscan(df_sample, eps=eps, min_samples=min_samples)

def remove_noise(data):
    """
    Drop the noise points (cluster == -1) and report how many rows remain.

    Parameters:
        data (pd.DataFrame): The dataset containing a "cluster" column.

    Returns:
        pd.DataFrame: The rows of `data` whose cluster label is not -1.
    """
    is_clustered = data["cluster"].ne(-1)
    filtered_data = data.loc[is_clustered]
    print(f"Data after removing noise: {len(filtered_data)} rows.")
    return filtered_data

def subdivide_largest_cluster(data, eps, min_samples):
    """
    Split the largest cluster into finer clusters with one extra DBSCAN pass.

    This performs a single subdivision step; call it repeatedly (typically
    with a decreasing ``eps``) to keep refining whichever cluster is largest.

    The rows of the largest cluster are replaced by their re-clustered
    version, so every input point appears exactly once in the result. Points
    that the refinement pass labels -1 stay noise; the other refined labels
    are shifted above the current maximum label to avoid collisions.
    The input frame is not modified.

    Parameters:
        data (pd.DataFrame): The dataset containing latitude, longitude, and
            a "cluster" column (-1 = noise).
        eps (float): The epsilon value for the refinement DBSCAN pass.
        min_samples (int): The minimum number of samples in a neighborhood.

    Returns:
        pd.DataFrame: The dataset with the largest cluster subdivided. The
        refined rows are appended after the untouched ones. If ``data`` has
        no non-noise cluster, it is returned unchanged.
    """
    non_noise_labels = data.loc[data["cluster"] != -1, "cluster"]
    if non_noise_labels.empty:
        # idxmax() raises on an empty Series; there is simply nothing to split.
        print("No cluster left to subdivide.")
        return data

    largest_cluster = non_noise_labels.value_counts().idxmax()
    print(f"Subdividing cluster {largest_cluster}...")

    in_largest = data["cluster"] == largest_cluster

    # NOTE(review): perform_dbscan_analysis is not defined in this cell; it is
    # assumed to return the frame it was given with a fresh "cluster" column.
    refined_cluster_data = perform_dbscan_analysis(
        data[in_largest].copy(), eps=eps, min_samples=min_samples
    )

    # Shift the refined labels past every existing label; keep -1 as noise.
    max_cluster_id = data["cluster"].max()
    refined_labels = refined_cluster_data["cluster"]
    refined_cluster_data["cluster"] = refined_labels.where(
        refined_labels == -1, refined_labels + max_cluster_id + 1
    )

    # Replace the old cluster rows instead of keeping them as extra "noise"
    # copies next to their refined versions.
    return pd.concat([data[~in_largest], refined_cluster_data])

# Example usage: subdivide the currently largest cluster three times with a
# decreasing eps, then drop every point that is still labelled as noise.
for refine_eps in (0.0006, 0.0003, 0.0001):
    clustered_data = subdivide_largest_cluster(
        clustered_data, eps=refine_eps, min_samples=4
    )

clustered_data_no_noise = remove_noise(clustered_data)

In [None]:
# Count the rows of the noise-free DataFrame.
print(clustered_data_no_noise.shape[0])

In [None]:
# Build df_base2: the noise-free clustered points with validated capture dates.
# df_sample is rebound to a copy of the noise-free clustering result.
df_sample = clustered_data_no_noise.copy()
df_base2 = df_sample.copy()
df_base2.columns = df_base2.columns.str.strip()
required_columns = ['date_taken_year', 'date_taken_month', 'date_taken_day']

# Parse the date parts as numbers: values that cannot be parsed become NaN and
# are dropped together with the missing ones, so the integer cast cannot fail
# or silently leave a column with a non-numeric dtype.
df_base2 = (
    df_base2
    .assign(**{col: pd.to_numeric(df_base2[col], errors='coerce') for col in required_columns})
    .dropna(subset=required_columns)
    .astype({col: int for col in required_columns})
)
print("\n### Nombre de valeurs nulles par colonne ###")
print(df_base2.isnull().sum())

# Keep plausible dates only: year in 2004..2099, month in 1..12, day in 1..31.
is_valid_date = (
    (df_base2['date_taken_year'] > 2003)
    & (df_base2['date_taken_year'] < 2100)
    & df_base2['date_taken_month'].between(1, 12)
    & df_base2['date_taken_day'].between(1, 31)
)
# .copy() makes the filtered frame independent, so adding a column below does
# not trigger SettingWithCopyWarning.
df_base2 = df_base2[is_valid_date].copy()

# pd.to_datetime assembles a date from columns named year/month/day;
# impossible combinations (e.g. February 30th) become NaT.
df_temp2 = df_base2[required_columns].rename(
    columns={
        'date_taken_year': 'year',
        'date_taken_month': 'month',
        'date_taken_day': 'day'
    }
)
df_base2['date_taken'] = pd.to_datetime(df_temp2, errors='coerce')

print(df_base2.columns)

In [None]:
# Normalize the column names of df_base (strip surrounding whitespace), then
# drop every row that lacks one of the three date parts.
df_base.columns = df_base.columns.str.strip()
required_columns = ['date_taken_year', 'date_taken_month', 'date_taken_day']
df_base = df_base.dropna(subset=required_columns)
# Show how many nulls remain in each column after the drop.
print("\n### Nombre de valeurs nulles par colonne ###")
print(df_base.isnull().sum())

In [None]:
# Check the dtypes of the date columns after the NaN rows were removed.
print("\n### Types de données après suppression des NaN ###")
print(df_base[required_columns].dtypes)
    

In [None]:
# Cast each date part to int.
# NOTE(review): errors='ignore' suppresses a failed cast, so a column holding
# invalid values presumably keeps its previous dtype without any message —
# verify the resulting dtypes before relying on integer columns.
for col in required_columns:
    df_base[col] = df_base[col].astype(int, errors='ignore')
    

In [None]:
# Inspect the minimum and maximum of each date column to spot out-of-range values.
print("\n### Valeurs min/max dans les colonnes de date ###")
print(df_base[required_columns].agg(['min', 'max']))
    

In [None]:
# Drop rows with invalid dates: keep years strictly between 2003 and 2100,
# months in 1..12 and days in 1..31.
year_ok = df_base['date_taken_year'].gt(2003) & df_base['date_taken_year'].lt(2100)
month_ok = df_base['date_taken_month'].between(1, 12)
day_ok = df_base['date_taken_day'].between(1, 31)
df_base = df_base[year_ok & month_ok & day_ok]
    

In [None]:
# Final check of the date column dtypes after filtering.
print("\n### Vérification finale des types de données ###")
print(df_base[required_columns].dtypes)


In [None]:
# Build a single datetime column from the year/month/day parts.
# pd.to_datetime assembles dates from columns named year/month/day, hence the
# rename; impossible combinations (e.g. February 30th) become NaT.
df_temp = df_base[['date_taken_year', 'date_taken_month', 'date_taken_day']].rename(
    columns={
        'date_taken_year': 'year',
        'date_taken_month': 'month',
        'date_taken_day': 'day'
    }
)
# The conversion is done once (it used to be computed twice with identical
# results). assign() returns a new frame, so no column is written onto a
# filtered slice of an earlier frame.
df_base = df_base.assign(date_taken=pd.to_datetime(df_temp, errors='coerce'))


In [None]:
# Tag every photo with the English name of the weekday it was taken on.
df_base['day_of_week'] = df_base['date_taken'].dt.day_name()

# Column order for the heatmaps (Monday first); it is the same for every year.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One month x weekday heatmap of photo counts per year.
years = df_base['date_taken_year'].unique()
for year in sorted(years):
    df_year = df_base.loc[df_base['date_taken_year'] == year]
    counts = df_year.groupby(['date_taken_month', 'day_of_week']).size()
    heatmap_data = counts.unstack().fillna(0)
    # Weekdays without any photo that year still get a (zero) column.
    heatmap_data = heatmap_data.reindex(columns=order, fill_value=0)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap="Blues", annot=True, fmt=".0f", linewidths=0.5, ax=ax)
    ax.set_title(f"Répartition des photos par jour de la semaine et par mois - Année {year}")
    ax.set_xlabel("Jour de la semaine")
    ax.set_ylabel("Mois")
    plt.show()

In [None]:
# Keep the photos taken in 2003 or later.
df_base = df_base.loc[df_base['date_taken_year'] >= 2003]

# One month x day-of-month heatmap of photo counts per year.
years = df_base['date_taken_year'].unique()
for year in sorted(years):
    df_year = df_base.loc[df_base['date_taken_year'] == year]
    heatmap_data = (
        df_year.groupby(['date_taken_month', 'date_taken_day'])
        .size()
        .unstack()
        .fillna(0)
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap="Blues", annot=True, fmt=".0f", linewidths=0.5, ax=ax)
    ax.set_title(f"Répartition des photos par jour du mois - Année {year}")
    ax.set_xlabel("Jour du mois")
    ax.set_ylabel("Mois")
    plt.show()

In [None]:
# Show the column names currently present in df_base.
print(df_base.columns)


In [None]:
import spacy
from collections import Counter
from wordcloud import WordCloud
import io, base64
import folium

# Make sure column names are stripped
df_base.columns = df_base.columns.str.strip()

# Load the French spaCy model (ensure you've installed it with: python -m spacy download fr_core_news_sm)
nlp = spacy.load("fr_core_news_sm")
# Raise the maximum text length the pipeline accepts to 3,000,000 characters
# so that long concatenated texts can be processed in a single call.
nlp.max_length = 3000000

# Function to tokenize a text column using spaCy
def spacy_tokenize_column(column):
    """
    Lemmatize a text column with spaCy and return the meaningful tokens.

    Each non-null entry is processed as its own document through ``nlp.pipe``,
    so the total amount of text is not limited by ``nlp.max_length`` (only a
    single entry longer than that limit could still be rejected).

    Parameters:
        column (pd.Series): Text values; nulls are skipped and the remaining
            values are converted to ``str``.

    Returns:
        list[str]: Lower-cased lemmas, excluding stop words, punctuation,
        number-like tokens and whitespace-only tokens.
    """
    texts = column.dropna().astype(str).tolist()
    tokens = []
    for doc in nlp.pipe(texts):
        tokens.extend(
            token.lemma_.lower()
            for token in doc
            # Whitespace tokens (e.g. runs of spaces inside a title) carry no
            # meaning and would otherwise be counted as "words".
            if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
        )
    return tokens

# Function to get the top N words for each cluster (ignores noise with cluster == -1)
def get_top_words_by_cluster(df, cluster_col, text_col, top_n=10):
    """
    Compute the most frequent words of ``text_col`` for each cluster.

    Rows whose cluster label is -1 (noise) are skipped. The text of a cluster
    is tokenized with ``spacy_tokenize_column``; when that yields no token at
    all, the text is split on whitespace instead.

    Parameters:
        df (pd.DataFrame): Frame holding the cluster labels and the text.
        cluster_col (str): Name of the cluster label column.
        text_col (str): Name of the text column.
        top_n (int): Number of words to keep per cluster.

    Returns:
        dict: Maps each cluster label to a list of ``(word, count)`` pairs,
        most frequent first.
    """
    top_words = {}
    for label in df[cluster_col].unique():
        if label != -1:
            texts = df.loc[df[cluster_col] == label, text_col]
            words = spacy_tokenize_column(texts)
            if not words:
                # Fallback: plain whitespace tokenization.
                words = " ".join(texts.dropna().astype(str).tolist()).split()
            top_words[label] = Counter(words).most_common(top_n)
    return top_words


# Function to generate word clouds from the top words
def generate_wordclouds(cluster_top_words):
    """
    Build one 400x200 white-background WordCloud per cluster.

    Parameters:
        cluster_top_words (dict): Maps a cluster label to a list of
            ``(word, count)`` pairs.

    Returns:
        dict: Maps each cluster label to its WordCloud. A cluster without any
        word gets a cloud generated from the placeholder text "No Data".
    """
    clouds = {}
    for cluster, words in cluster_top_words.items():
        frequencies = dict(words)
        canvas = WordCloud(width=400, height=200, background_color='white')
        if frequencies:
            clouds[cluster] = canvas.generate_from_frequencies(frequencies)
        else:
            # Nothing to draw for this cluster: render a placeholder instead.
            clouds[cluster] = canvas.generate("No Data")
    return clouds



# Helper function to convert a WordCloud image to a base64 string
def wordcloud_to_base64(wc):
    """
    Render a word cloud as a PNG and return it as a base64 text string.

    Parameters:
        wc: Object exposing ``to_image()``; the returned image must provide
            ``save(file, format=...)``.

    Returns:
        str: The base64-encoded PNG bytes, decoded as UTF-8 text.
    """
    with io.BytesIO() as png_buffer:
        wc.to_image().save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return str(base64.b64encode(png_bytes), 'utf-8')

# Generate top words and word clouds using the 'title' column.
# (df_base2 must have a 'cluster' column and a 'title' column.)
cluster_top_words = get_top_words_by_cluster(df_base2, 'cluster', 'title', top_n=10)
wordclouds = generate_wordclouds(cluster_top_words)

# Function to create a Folium map for text visualization.
def create_text_folium_map(df, wordclouds):
    """
    Build a Folium map with one green marker per cluster showing its word cloud.

    The map is centered on the mean of the 'lat' and 'long' columns. Each
    cluster other than -1 (noise) gets a circle marker at its centroid; the
    popup embeds the cluster's word cloud as a base64 PNG when one exists in
    ``wordclouds``, and a short text notice otherwise.

    Parameters:
        df (pd.DataFrame): Frame with 'lat', 'long' and 'cluster' columns.
        wordclouds (dict): Maps a cluster label to its WordCloud.

    Returns:
        folium.Map: The populated map.
    """
    fmap_text = folium.Map(
        location=[df['lat'].mean(), df['long'].mean()],
        zoom_start=12,
    )

    for cluster in df['cluster'].unique():
        if cluster != -1:
            members = df.loc[df['cluster'] == cluster]
            centroid = [members['lat'].mean(), members['long'].mean()]

            if cluster not in wordclouds:
                popup_html = f"<h4>Cluster {cluster}</h4>No wordcloud available"
            else:
                img_b64 = wordcloud_to_base64(wordclouds[cluster])
                popup_html = f"<h4>Cluster {cluster}</h4><img src='data:image/png;base64,{img_b64}' style='width:300px;'>"

            marker = folium.CircleMarker(
                location=centroid,
                radius=8,
                popup=folium.Popup(popup_html, max_width=300),
                color="green",
                fill=True,
            )
            marker.add_to(fmap_text)

    return fmap_text

# Create the text-based map using df_base2 and the computed wordclouds.
fmap_text = create_text_folium_map(df_base2, wordclouds)
# Leaving the map as the last expression of the cell displays it.
fmap_text


In [None]:
import folium
from folium.plugins import TimestampedGeoJson

def create_timestamped_geojson(df):
    """
    Convert the dated rows of ``df`` into a GeoJSON FeatureCollection.

    Rows whose 'date_taken' is missing are skipped. Every other row becomes a
    Point feature at ``[long, lat]`` whose properties hold the ISO-formatted
    date ('time') and a small HTML popup with the row id and date.

    Parameters:
        df (pd.DataFrame): Frame with 'id', 'lat', 'long' and 'date_taken'.

    Returns:
        dict: ``{'type': 'FeatureCollection', 'features': [...]}``.
    """
    def _to_feature(row):
        taken = row['date_taken']
        return {
            'type': 'Feature',
            'properties': {
                'time': taken.isoformat(),
                'popup': f"ID: {row['id']}<br>Date: {taken}",
            },
            'geometry': {
                'type': 'Point',
                # GeoJSON order: longitude first, then latitude.
                'coordinates': [row['long'], row['lat']],
            },
        }

    features = [
        _to_feature(row)
        for _, row in df.iterrows()
        if not pd.isna(row['date_taken'])
    ]
    return {'type': 'FeatureCollection', 'features': features}

def add_timestamped_layer(fmap, geojson):
    """
    Attach a TimestampedGeoJson layer built from ``geojson`` to ``fmap``.

    Parameters:
        fmap: The map the layer is added to.
        geojson (dict): The feature collection handed to TimestampedGeoJson.

    Returns:
        The same ``fmap`` object, with the layer added.
    """
    layer_options = dict(
        period='P1D',  # each time step represents one day
        add_last_point=True,
        auto_play=False,
        loop=False,
        max_speed=1,
        loop_button=True,
        date_options='YYYY-MM-DD',
        time_slider_drag_update=True,
    )
    layer = TimestampedGeoJson(geojson, **layer_options)
    layer.add_to(fmap)
    return fmap

def create_folium_map(df):
    """
    Build a Folium map with a time-slider layer and one marker per dated row.

    The map is centered on the mean of the 'lat' and 'long' columns. Rows
    whose 'date_taken' is missing get no marker.

    Parameters:
        df (pd.DataFrame): Frame with 'id', 'lat', 'long' and 'date_taken'.

    Returns:
        folium.Map: The populated map.
    """
    fmap = folium.Map(
        location=[df['lat'].mean(), df['long'].mean()],
        zoom_start=12,
    )

    # Time-slider layer built from the same rows.
    fmap = add_timestamped_layer(fmap, create_timestamped_geojson(df))

    # Static blue marker for every row that has a date.
    for _, row in df.iterrows():
        taken = row['date_taken']
        if not pd.isna(taken):
            marker = folium.CircleMarker(
                location=[row['lat'], row['long']],
                radius=3,
                popup=f"ID: {row['id']}<br>Date: {taken}",
                color='blue',
                fill=True,
            )
            marker.add_to(fmap)

    return fmap

# Create the map from df_base2 (it must contain 'lat', 'long', 'date_taken', and 'id').
fmap = create_folium_map(df_base2)
# Leaving the map as the last expression of the cell displays it.
fmap


In [None]:
import folium

# Center the map on the mean latitude and longitude of df_base2.
map_center = [df_base2['lat'].mean(), df_base2['long'].mean()]
fmap = folium.Map(location=map_center, zoom_start=12)

# Draw one blue marker per cluster, placed at the centroid of its points.
for cluster in df_base2['cluster'].unique():
    cluster_df = df_base2.loc[df_base2['cluster'] == cluster]
    centroid = cluster_df[['lat', 'long']].mean().tolist()
    marker = folium.CircleMarker(
        location=centroid,
        radius=8,
        popup=f"Cluster {cluster}",
        color="blue",
        fill=True,
        fill_color="blue",
    )
    marker.add_to(fmap)

fmap
