In [None]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Path to the raw CSV file; replace it with the actual location of your file.
chemin_fichier = 'flickr_data2.csv'

# Read the CSV file. A failure is reported and then re-raised: every later cell
# needs `data`, so swallowing the error would only resurface as a NameError.
try:
    data = pd.read_csv(chemin_fichier, sep=',')
    print("Fichier importé avec succès !")
except Exception as e:
    print(f"Une erreur est survenue : {e}")
    raise

In [None]:
# Quick exploratory overview of the raw dataset loaded into `data`.
print("### Informations générales sur le dataset ###")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Total number of rows and columns
print("\nNombre total de lignes et colonnes :")
print(f"Lignes : {data.shape[0]}, Colonnes : {data.shape[1]}")

# Descriptive statistics (the describe() call is intentionally left disabled,
# only the section header is printed)
print("\n### Statistiques descriptives ###")
#print(data.describe(include='all'))  # Would include every column

# Number of null values per column
print("\n### Nombre de valeurs nulles par colonne ###")
print(data.isnull().sum())

# Percentage of null values per column
print("\n### Pourcentage de valeurs nulles par colonne ###")
print((data.isnull().sum() / data.shape[0]) * 100)

# Number of fully duplicated rows
print("\n### Nombre de doublons ###")
print(data.duplicated().sum())

# Preview of the first rows
print("\n### Aperçu des premières lignes ###")
print(data.head())


In [None]:

# Repair the rows that spill into the three trailing columns. Any failure is
# reported and then re-raised, so later cells never run on a half-repaired frame.
try:
    # Drop rows that are exact duplicates across all columns
    data_sans_doublons = data.drop_duplicates()
    print("Les doublons ont été supprimés du dataset.")

    # Names of the last 3 columns
    dernieres_colonnes = data_sans_doublons.columns[-3:]
    print("### Les 3 dernières colonnes identifiées sont : ###")
    print(dernieres_colonnes)

    # Boolean mask: rows where at least one of the last 3 columns is non-null
    lignes_problemes = data_sans_doublons[dernieres_colonnes].notnull().any(axis=1)

    # Explicit copy: the columns below are rewritten, which must not be done on
    # a slice of `data_sans_doublons` (SettingWithCopyWarning / ambiguous writes).
    lignes_problemes_df = data_sans_doublons[lignes_problemes].copy()

    # Temporary column holding the rebuilt title
    # (presumably the title was split by an extra separator -- TODO confirm)
    lignes_problemes_df['title_date_concat'] = (
        lignes_problemes_df[' title'].astype(str)
        + '_:'
        + lignes_problemes_df[' date_upload_minute'].astype(str)
    )

    # (destination, source) pairs applied in order. Order matters: each source
    # is read before it is itself overwritten by a later pair.
    decalage_upload = [
        (' date_upload_minute', ' date_upload_hour'),
        (' date_upload_hour', ' date_upload_day'),
        (' date_upload_day', ' date_upload_month'),
        (' date_upload_month', ' date_upload_year'),
        (' date_upload_year', 'Unnamed: 16'),
        (' title', 'title_date_concat'),
    ]
    for destination, source in decalage_upload:
        lignes_problemes_df.loc[:, destination] = lignes_problemes_df[source]

    # Same rotation for the "date taken" fields; the original minute value is
    # parked in a temporary column and ends up in the year column.
    lignes_problemes_df['annee'] = lignes_problemes_df[' date_taken_minute']
    decalage_taken = [
        (' date_taken_minute', ' date_taken_hour'),
        (' date_taken_hour', ' date_taken_day'),
        (' date_taken_day', ' date_taken_month'),
        (' date_taken_month', ' date_taken_year'),
        (' date_taken_year', 'annee'),
    ]
    for destination, source in decalage_taken:
        lignes_problemes_df.loc[:, destination] = lignes_problemes_df[source]

    # Remove the temporary columns and the three trailing columns
    # (reassignment instead of inplace=True keeps the step re-runnable)
    lignes_problemes_df = lignes_problemes_df.drop(
        columns=['annee', 'title_date_concat', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18']
    )

    # IDs to delete from the repaired rows
    ids_a_supprimer = [8744184885, 8715425964]

    # Drop the rows matching those IDs
    lignes_problemes_df = lignes_problemes_df[~lignes_problemes_df['id'].isin(ids_a_supprimer)]

except Exception as e:
    print(f"Une erreur est survenue : {e}")
    raise


In [None]:
# IDs of every row flagged as malformed by the mask `lignes_problemes`.
# They are taken from the mask rather than from `lignes_problemes_df`, because
# the latter no longer contains the rows that were deliberately deleted; using
# it would leave those malformed rows, unrepaired, in `df_base`.
ids_problemes = data_sans_doublons.loc[lignes_problemes, 'id'].tolist()

# Well-formed rows only, without the three trailing columns. drop() returns a
# new frame, which avoids an in-place modification of a slice.
df_base = (
    data_sans_doublons[~data_sans_doublons['id'].isin(ids_problemes)]
    .drop(columns=['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'])
)

# Append the repaired rows to the base DataFrame
df_base = pd.concat([df_base, lignes_problemes_df], ignore_index=True)


In [None]:
# Show the column index of the merged DataFrame as a sanity check.
print(df_base.columns)

In [None]:
# Keep only the first 4 columns of df_base (positional slice).
df_troncate = df_base.iloc[:, :4]

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np

# Keep only the first 4 columns
df_troncate = df_base.iloc[:, :4]

# Check which columns were kept
print("### Aperçu du DataFrame réduit aux 4 premières colonnes : ###")
print(df_troncate.head())

# Prepare the data for clustering: latitude and longitude as a NumPy array
coords = df_troncate[[' lat', ' long']].to_numpy()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN


# Load the cleaned data
def load_cleaned_data(file_path, nrows=10000):
    """Read the cleaned CSV file into a DataFrame.

    Parameters:
        file_path (str): Path of the CSV file to read.
        nrows (int | None): Maximum number of data rows to read. Defaults to
            10000 (the previously hard-coded limit); pass None to read the
            whole file.

    Returns:
        pd.DataFrame | None: The loaded data, or None when reading fails (the
        error is printed, not raised, so callers must check for None).
    """
    try:
        data = pd.read_csv(file_path, nrows=nrows)
        print(f"Loaded data with {data.shape[0]} rows and {data.shape[1]} columns.")
        return data
    except Exception as e:
        print(f"Error loading cleaned data: {e}")
        return None


# Apply DBSCAN
def perform_dbscan_analysis(data, eps, min_samples):
    """Cluster the rows of `data` on their " lat" / " long" columns with DBSCAN.

    The labels are written to a "cluster" column of `data` itself (the input is
    modified), and the number of clusters and of noise points (label -1) is
    printed.

    Parameters:
        data (pd.DataFrame): Frame with " lat" and " long" columns.
        eps (float): DBSCAN neighbourhood radius.
        min_samples (int): DBSCAN min_samples parameter.

    Returns:
        pd.DataFrame: The same `data` object, with the "cluster" column set.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(
        data[[" lat", " long"]]
    )
    data["cluster"] = labels

    # Distinct labels without the noise marker (-1) are the real clusters
    cluster_ids = set(labels)
    cluster_ids.discard(-1)
    n_clusters = len(cluster_ids)
    n_noise = sum(1 for label in labels if label == -1)

    print(f"Nombre de clusters détectés : {n_clusters}")
    print(f"Nombre de points isolés (bruit) : {n_noise}")

    return data


# Subdivide the largest cluster (one pass per call)
def subdivide_largest_cluster(data, eps, min_samples):
    """Re-cluster the most populated cluster and relabel its points.

    The largest non-noise cluster is passed to `perform_dbscan_analysis` with
    the given parameters. Its points receive new labels shifted above the
    current maximum label so they cannot collide with existing clusters; points
    that the second pass marks as noise get -1.

    The refined labels replace the old ones row by row. No rows are appended:
    the earlier version concatenated the refined rows to the frame, so every
    point of the subdivided cluster appeared twice (once as noise, once
    refined).

    Parameters:
        data (pd.DataFrame): Frame with " lat", " long" and "cluster" columns.
        eps (float): DBSCAN radius used for the subdivision.
        min_samples (int): DBSCAN min_samples used for the subdivision.

    Returns:
        pd.DataFrame: A new frame with the same rows and updated "cluster"
        labels; `data` itself is left untouched. If there is no non-noise
        cluster, `data` is returned unchanged.
    """
    non_noise_labels = data.loc[data["cluster"] != -1, "cluster"]
    if non_noise_labels.empty:
        # Nothing to subdivide; idxmax() on an empty Series would raise
        print("No cluster to subdivide: every point is labelled as noise.")
        return data

    largest_cluster = non_noise_labels.value_counts().idxmax()
    print(f"Subdivising cluster {largest_cluster}...")

    in_largest = data["cluster"] == largest_cluster

    # Apply DBSCAN to a copy of the largest cluster's rows
    refined_cluster_data = perform_dbscan_analysis(
        data[in_largest].copy(), eps=eps, min_samples=min_samples
    )

    # Shift the new labels above the existing ones; noise (-1) stays -1
    max_cluster_id = data["cluster"].max()
    refined_labels = refined_cluster_data["cluster"]
    refined_labels = refined_labels.where(
        refined_labels == -1, refined_labels + max_cluster_id + 1
    )

    # Positional write-back: the refined frame has the same rows, in the same
    # order, as the masked selection, so this is safe even with repeated index
    # labels.
    result = data.copy()
    result.loc[in_largest, "cluster"] = refined_labels.to_numpy()

    return result


# Visualise the clusters with consistent colours
def plot_clusters_with_matplotlib(data):
    """Scatter-plot the points of `data`, one colour and legend entry per cluster.

    Longitude is drawn on the x axis and latitude on the y axis. Cluster labels
    are sorted and coloured from the "tab20" colormap by their position in that
    order (wrapping every 20); the label -1 is drawn in black and listed as
    "Noise".

    Parameters:
        data (pd.DataFrame): Frame with " lat", " long" and "cluster" columns.
    """
    plt.figure(figsize=(10, 8))

    palette = plt.get_cmap("tab20")
    ordered_ids = sorted(data["cluster"].unique())

    for position, cluster_id in enumerate(ordered_ids):
        is_noise = cluster_id == -1
        members = data[data["cluster"] == cluster_id]

        plt.scatter(
            members[" long"],
            members[" lat"],
            s=20,
            # Noise is always black; other clusters use their palette slot
            c=["black" if is_noise else palette(position % 20)],
            label="Noise" if is_noise else f"Cluster {cluster_id}",
        )

    plt.title("Visualisation des clusters DBSCAN")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.legend(loc="best", fontsize="small", bbox_to_anchor=(1.05, 1))
    plt.grid()
    plt.show()


# Remove noise from the data
def remove_noise(data):
    """Return the rows of `data` whose "cluster" label is not -1.

    The number of remaining rows is printed. The input frame is not modified.
    """
    is_clustered = data["cluster"].ne(-1)
    filtered_data = data.loc[is_clustered]
    print(f"Data after removing noise: {len(filtered_data)} rows.")
    return filtered_data


# Main function
def main():
    """Run the DBSCAN pipeline on the cleaned CSV file.

    Steps: load the data, run an initial DBSCAN, plot, subdivide the largest
    cluster twice with decreasing eps, plot again, save the clustered data,
    then remove the noise points, save that result and plot it.

    Nothing is done when the input file cannot be loaded.
    """
    # File paths
    input_file_path = "./data/cleaned_flickr_data.csv"
    output_file_path_clusters = "./data/clustered_flickr_data.csv"
    # Separate file for the noise-free result. Both outputs used to share one
    # path, so the second save overwrote the full clustered data.
    output_file_path_no_noise = "./data/clustered_flickr_data_no_noise.csv"

    # Load data
    df = load_cleaned_data(input_file_path)
    if df is None:
        return

    # Perform initial DBSCAN
    clustered_data = perform_dbscan_analysis(df, eps=0.0061, min_samples=4)

    # Plot initial clusters
    plot_clusters_with_matplotlib(clustered_data)

    # Subdivide the largest cluster twice, with a tighter eps each time
    clustered_data = subdivide_largest_cluster(
        clustered_data, eps=0.001, min_samples=4
    )
    clustered_data = subdivide_largest_cluster(
        clustered_data, eps=0.0006, min_samples=4
    )

    # Plot clusters after subdivision
    plot_clusters_with_matplotlib(clustered_data)

    # Save clustered data
    clustered_data.to_csv(output_file_path_clusters, index=False)
    print(f"Clustered data saved to {output_file_path_clusters}")

    # Remove noise and save
    clustered_data_no_noise = remove_noise(clustered_data)
    clustered_data_no_noise.to_csv(output_file_path_no_noise, index=False)
    print(f"Clustered data without noise saved to {output_file_path_no_noise}")

    # Plot clusters without noise
    plot_clusters_with_matplotlib(clustered_data_no_noise)


In [None]:
from sklearn.neighbors import NearestNeighbors
# Random subset used for parameter tuning. The sample is seeded so the notebook
# gives the same result on every run, and capped at the number of available
# rows because sample() raises when n exceeds the frame length.
df_sample = df_troncate.sample(n=min(100000, len(df_troncate)), random_state=42)

def plot_k_distance(data, k=4):
    """
    Draw the sorted k-distance curve used to pick an eps value for DBSCAN.

    For every point, the distance found in column k-1 of the k-nearest-neighbour
    result is collected; the distances are then sorted in ascending order and
    plotted. The neighbours are searched among the same points that are
    queried, so each point presumably counts as its own first neighbour
    (distance 0) -- verify against the NearestNeighbors documentation.

    Parameters:
        data (pd.DataFrame): The dataset containing " lat" and " long" columns.
        k (int): The number of nearest neighbors to consider.
    """
    points = data[[" lat", " long"]].values

    # Fit and query on the same set of points
    knn = NearestNeighbors(n_neighbors=k).fit(points)
    neighbor_distances, _ = knn.kneighbors(points)

    # Last column = farthest of the k neighbours returned for each point
    kth_sorted = np.sort(neighbor_distances[:, k - 1], axis=0)

    plt.figure(figsize=(10, 6))
    plt.plot(kth_sorted)
    plt.title(f"K-Distance Graph (k={k})")
    plt.xlabel("Points sorted by distance")
    plt.ylabel(f"{k}-th nearest neighbor distance")
    plt.grid()
    plt.show()

# Example usage: k-distance curve of the sampled points for k=4
plot_k_distance(df_sample, k=4)

In [None]:
from sklearn.cluster import DBSCAN

def experiment_with_min_samples(data, eps, min_samples_list):
    """
    Run DBSCAN once per candidate min_samples value and plot each result.

    For every value, the number of clusters and of noise points (label -1) is
    printed, and a scatter plot (longitude on x, latitude on y) coloured by
    label is shown. `data` is not modified.

    Parameters:
        data (pd.DataFrame): The dataset containing " lat" and " long" columns.
        eps (float): The epsilon value for DBSCAN.
        min_samples_list (list): A list of min_samples values to try.
    """
    points = data[[" lat", " long"]].values
    latitudes = points[:, 0]
    longitudes = points[:, 1]

    for candidate in min_samples_list:
        labels = DBSCAN(eps=eps, min_samples=candidate).fit_predict(points)

        # Real clusters are the distinct labels other than the noise marker
        cluster_ids = set(labels)
        cluster_ids.discard(-1)
        n_clusters = len(cluster_ids)
        n_noise = sum(1 for label in labels if label == -1)

        print(f"min_samples={candidate}: {n_clusters} clusters, {n_noise} noise points")

        plt.figure(figsize=(10, 6))
        plt.scatter(longitudes, latitudes, c=labels, cmap="tab20", s=20)
        plt.title(f"DBSCAN Clustering (eps={eps}, min_samples={candidate})")
        plt.xlabel("Longitude")
        plt.ylabel("Latitude")
        plt.grid()
        plt.show()

# Example usage: compare several min_samples values at a fixed eps
eps = 0.0008  # Example value from the k-distance plot
min_samples_list = [4, 5, 6]  # Try different values
experiment_with_min_samples(df_sample, eps, min_samples_list)

In [None]:
def apply_dbscan(data, eps, min_samples):
    """
    Cluster `data` with DBSCAN on its " lat" / " long" columns.

    The labels are stored in a "cluster" column of `data` itself (the input is
    modified), and the number of clusters and of noise points (label -1) is
    printed.

    Parameters:
        data (pd.DataFrame): The dataset containing " lat" and " long" columns.
        eps (float): The epsilon value for DBSCAN.
        min_samples (int): The minimum number of samples in a neighborhood.

    Returns:
        pd.DataFrame: The same `data` object, with the "cluster" column set.
    """
    points = data[[" lat", " long"]].values
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(points)

    # Attach the labels to the caller's DataFrame
    data["cluster"] = labels

    # Real clusters are the distinct labels other than the noise marker (-1)
    cluster_ids = set(labels)
    cluster_ids.discard(-1)
    n_clusters = len(cluster_ids)
    n_noise = sum(1 for label in labels if label == -1)

    print(f"DBSCAN results: {n_clusters} clusters, {n_noise} noise points")

    return data

# Example usage: cluster the sampled points (adds a "cluster" column to df_sample)
eps = 0.0011  # Optimal eps from the k-distance plot
min_samples = 4  # Optimal min_samples from experimentation
clustered_data = apply_dbscan(df_sample, eps, min_samples)

def remove_noise(data):
    """
    Drop the noise points (cluster label -1) from the dataset.

    The number of remaining rows is printed. The input frame is not modified.

    Parameters:
        data (pd.DataFrame): The dataset containing a "cluster" column.

    Returns:
        pd.DataFrame: The rows of `data` whose "cluster" value is not -1.
    """
    keep_mask = data["cluster"].ne(-1)
    filtered_data = data.loc[keep_mask]
    print(f"Data after removing noise: {len(filtered_data)} rows.")
    return filtered_data

def subdivide_largest_cluster(data, eps, min_samples):
    """
    Subdivide the largest cluster with a second DBSCAN pass (one pass per call).

    The largest non-noise cluster is passed to `perform_dbscan_analysis` with
    the given parameters. Its points receive new labels shifted above the
    current maximum label so they cannot collide with existing clusters; points
    that the second pass marks as noise get -1.

    The refined labels replace the old ones row by row. No rows are appended:
    the earlier version concatenated the refined rows to the frame, so every
    point of the subdivided cluster appeared twice (once as noise, once
    refined).

    Parameters:
        data (pd.DataFrame): The dataset containing " lat", " long" and "cluster" columns.
        eps (float): The epsilon value for DBSCAN.
        min_samples (int): The minimum number of samples in a neighborhood.

    Returns:
        pd.DataFrame: A new frame with the same rows and updated "cluster"
        labels; `data` itself is left untouched. If there is no non-noise
        cluster, `data` is returned unchanged.
    """
    # Find the largest cluster (excluding noise)
    non_noise_labels = data.loc[data["cluster"] != -1, "cluster"]
    if non_noise_labels.empty:
        # Nothing to subdivide; idxmax() on an empty Series would raise
        print("No cluster to subdivide: every point is labelled as noise.")
        return data

    largest_cluster = non_noise_labels.value_counts().idxmax()
    print(f"Subdividing cluster {largest_cluster}...")

    in_largest = data["cluster"] == largest_cluster

    # Apply DBSCAN to a copy of the largest cluster's rows
    refined_cluster_data = perform_dbscan_analysis(
        data[in_largest].copy(), eps=eps, min_samples=min_samples
    )

    # Shift the new labels above the existing ones; noise (-1) stays -1
    max_cluster_id = data["cluster"].max()
    refined_labels = refined_cluster_data["cluster"]
    refined_labels = refined_labels.where(
        refined_labels == -1, refined_labels + max_cluster_id + 1
    )

    # Positional write-back: the refined frame has the same rows, in the same
    # order, as the masked selection, so this is safe even with repeated index
    # labels.
    result = data.copy()
    result.loc[in_largest, "cluster"] = refined_labels.to_numpy()

    return result

# Example usage: subdivide the largest cluster repeatedly, with a tighter eps
# for the first two passes and a constant eps for the nine passes that follow.
SUBDIVISION_EPS_SCHEDULE = [0.0006, 0.0003] + [0.0001] * 9

for subdivision_eps in SUBDIVISION_EPS_SCHEDULE:
    clustered_data = subdivide_largest_cluster(
        clustered_data, eps=subdivision_eps, min_samples=4
    )

# Keep only the points that still belong to a cluster
clustered_data_no_noise = remove_noise(clustered_data)

In [None]:
import folium

def plot_clusters_on_map(data):
    """
    Plot the clusters on a Folium map.

    One circle marker is added per row. The colour is picked from a fixed
    palette by cluster label (modulo the palette size); the label -1 is drawn
    in black.

    Parameters:
        data (pd.DataFrame): The dataset containing " lat", " long" and "cluster" columns.

    Returns:
        folium.Map: The map with all markers added.
    """
    # Create a map centered on Lyon
    lyon_map = folium.Map(location=[45.7640, 4.8357], zoom_start=13)

    # Define a color map for clusters
    colors = [
        "red", "blue", "green", "purple", "orange", "darkred", "lightblue",
        "darkgreen", "darkblue", "pink", "lightgreen", "gray", "black"
    ]

    # Iterate over the three columns directly instead of iterrows(): iterrows()
    # builds one Series per row, which upcasts an all-numeric row to float, so
    # the cluster label would arrive as e.g. 3.0 and fail as a list index.
    for lat, lon, raw_label in zip(data[" lat"], data[" long"], data["cluster"]):
        cluster = int(raw_label)  # labels must be integers to index the palette
        color = colors[cluster % len(colors)] if cluster != -1 else "black"
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            color=color,
            fill=True,
            fill_color=color,
            popup=f"Cluster {cluster}"
        ).add_to(lyon_map)

    return lyon_map

# Example usage: map of the clustered points, noise excluded
lyon_map = plot_clusters_on_map(clustered_data_no_noise)
lyon_map  # Display the map in a Jupyter notebook

In [None]:
# Seeded 1000-row sample of the de-duplicated data
sampled_data = data_sans_doublons.sample(1000, random_state=42)
# New DataFrame with only latitude and longitude. The explicit copy makes it
# independent of the sample, so adding a 'Cluster' column later does not
# trigger SettingWithCopyWarning.
sampled_data = sampled_data[[' lat', ' long']].copy()

from sklearn.preprocessing import StandardScaler
# Standardise the two coordinate columns; `scaler` is kept so cluster centres
# can be mapped back to the original units later.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(sampled_data)

In [None]:
# Elbow method: fit KMeans for each k in the range and record its inertia
k_range = range(30, 200)
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_range, inertia, marker='o')
ax.set_title("Elbow Method")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Inertia")
ax.grid(True)
plt.show()

In [None]:
# Fit KMeans with the chosen number of clusters (60 here)
kmeans = KMeans(n_clusters=60, random_state=42).fit(scaled_data)

# Attach the cluster labels to the (unscaled) sampled data
sampled_data['Cluster'] = kmeans.labels_

# Cluster centres mapped back from the standardised space to the original units
centroids = scaler.inverse_transform(kmeans.cluster_centers_)

# Points coloured by cluster (latitude on x, longitude on y), centres in red
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    sampled_data[' lat'], sampled_data[' long'],
    c=sampled_data['Cluster'], cmap='viridis', s=50
)
ax.scatter(centroids[:, 0], centroids[:, 1], s=300, c='red', label='Centroids')
ax.set_title("KMeans Clustering")
ax.set_xlabel("Latitude")
ax.set_ylabel("Longitude")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

In [None]:
# Compute the linkage matrix of the standardised sample (complete linkage)
linkage_matrix = linkage(scaled_data, method='complete')  # 'ward', 'single', 'complete', etc.

# Plot the dendrogram of the resulting hierarchy
plt.figure(figsize=(10, 7))
dendrogram(linkage_matrix)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()


In [None]:
# Maximum number of flat clusters to cut the hierarchy into
num_clusters = 170

# Assign flat cluster labels ('maxclust' treats num_clusters as an upper bound)
cluster_labels = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

# Store the labels in the DataFrame (overwrites any existing 'Cluster' column)
sampled_data['Cluster'] = cluster_labels


In [None]:
# Scatter plot of the sample coloured by hierarchical cluster label
# (latitude on x, longitude on y)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    sampled_data[' lat'], sampled_data[' long'],
    c=sampled_data['Cluster'], cmap='viridis', s=50
)
ax.set_title("Hierarchical Clustering")
ax.set_xlabel("Latitude")
ax.set_ylabel("Longitude")
ax.grid(True)
plt.show()
