In [None]:
import os
import pandas as pd
from clustergram import Clustergram
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Run parameters for a small test area.
# NOTE(review): the following cell assigns the same names again, so on a top-to-bottom
# run the later values win — presumably a defaults/override pair; confirm.
place = "test_OS"
country = "UK"

# Presumably the centre point and extent of the study area (radius units not stated — TODO confirm).
lat = 55.86421405612109
lng = -4.251846930489373
radius = 1

# Presumably EPSG codes for the projected and geographic reference systems — TODO confirm.
local_crs = 27700
crs = 4326

In [None]:
# Run parameters for the full Glasgow run; `place` selects the ../output/<place> folder used below.
place = "Glasgow_OS"
country = "UK"

# Presumably the centre point and extent of the study area (radius units not stated — TODO confirm).
lat = 55.86421405612109
lng = -4.251846930489373
radius = 20

# Presumably EPSG codes for the projected and geographic reference systems — TODO confirm.
local_crs = 27700
crs = 4326

In [None]:
# Per-place folder; the percentile chunk files loaded in the next cell are read from here.
output_directory = f"../output/{place}"

In [None]:
# Load the per-chunk percentile tables and stack them into a single frame.
# Chunk files are named percentile_chunk_<i>.pq. Indices 0..n_chunks-1 are probed and
# files that do not exist are skipped, so n_chunks acts as an upper bound on the count.
n_chunks = 84
file_names = [os.path.join(output_directory, f"percentile_chunk_{i}.pq") for i in range(n_chunks)]

dataframes = []
failed_files = []  # chunks that exist on disk but could not be parsed

for file_name in file_names:
    if not os.path.exists(file_name):
        print(f"File not found: {file_name}")
        continue
    try:
        df = pd.read_parquet(file_name)
    except Exception as e:
        # Best effort: keep loading the remaining chunks, but remember the failure so it
        # is reported once the loop is done.
        print(f"Failed to read {file_name}. Error: {e}")
        failed_files.append(file_name)
        continue
    dataframes.append(df)
    print(f"Successfully read {file_name}")

if failed_files:
    # Rows are renumbered by ignore_index=True below, so a dropped chunk shifts the row
    # position of every chunk after it.
    print(f"WARNING: {len(failed_files)} chunk file(s) could not be read and are missing from combined_df: {failed_files}")

if not dataframes:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No readable percentile_chunk_*.pq files found in {output_directory}")

# Concatenate all chunks; the original per-chunk index is discarded.
combined_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Remove the height percentile columns before standardising.
# errors="ignore" keeps the cell re-runnable: combined_df is overwritten here, so on a second
# execution the columns are already gone and a plain drop would raise KeyError.
combined_df = combined_df.drop(columns=["height_25", "height_50", "height_75"], errors="ignore")

In [None]:
# Z-score every column: subtract the column mean, then divide by the column standard deviation.
column_means = combined_df.mean()
column_stds = combined_df.std()
standardized = combined_df.sub(column_means).div(column_stds)
standardized.head()

In [None]:
# Replace missing standardised values with 0.
standardized = standardized.fillna(0)

In [None]:
# Fit one clustering per k in 1..29 on the standardised features.
# random_state is forwarded by Clustergram to the underlying estimator; fixing it makes the
# labels reproducible across kernel restarts, which matters because num_clusters further
# down is a hard-coded k chosen from the results of this fit.
cgram = Clustergram(range(1, 30), random_state=0)
cgram.fit(standardized)

In [None]:
# Display the labels produced by the fit (later selected per k via cgram.labels[num_clusters]).
cgram.labels

In [None]:
# Draw the clustergram for the fitted range of k.
cgram.plot()

In [None]:
# Persist the labels for every fitted k next to the other outputs for this place.
labels_path = f"../output/{place}/cgram_labels.pq"
cgram.labels.to_parquet(labels_path)

In [None]:
# Two stacked panels sharing the k axis: Davies-Bouldin on top, Calinski-Harabasz underneath.
fig, axs = plt.subplots(nrows=2, figsize=(10, 10), sharex=True)
ax_davies_bouldin, ax_calinski_harabasz = axs

k_axis_label = "Number of clusters (k)"

calinski_harabasz = cgram.calinski_harabasz_score()
calinski_harabasz.plot(xlabel=k_axis_label, ylabel="Calinski-Harabasz score", ax=ax_calinski_harabasz)

davies_bouldin = cgram.davies_bouldin_score()
davies_bouldin.plot(xlabel=k_axis_label, ylabel="davies_bouldin_score", ax=ax_davies_bouldin)

sns.despine(offset=10)

In [None]:
# Number of clusters (k) used for the rest of the analysis; it indexes cgram.labels and
# cgram.cluster_centers, so it must be one of the fitted values in range(1, 30).
# NOTE(review): hand-picked constant, presumably read off the plots above — confirm.
num_clusters = 26

In [None]:
# Tessellation table for this place; the visible cells only use its "geometry" column.
tessellation = gpd.read_parquet(f"../output/{place}/tessellation_morphometric_p3.pq")

In [None]:
# Attach each row's label for the chosen k. The raw array is assigned, so the match is by
# row position rather than by index label.
cluster_labels = cgram.labels[num_clusters].to_numpy()
standardized["cluster"] = cluster_labels

In [None]:
# Attach the tessellation geometry to the feature table.
# NOTE(review): assigning a Series aligns on index labels, not on row position. `standardized`
# carries the 0..n-1 index created by the ignore_index=True concat, so `tessellation` is assumed
# to share that index and row order — confirm, otherwise geometries end up missing or misplaced.
standardized["geometry"] = tessellation["geometry"]

In [None]:
# Wrap the feature table in a GeoDataFrame with "geometry" as the active geometry column.
tess_cluster = gpd.GeoDataFrame(standardized, geometry="geometry")

In [None]:
# Collapse every cluster centre for the chosen k to one number: the plain (unweighted)
# mean taken along axis 1 of the centre array.
centres_for_k = cgram.cluster_centers[num_clusters]
reduced_array = np.mean(centres_for_k, axis=1)

# Key each value by its position in the centre array.
weighted_difference_between_clusters = dict(enumerate(reduced_array))

def scale_dict(d, low=-10, high=10):
    """Linearly rescale the values of ``d`` onto the interval ``[low, high]``.

    The smallest value maps to ``low`` and the largest to ``high``; keys and their
    order are preserved.

    Parameters
    ----------
    d : dict
        Mapping of key -> numeric value.
    low, high : float, optional
        Bounds of the target interval. The defaults reproduce the original
        fixed ``[-10, 10]`` scaling.

    Returns
    -------
    dict
        New dictionary with the same keys and rescaled values. An empty input
        yields an empty dict. If all values are equal, min-max normalisation is
        undefined (0/0), so every key is mapped to the midpoint of the interval.
    """
    if not d:
        # np.min / np.max raise on an empty array.
        return {}

    # Extract values and convert them to a numpy array
    values = np.array(list(d.values()))

    lowest = np.min(values)
    span = np.max(values) - lowest

    if span == 0:
        # Degenerate case: avoid dividing by zero and filling the result with NaN.
        scaled_values = np.full(len(values), (low + high) / 2.0)
    else:
        # Normalize values to [0, 1], then stretch/shift to [low, high]
        normalized_values = (values - lowest) / span
        scaled_values = normalized_values * (high - low) + low

    # Create a new dictionary with the scaled values
    return dict(zip(d.keys(), scaled_values))

# Replace the raw per-cluster values with their rescaled counterparts (same keys).
weighted_difference_between_clusters = scale_dict(weighted_difference_between_clusters)


In [None]:
# Order the clusters by their scaled one-dimensional value, lowest first.
sorted_clusters = sorted(weighted_difference_between_clusters.items(), key=lambda x: x[1])

# Map each original cluster label to its 1-based rank in that ordering.
column_mapping = {cluster_id: index + 1 for index, (cluster_id, _) in enumerate(sorted_clusters)}
weighted_difference_between_clusters

In [None]:
# Renumber the clusters: translate the raw label in "cluster" into its rank from column_mapping.
# The source must be "cluster" — "cluster_ID" does not exist before this statement, and
# reading from "cluster" also keeps the cell safe to re-run.
tess_cluster["cluster_ID"] = tess_cluster["cluster"].map(column_mapping)

def rename_dictionary_keys(original_dict, rename_dict):
    """Return a copy of ``original_dict`` with its keys translated through ``rename_dict``.

    Keys that have no entry in ``rename_dict`` are kept unchanged. Items are processed in
    the iteration order of ``original_dict``; if two keys end up with the same new name,
    the later item overwrites the earlier one.
    """
    return {rename_dict.get(old_key, old_key): item for old_key, item in original_dict.items()}

# Re-key the scaled values by the new cluster IDs.
# NOTE(review): the original-label dictionary is overwritten here, so running this cell a
# second time would push the already-renamed keys through column_mapping again.
weighted_difference_between_clusters = rename_dictionary_keys(
    weighted_difference_between_clusters, column_mapping
)
renamed_dict = weighted_difference_between_clusters

# Give every row the scaled value of its (renumbered) cluster.
tess_cluster["one_dimensional_diff_between_clusters"] = tess_cluster["cluster_ID"].map(renamed_dict)

In [None]:
# Inspect the scaled per-cluster value attached to each row.
tess_cluster["one_dimensional_diff_between_clusters"]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import contextily as ctx
import glasbey

# Map of the tessellation cells coloured by raw cluster label, each cell annotated with
# its label, drawn over a contextily basemap. Relies on tess_cluster and num_clusters
# from the cells above.

fig, ax = plt.subplots(figsize=(300, 300))  # very large canvas; adjust the figure size as needed

# One colour per cluster: extend the tab20 palette to num_clusters entries.
palette = glasbey.extend_palette("tab20", palette_size=num_clusters)
cmap = mcolors.ListedColormap(palette)

# Plot 'cluster' as a categorical variable
tess_cluster.plot(column='cluster', categorical=True, ax=ax, legend=True, cmap=cmap)
ax.set_axis_off()

# Label every cell at its centroid. Centroids are computed in one vectorised call and the
# loop walks plain coordinate/label values, instead of materialising a full row per cell
# with iterrows().
cell_centroids = tess_cluster.geometry.centroid
for x, y, label in zip(cell_centroids.x, cell_centroids.y, tess_cluster['cluster']):
    ax.annotate(text=str(label), xy=(x, y),
                ha='center', va='center', fontsize=8, color='black')

# Add a basemap, telling contextily which CRS the plotted data is in.
ctx.add_basemap(ax, crs=tess_cluster.crs.to_string())

plt.show()

In [None]:
# Persist the clustered tessellation; the later cells reload it from this file.
tess_cluster_path = f"../output/{place}/p6_tess_cluster_out.pq"
tess_cluster.to_parquet(tess_cluster_path)

In [None]:
# NOTE(review): patch_sklearn() is called before the sklearn imports below, presumably so the
# patched implementations are the ones imported — confirm. The clustering in the earlier
# cells has already run by the time this cell executes.
from sklearnex import patch_sklearn
patch_sklearn()

import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): StandardScaler is not used in the visible cells; only MinMaxScaler is.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Reload the saved table and strip the geometry and derived columns; the remaining
# columns are whatever else was saved, including the raw "cluster" label.
columns_to_discard = ["cluster_ID", "geometry", "one_dimensional_diff_between_clusters"]
tess_cluster = gpd.read_parquet(f"../output/{place}/p6_tess_cluster_out.pq").drop(columns=columns_to_discard)



In [None]:
# List the columns that remain after the drop in the previous cell.
tess_cluster.columns

In [None]:
import pandas as pd

# For every cluster, find the 10 variables whose cluster mean lies furthest from the
# dataset-wide median and write the per-cluster lists to a text report.

# Every column except the geometry and the cluster label is treated as a variable.
variable_names = tess_cluster.columns[~tess_cluster.columns.isin(['geometry', 'cluster'])]

# Median of each variable across the entire dataset
overall_median = tess_cluster[variable_names].median()

# cluster label -> Series of its 10 largest absolute differences
furthest_from_median = {}

for cluster_id in tess_cluster['cluster'].unique():
    # Mean of each variable within the cluster.
    # NOTE(review): a cluster *mean* is compared against the overall *median* — confirm
    # that this mix is intended.
    cluster_mean = tess_cluster.loc[tess_cluster['cluster'] == cluster_id, variable_names].mean()

    # Absolute difference from the overall median
    difference_from_median = (cluster_mean - overall_median).abs()

    # Keep the 10 largest differences
    top_10_variables = difference_from_median.sort_values(ascending=False).head(10)
    furthest_from_median[cluster_id] = top_10_variables

# (cluster label, top-10 Series) pairs in ascending label order; drives the report below.
sorted_furthest_from_median = sorted(furthest_from_median.items(), key=lambda item: item[0])

with open(f'../output/{place}/cluster_analysis.txt', 'w') as file:
    for key, top_variables in sorted_furthest_from_median:
        file.write(f"Cluster {key}:\n")
        file.write(top_variables.to_string())
        file.write("\n\n")

print("Analysis saved to 'cluster_analysis.txt'")

In [None]:

# Heatmap of the per-cluster mean of every variable, after min-max scaling each variable
# separately within each cluster. Expects tess_cluster to hold the variables plus a
# 'cluster' label column.

# Get variable names from columns
variable_names = tess_cluster.columns[~tess_cluster.columns.isin(['geometry', 'cluster'])]

# Scale cluster by cluster and collect the pieces. The result is stored under its own
# name: overwriting tess_cluster with an empty frame before this loop (as the cell used
# to do) removes the very data the loop reads and fails with KeyError: 'cluster'.
scaled_parts = []
for cluster_id in tess_cluster['cluster'].unique():
    cluster_data = tess_cluster.loc[tess_cluster['cluster'] == cluster_id, variable_names]
    scaler = MinMaxScaler()
    scaled_cluster = scaler.fit_transform(cluster_data)
    scaled_cluster_df = pd.DataFrame(scaled_cluster, columns=variable_names)
    scaled_cluster_df['cluster'] = cluster_id
    scaled_parts.append(scaled_cluster_df)

# Single concat at the end instead of growing a frame inside the loop.
tess_cluster_scaled = pd.concat(scaled_parts, ignore_index=True)

# Create a square heatmap
plt.figure(figsize=(150, 150))
sns.heatmap(tess_cluster_scaled.groupby('cluster')[variable_names].mean().T, cmap='coolwarm', annot=True, fmt=".2f", cbar_kws={'label': 'Scaled Value'}, square=True)
plt.title('Scaled Values of Variables in Each Cluster (Min-Max Scaling)')
plt.show()

In [None]:
# Reload the saved table so the following cells start from the file on disk rather than
# from whatever tess_cluster was left holding by the cells above.
tess_cluster = gpd.read_parquet(f"../output/{place}/p6_tess_cluster_out.pq")

In [None]:
# Drop the geometry and the scaled per-cluster value; "cluster" and "cluster_ID" are not
# dropped here.
tess_cluster = tess_cluster.drop(columns=["one_dimensional_diff_between_clusters", "geometry"])

In [None]:
# Mean feature vector ("centroid") of every renumbered cluster.
# The raw label column "cluster" is removed first: it is an arbitrary identifier, not a
# feature, and averaging it would add it as an extra coordinate to every centroid (and to
# any distance computed between centroids). errors="ignore" tolerates it being absent.
cluster_features = tess_cluster.drop(columns=["cluster"], errors="ignore")
grouped = cluster_features.groupby('cluster_ID')

# Calculate the mean of each group (cluster_ID becomes the index)
centroids = grouped.mean()

print(centroids)

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Single-linkage hierarchical clustering of the cluster centroids (Euclidean distance).
sl_mergings = linkage(centroids, method="single", metric='euclidean')

# Create the dendrogram. Leaves are labelled with the centroid index (the cluster_ID);
# without explicit labels scipy numbers the leaves 0..n-1 by row position, which does
# not match the 1-based cluster IDs.
dendrogram(sl_mergings, labels=centroids.index.to_list())

plt.show()