In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from random import sample
from numpy.random import uniform
from math import isnan
from sklearn.preprocessing import scale
import geopandas as gpd
import contextily as ctx 
from sklearnex import patch_sklearn
patch_sklearn()


from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

In [None]:
# Run configuration. `place` names the ../output/<place>/ folder that the cells
# below read from and write to.
place = "test"
country = "UK"

# Presumably the centre point and search radius of the study area; none of these
# are referenced again in the cells of this notebook — TODO confirm they are needed.
lat, lng = 55.86421405612109, -4.251846930489373
radius = 1

# Presumably EPSG codes (geographic and projected) — TODO confirm.
crs = 4326
local_crs = 27700

In [None]:
# Read the grid for the configured `place` from its parquet file.
grid = gpd.read_parquet(f"../output/{place}/p5-grid-output.pq")

In [None]:
# Index the grid by its (row, col) cell coordinates.
# Note: re-running this cell without reloading `grid` fails, because the two
# columns have already been moved into the index.
grid = grid.set_index(["row", "col"])

In [None]:
# Geometry-only frame, still indexed by (row, col); merged back onto the PCA
# scores further down.
geo_col = grid[['geometry']]

In [None]:
# Display the geometry-only frame for a visual check.
geo_col

In [None]:
# Names of all grid columns as a plain list (the index levels are not included).
colsubset = grid.columns.tolist()

In [None]:
# Summary statistics of the grid columns.
grid.describe()

In [None]:
# Replace missing values with 0, then keep only the cells whose
# 'building_count_x' is greater than 2.
grid_dropped = grid.fillna(0).loc[lambda cells: cells['building_count_x'] > 2]

In [None]:
# Name fragments that identify the columns to discard
substrings = ['index', 'bID']

# Collect every column whose name contains at least one of those fragments
columns_to_drop = [
    column_name
    for column_name in grid_dropped.columns
    if any(fragment in column_name for fragment in substrings)
]

# Remove them, rebinding grid_dropped to the reduced frame
grid_dropped = grid_dropped.drop(columns=columns_to_drop)

# Show which columns are left
print(grid_dropped.columns.tolist())

In [None]:
# Keep an independent snapshot of the filtered grid. A plain assignment would only
# create a second name for the same object, so the "index" column that is added to
# grid_dropped in place in the next cell would show up in the snapshot as well.
grid_dropped_old = grid_dropped.copy()

In [None]:
# Store the current index labels in an ordinary column called "index".
# This modifies the frame in place, so every other name bound to the same object
# sees the new column too.
grid_dropped["index"] = grid_dropped.index

In [None]:
# Keep a handle on the frame that still carries "geometry" and "index" ...
grid_dropped_geo = grid_dropped

# ... and rebind grid_dropped to a copy without those two columns, with the index
# levels moved back into ordinary columns and a fresh default index.
# NOTE(review): after reset_index() the former index levels (row, col) are regular
# columns, so they are standardised and fed to the PCA along with the other
# columns — confirm that is intended.
non_feature_columns = ["geometry", "index"]
grid_dropped = grid_dropped_geo.drop(columns=non_feature_columns).reset_index()

In [None]:
# Exploratory PCA: no n_components is given, the randomized SVD solver is used,
# and random_state is fixed so repeated runs give the same result.
pca = PCA(svd_solver='randomized', random_state=50)


In [None]:
# Standardise every column of grid_dropped: learn the scaling parameters first,
# then apply them to the same data.
scaler = StandardScaler().fit(grid_dropped)
tessellation_scaled = scaler.transform(grid_dropped)

In [None]:
# Fit the exploratory PCA on the standardised matrix.
pca.fit(tessellation_scaled)

In [None]:
# Column labels of the matrix that was scaled and decomposed; reused later as the
# column names of the loadings table.
features = grid_dropped.columns

In [None]:
# Component matrix of the fitted PCA (one row per component).
pca.components_

In [None]:
# Share of the total variance explained by each component, in component order.

pca.explained_variance_ratio_

In [None]:
# Bar chart of the explained-variance ratio, one bar per component (numbered from 1).
variance_ratio = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratio) + 1)

plt.figure(figsize=(10, 5))
ax = plt.bar(component_numbers, variance_ratio)  # note: `ax` holds the bar container
plt.xlabel("PCA Components", fontweight='bold')
plt.ylabel("Variance Ratio", fontweight='bold')

plt.show()

In [None]:
# Running total of the explained-variance ratios, in component order
cumulative_sum = np.cumsum(pca.explained_variance_ratio_)

# Names of the columns that went into the PCA
org_col = grid_dropped.columns.tolist()

# Smallest number of leading components whose cumulative ratio reaches 0.8:
# argmax returns the position of the first True, and +1 turns it into a count.
num_pc = np.argmax(cumulative_sum >= 0.8) + 1

# One 'Attribute' column plus one column of loadings per retained component
pc_dict = {
    'Attribute': org_col,
    **{f'PC_{number}': pca.components_[number - 1] for number in range(1, num_pc + 1)},
}

attributes_pca = pd.DataFrame(pc_dict)

In [None]:
# Scree plot: cumulative explained variance against the number of components.

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..N so that the x axis really is "number of components".
# Without explicit x values the curve would be drawn at 0..N-1 and the marker at
# x=num_pc (a 1-based count) would sit one component to the right of its point.
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig = plt.figure(figsize = (12,5))
plt.plot(component_counts, cumulative_variance)
# Dashed red line at the number of components that was retained (num_pc).
plt.vlines(x=num_pc, ymax=1, ymin=0, colors="r", linestyles="--")
plt.xlabel('Number of PCA components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

In [None]:
# Loadings of the retained components, one row per input column.
attributes_pca

In [None]:
# Second decomposition, this time with IncrementalPCA and limited to the num_pc
# components selected from the cumulative explained variance.

inc_pca = IncrementalPCA(n_components=num_pc)

In [None]:
# Fit the incremental PCA on the standardised matrix and project the same matrix
# onto the retained components. The result is displayed as the cell output.

df_inc_pca = inc_pca.fit_transform(tessellation_scaled)
df_inc_pca

In [None]:
# Loadings table of the incremental PCA: one row per retained component (default
# integer row labels starting at 0), one column per input feature.
components_df = pd.DataFrame(inc_pca.components_, columns=features)  # pass index=... here if labelled rows are wanted

In [None]:
# Build a plain-text summary of the strongest loadings per component, print it,
# and save it to ../output/<place>/PC_summary.txt.

# How many of the top features to list for each component
number_of_features_to_describe = 15

# The text is collected piece by piece and joined once at the end
summary_parts = []

for component in components_df.index:
    summary_parts.append(f"Describing {component}:\n")

    # Rank features by the magnitude of their loading; the sign is discarded,
    # so the values reported below are absolute loadings.
    ranked_loadings = components_df.loc[component].abs().sort_values(ascending=False)
    strongest = ranked_loadings.head(number_of_features_to_describe)

    for feature, contribution in strongest.items():
        summary_parts.append(f" - {feature} with a loading of {contribution:.2f}\n")

    summary_parts.append("\n")  # blank line between components

output_text = "".join(summary_parts)

print(output_text)
# Writing the output to a text file
with open(f"../output/{place}/PC_summary.txt", "w") as file:
    file.write(output_text)


In [None]:
import pickle

# Persist the loadings table. Note that the file holds the components_df DataFrame,
# not the fitted PCA estimator, despite the "pca.pickle" name.
with open(f"../output/{place}/pca.pickle", 'wb') as f:
    pickle.dump(components_df, f)

In [None]:
# Wrap the projected scores in a DataFrame with one "PC_<n>" column per
# retained component, numbered from 1.
pc_labels = [f"PC_{number}" for number in range(1, num_pc + 1)]
df_pca = pd.DataFrame(df_inc_pca, columns=pc_labels)

In [None]:
# Copy the cell coordinates from grid_dropped onto the score table.
# The assignment aligns on index labels.
for grid_key in ("row", "col"):
    df_pca[grid_key] = grid_dropped[grid_key]

In [None]:
# Display the feature table for a visual check.
grid_dropped

In [None]:
# Give df_pca the same index as grid_dropped, but only when both have the same
# number of rows; otherwise report the mismatch and leave df_pca unchanged.
if len(grid_dropped) != len(df_pca):
    print("Error: The number of rows in grid_dropped and df_pca do not match.")
else:
    df_pca.index = grid_dropped.index

In [None]:
# Preview of the geometry frame with row/col as ordinary columns (not stored).
geo_col.reset_index()

In [None]:
# Attach the PC scores to the cell geometries by (row, col). A left join keeps
# every grid cell; cells without scores get missing values.
cell_geometries = geo_col.reset_index()
df_pca_geometry = cell_geometries.merge(df_pca, how="left", on=["row", "col"])

In [None]:
# Drop every row with a missing value in any column. After the left join above,
# that removes the grid cells that were filtered out before the PCA and therefore
# have no scores.
df_pca_geometry = df_pca_geometry.dropna()

In [None]:
# Display the joined geometry + scores table.
df_pca_geometry

In [None]:
# Rebind df_pca to the score columns only, taken from the joined table, so it
# now carries that table's row labels rather than its previous index.
df_pca = df_pca_geometry.drop(columns=["row", "col", "geometry"])

In [None]:
# Every column except the geometry gets its own map.
# NOTE(review): this still includes "row" and "col" next to the PC_* columns —
# confirm that mapping the grid coordinates is intended.
columns_to_plot = df_pca_geometry.columns.drop('geometry')

In [None]:
# Draw one choropleth map per entry of columns_to_plot on a grid of subplots.

# Number of maps per row
n_cols = 2  # You can adjust this based on your preference
# Ceiling division, so a partly filled last row still gets its axes for any
# n_cols (the former "(len + 1) // n_cols" was only correct for n_cols == 2).
# max(1, ...) keeps plt.subplots valid when there is nothing to plot.
n_rows = max(1, -(-len(columns_to_plot) // n_cols))

# squeeze=False always returns a 2-D array of Axes, so flatten() gives a flat
# sequence of Axes for single-row and single-column layouts as well.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
axes = axes.flatten()

# Drop incomplete rows once instead of on every loop iteration
plot_frame = df_pca_geometry.dropna()

# Loop through the columns and create a plot for each
for i, column in enumerate(columns_to_plot):
    # Plot with translucent colors so the basemap stays visible
    plot_frame.plot(column=column, scheme="natural_breaks", ax=axes[i], legend=True, alpha=0.5)

    # Add Contextily basemap
    ctx.add_basemap(axes[i], crs=df_pca_geometry.crs.to_string())

    axes[i].set_title(column)
    axes[i].set_axis_off()

# Remove any unused subplots; counting from the number of maps (rather than the
# loop variable) also works when columns_to_plot is empty.
for j in range(len(columns_to_plot), len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

In [None]:
# Display the table that is about to be written to disk.
df_pca_geometry

In [None]:
# Save the geometry + scores table as parquet in the output folder for `place`.
df_pca_geometry.to_parquet(f"../output/{place}/df_pca_geom.pq")

In [None]:
import pickle  # also imported in an earlier cell; repeated here

# Store the same geometry + scores table a second time, as a pickle file.
with open(f"../output/{place}/df_pca_geometry.pickle", 'wb') as f:
    pickle.dump(df_pca_geometry, f)

In [None]:
# Independent copy of the score table, so adding an ID column below does not
# modify df_pca itself.
df_pca_with_uID = df_pca.copy()

In [None]:
# String IDs taken from grid_dropped's index labels, assigned by position.
# This requires grid_dropped and df_pca_with_uID to have the same number of rows.
df_pca_with_uID["uID"] = list(map(str, grid_dropped.index))

In [None]:
# # Calculating Hopkins score to know whether the data is good for clustering or not.

# def hopkins(X):
#     d = X.shape[1]
#     n = len(X)
#     m = int(0.1 * n) 
#     nbrs = NearestNeighbors(n_neighbors=1).fit(X.values)
 
#     rand_X = sample(range(0, n, 1), m)
 
#     ujd = []
#     wjd = []
#     for j in range(0, m):
#         u_dist, _ = nbrs.kneighbors(uniform(np.amin(X,axis=0),np.amax(X,axis=0),d).reshape(1, -1), 2, return_distance=True)
#         ujd.append(u_dist[0][1])
#         w_dist, _ = nbrs.kneighbors(X.iloc[rand_X[j]].values.reshape(1, -1), 2, return_distance=True)
#         wjd.append(w_dist[0][1])
 
#     HS = sum(ujd) / (sum(ujd) + sum(wjd))
#     if isnan(HS):
#         print(ujd, wjd)
#         HS = 0
 
#     return HS


In [None]:
# # Hopkins score
# Hopkins_score=round(hopkins(df_pca),2)

In [None]:
# print(Hopkins_score)

The Hopkins statistic (introduced by Brian Hopkins and John Gordon Skellam) is a way of measuring the cluster tendency of a data set. It belongs to the family of sparse sampling tests. It acts as a statistical hypothesis test in which the null hypothesis is that the data are generated by a Poisson point process and are thus uniformly randomly distributed. A value close to 1 tends to indicate that the data are highly clustered, random data will tend to result in values around 0.5, and uniformly distributed data will tend to result in values close to 0.

In [None]:
# Number of rows in the score table with IDs.
len(df_pca_with_uID)

In [None]:
# Save the score-only table (no row/col/geometry columns) as parquet.
df_pca.to_parquet(f"../output/{place}/df_pca.pq")