In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import ShallowLearn.ImageHelper as ih

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import os

In [None]:
# Directory holding the image files. The trailing "/" matters because later cells
# build file paths by plain string concatenation (path + file name).
path = "/media/ziad/Expansion/Cleaned_Data_Directory/"

In [None]:
# NOTE(review): os.listdir returns entries in arbitrary order, so positional lookups
# further down (a fixed row number, "the second image") depend on that order — confirm.
data_paths = os.listdir(path)

In [None]:
# Keep only the file names that end in ".tiff".
data_paths = [i for i in data_paths if i.endswith(".tiff")]

In [None]:
# One-column DataFrame (default column label 0) with one row per file name.
data_frame = pd.DataFrame(data_paths)

In [None]:
def reshape_dataframe(df):
    """
    Split each string of a one-column DataFrame into an ID and a remainder, then pivot.

    Side effect: two columns are added to ``df`` itself -- 'ID' (the first run of
    four digits found in the string) and 'ColumnName' (the string from character
    index 5 onwards). Later cells rely on these columns being present on the input.

    Parameters:
    - df: Input DataFrame with exactly one column, holding the strings to split

    Returns:
    - The frame obtained by indexing on ('ID', 'ColumnName'), dropping the original
      column, unstacking, dropping the top column level, filling NaNs with '' and
      resetting the index.

    Raises:
    - ValueError: if ``df`` does not have exactly one column.
    """
    # Ensure the DataFrame has only one column of interest
    if df.shape[1] != 1:
        raise ValueError("Input DataFrame should have only one column of interest.")
    
    column_name = df.columns[0]
    
    # Extract the 4-digit identifier and the associated value.
    # The pattern is not anchored, so it matches the first four consecutive digits
    # anywhere in the string; the [5:] slice assumes a 4-character ID plus one separator.
    df['ID'] = df[column_name].str.extract(r'(\d{4})')
    df['ColumnName'] = df[column_name].str[5:]
    
    # Set multi-index and then unstack to reshape the DataFrame.
    # NOTE(review): after set_index and drop there appear to be no value columns left
    # to unstack -- confirm the returned frame contains what is expected.
    df_reshaped = df.set_index(['ID', 'ColumnName']).drop(columns=column_name).unstack()
    
    # Drop top level of multi-index in columns and fill NaNs
    df_reshaped.columns = df_reshaped.columns.droplevel(0)
    df_reshaped.fillna('', inplace=True)

    return df_reshaped.reset_index(drop=True)

In [None]:
# reshape_dataframe adds the 'ID' and 'ColumnName' columns to data_frame in place;
# the reshaped frame it returns is not assigned here, so there is nothing to reassign.
reshape_dataframe(data_frame)

In [None]:
def extract_date_from_string(df, column_name='Image_name'):
    """
    Add a 'Date' column parsed from an eight-digit date stamp inside a string column.

    The stamp is the run of eight digits sitting between an underscore and a
    capital 'T' (for example '_20200102T'). Rows without such a stamp end up with
    a missing date. The input frame is modified in place and is also returned.

    Parameters:
    - df: DataFrame to annotate
    - column_name: Name of the column holding the strings to search

    Returns:
    - The same DataFrame object, now carrying a datetime 'Date' column
    """
    stamp_pattern = r'_(\d{8})T'
    stamp_format = '%Y%m%d'

    # Store the raw eight-digit text first, then overwrite it with parsed timestamps.
    df['Date'] = df[column_name].str.extract(stamp_pattern, expand=False)
    df['Date'] = pd.to_datetime(df['Date'], format=stamp_format)

    return df

In [None]:
# Rename the three columns positionally. This assumes data_frame currently holds the
# original file-name column followed by the 'ID' and 'ColumnName' columns that
# reshape_dataframe added, in that order.
data_frame.columns = ["Path", "ID", "Image_name"]

In [None]:
# Adds the 'Date' column to data_frame in place (default source column: 'Image_name').
extract_date_from_string(data_frame)

In [None]:
# Number of distinct values in the 'Date' column.
len(data_frame.Date.unique())

In [None]:
# Row label of the image to inspect.
# NOTE(review): which file this selects depends on the row order of data_frame — confirm
# that 3500 still points at the intended image.
img_no = 3500
img, meta, bounds = ih.load_img(path + data_frame.Path[img_no], return_meta = True)

In [None]:
img.dtype

In [None]:
# Cast to float32 so NaN can be stored, then replace exact zeros with NaN.
# NOTE(review): presumably zero marks no-data pixels — confirm.
img = img.astype(np.float32)
img = np.where(img == 0, np.nan, img)

In [None]:
rgb_img = ih.plot_rgb(img)


In [None]:
# The bare `rgb_img` expression is not the last statement of the cell, so it is not displayed.
rgb_img
rgb_img = rgb_img.astype(np.float32)
rgb_img = np.where(rgb_img == 0, np.nan, rgb_img)

In [None]:


# The RGB array is divided by 255 before plotting; the file name is passed as the
# third positional argument.
ih.plot_geotiff(rgb_img/255, bounds, data_frame.Path[img_no])

In [None]:
data_frame.Path[img_no]

In [None]:
# Load every image whose ID is '6832', in DataFrame row order.
# NOTE: `img`, `meta` and `bounds` are left holding the values of the LAST image
# loaded; later cells reuse `bounds` when plotting every image of this set.
image_list = []
# The loop variable is called `site_id` (not `id`) so the builtin id() is not
# shadowed in the notebook namespace.
for site_id, image in zip(data_frame.ID, data_frame.Path):
    if site_id == '6832':
        image_path = path + image
        print(image_path)
        img, meta, bounds = ih.load_img(image_path, return_meta = True)
        image_list.append(img)
    

In [None]:
img_arr = np.array(image_list)

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2)

# Flatten each image into one row. The row count is taken from the array itself
# instead of a hard-coded 48, so the cell neither raises nor silently mixes pixels
# of different images when a different number of images has been loaded.
pca_data = pca.fit_transform(img_arr.reshape(len(img_arr), -1))

In [None]:
pca.explained_variance_

In [None]:
plt.scatter(pca_data[:,0], pca_data[:,1])

In [None]:
import ShallowLearn.RadiometricNormalisation as rn

In [None]:
plt.imshow(ih.plot_rgb(img_arr[1]))

In [None]:
import ShallowLearn.Transform as tf

In [None]:
# Compare two normalisation routines against the raw image, one figure per image.
# The second image of the stack is used as the reference for every normalisation.
ref = img_arr[1]
for i in img_arr:
    src = i
    normalised = rn.pca_based_normalization(src, ref)
    # NOTE(review): band_8=7 presumably is the zero-based index of band 8 and
    # threshold=.25 a cut-off used by the filter — confirm against the rn module.
    hist_norm = rn.pca_filter_and_normalize_b8(src, ref, band_8=7, threshold=.25)
    
    # Panels 0-2: RGB renderings; panels 3-5: histograms.
    # `bounds` is whatever was last assigned by an earlier cell, not per-image metadata.
    fig, ax = plt.subplots(1, 6, figsize=(35, 10))
    ih.plot_geotiff(ih.plot_rgb(src), ax=ax[0], bounds=bounds, title="Original")
    ih.plot_geotiff(ih.plot_rgb(normalised), ax=ax[1], bounds=bounds, title="Normalised - PCA - histogram matching")
    ih.plot_geotiff(ih.plot_rgb(hist_norm), ax= ax[2], bounds=bounds, title="Normalised - PCA-filter matching")

    # The "Original hist" panel shows the histogram of tf.LCE_multi(src), not of src itself.
    # The label= arguments have no visible effect because no legend is drawn.
    ax[3].hist(tf.LCE_multi(src).flatten(), bins=100, alpha=0.5, label='src')
    ax[3].set_title("Original hist")
    ax[4].hist(normalised.flatten(), bins=100, alpha=0.5, label='normalised')
    ax[4].set_title("Normalised pca - hist")
    ax[5].hist(hist_norm.flatten(), bins=100, alpha=0.5, label='hist_norm')
    ax[5].set_title("Normalised hist - PCA-filter")
    plt.show()


In [None]:
# Per-image comparison: raw RGB, a 10-cluster KMeans map of the raw bands, the
# PCA-normalised RGB and two histograms. Images that raise are reported and skipped.
ref = img_arr[1]
from sklearn.cluster import KMeans
for i in img_arr:
    try:
        src = i

        normalised = rn.pca_based_normalization(src, ref)
        # Not plotted in this cell, but kept: if this call raises, the image is skipped.
        hist_norm = rn.pca_filter_and_normalize_b8(src, ref, band_8=7, threshold=.25)

        # One row per pixel. The band count is read from the array instead of the
        # hard-coded 13, so images with a different number of bands also work.
        pixels = src.reshape(-1, src.shape[-1])
        kmeans = KMeans(n_clusters=10)
        kmeans.fit(pixels)
        src_kmeans = kmeans.predict(pixels).reshape(src.shape[:2])

        # The figure is created only once the computations have succeeded, so a
        # failed image does not leave an empty figure behind.
        # Six panels are created; the last one (ax[5]) is intentionally left unused.
        fig, ax = plt.subplots(1, 6, figsize=(35, 10))
        ih.plot_geotiff(ih.plot_rgb(src), ax=ax[0], bounds=bounds, title="Original")
        ih.discrete_implotv2(src_kmeans, ax = ax[1])

        ih.plot_geotiff(ih.plot_rgb(normalised), ax= ax[2], bounds=bounds, title="Normalised - hist/pca")

        # The "Original hist" panel shows the histogram of tf.LCE_multi(src).
        ax[3].hist(tf.LCE_multi(src).flatten(), bins=100, alpha=0.5, label='src')
        ax[3].set_title("Original hist")

        ax[4].hist(normalised.flatten(), bins=100, alpha=0.5, label='normalised')
        ax[4].set_title("Normalised pca - hist")

        plt.show()
    except Exception as e:
        # `except Exception` (not a bare except) lets KeyboardInterrupt stop the loop,
        # and the message now says what actually went wrong.
        print("Hist match failed due to error:", e)
    


In [None]:
# Cluster the PCA-normalised bands of every image with KMeans, warm-starting each
# fit from the cluster centres of the previous successfully processed image.
ref = img_arr[1]
from sklearn.cluster import KMeans
cluster_defined = False
for i in img_arr:
    try:
        src = i

        normalised = rn.pca_based_normalization(src, ref)
        # One row per pixel, restricted to the four band indices used for clustering.
        features = normalised[:,:,[3,2,1,7]].reshape(-1, 4)

        if cluster_defined:
            # Warm start from the centres saved after the last SUCCESSFUL image.
            # Reading them from the saved array (rather than from `kmeans`) means a
            # fit that raised earlier cannot leave an unfitted estimator behind and
            # make every following image fail. n_init=1 because an explicit init
            # array allows only a single initialisation.
            kmeans = KMeans(n_clusters=10, init=cluster_centers, n_init=1)
        else:
            kmeans = KMeans(n_clusters=10)
        kmeans.fit(features)

        src_kmeans = kmeans.predict(features).reshape(src.shape[:2])

        # The figure is created only once the computations have succeeded.
        fig, ax = plt.subplots(1, 5, figsize=(38, 10))
        ih.plot_geotiff(ih.plot_rgb(src), ax=ax[0], bounds=bounds, title="Original")
        ih.discrete_implotv2(src_kmeans, ax = ax[1])

        ih.plot_geotiff(ih.plot_rgb(normalised), ax= ax[2], bounds=bounds, title="Normalised - PCA-filter matching")

        # The "Original hist" panel shows the histogram of tf.LCE_multi(src).
        ax[3].hist(tf.LCE_multi(src).flatten(), bins=100, alpha=0.5, label='src')
        ax[3].set_title("Original hist")
        ax[4].hist(normalised.flatten(), bins=100, alpha=0.5, label='normalised')
        ax[4].set_title("Normalised pca - hist")

        cluster_defined = True
        cluster_centers = kmeans.cluster_centers_
        plt.show()
    except Exception as e:
        # `except Exception` (not a bare except) lets KeyboardInterrupt stop the loop,
        # and the message now says what actually went wrong.
        print("Hist match failed due to error:", e)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# 'Date' values of the rows whose ID is '6832', in DataFrame row order.
date_ = data_frame[data_frame.ID == '6832'].Date.values
# process_images_with_kmeans appends to this list: one date per image it processed
# without error.
final_dates = []
def process_images_with_kmeans(img_arr, ref, n_clusters=10, band_indices=(3, 2, 1, 7), random_state=0):
    """
    Cluster every image with KMeans and track how the cluster centres move.

    Each image is normalised against ``ref``, the selected bands are clustered, and
    a five-panel figure is shown. The first successful image uses a fresh KMeans;
    every later one is warm-started from the centres of the previous successful
    image. Images that raise are reported and skipped.

    Uses the notebook globals ``rn``, ``ih``, ``tf``, ``plt``, ``KMeans``, ``bounds``,
    ``date_`` and ``final_dates``. ``final_dates`` is emptied at the start of the call
    and then receives ``date_[k]`` for every image ``k`` that was processed, so it
    always lines up with the returned list of centres.

    Parameters:
    - img_arr: Iterable of images, indexed as (row, column, band)
    - ref: Reference image passed to rn.pca_based_normalization
    - n_clusters: Number of KMeans clusters
    - band_indices: Band indices (last axis) used as clustering features
    - random_state: Seed for the first, randomly initialised KMeans, so that the
      cluster numbering is reproducible between runs

    Returns:
    - cluster_changes: array with one row per successful image after the first,
      holding the distance each centre moved (empty if fewer than two succeeded)
    - cluster_centers_list: list with the centres array of every successful image
    """
    band_indices = list(band_indices)
    # Start from a clean slate so that calling the function again does not
    # accumulate dates from a previous call.
    final_dates.clear()

    previous_centers = None  # centres of the last successfully processed image
    cluster_changes = []  # For tracking changes in cluster centers
    cluster_centers_list = []  # For tracking the cluster centers themselves

    for index, src in enumerate(img_arr):
        fig = None
        try:
            # The date is looked up by the image's own position, so a skipped
            # image cannot shift the dates of the images that follow it.
            image_date = date_[index]

            normalised = rn.pca_based_normalization(src, ref)
            features = normalised[:, :, band_indices].reshape(-1, len(band_indices))

            if previous_centers is None:
                kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
            else:
                # An explicit init array allows only one initialisation.
                kmeans = KMeans(n_clusters=n_clusters, init=previous_centers, n_init=1)
            kmeans.fit(features)
            new_centers = kmeans.cluster_centers_
            src_kmeans = kmeans.predict(features).reshape(src.shape[:2])

            # The figure is created only once the computations have succeeded.
            fig, ax = plt.subplots(1, 5, figsize=(38, 10))
            ih.plot_geotiff(ih.plot_rgb(src), ax=ax[0], bounds=bounds, title="Original")
            ih.discrete_implotv2(src_kmeans, ax=ax[1])
            ih.plot_geotiff(ih.plot_rgb(normalised), ax=ax[2], bounds=bounds, title="Normalised - PCA-filter matching")
            ax[3].hist(tf.LCE_multi(src).flatten(), bins=100, alpha=0.5, label='src')
            ax[3].set_title("Original hist")
            ax[4].hist(normalised.flatten(), bins=100, alpha=0.5, label='normalised')
            ax[4].set_title("Normalised pca - hist")
            plt.show()
        except Exception as e:
            print("Hist match failed due to error:", e)
            continue
        finally:
            # Release the figure whether or not it was shown.
            if fig is not None:
                plt.close(fig)

        # Bookkeeping happens only for images that got through completely.
        if previous_centers is not None:
            # Distance between the centres fitted on this image and those of the
            # previous successful image (i.e. AFTER the refit, not before it).
            cluster_changes.append(np.linalg.norm(new_centers - previous_centers, axis=1))
        previous_centers = new_centers
        cluster_centers_list.append(new_centers)
        final_dates.append(image_date)

    # Plot the changes in cluster centers
    cluster_changes = np.array(cluster_changes)
    if cluster_changes.size:
        for cluster in range(cluster_changes.shape[1]):
            plt.plot(cluster_changes[:, cluster], label=f'Cluster {cluster}')
        plt.xlabel('Iteration')
        plt.ylabel('Change in Cluster Center')
        plt.title('Evolution of Cluster Centers')
        plt.legend()
        plt.show()

    # Plot the cluster centers themselves
    if cluster_centers_list:
        cluster_centers_array = np.array(cluster_centers_list)
        num_features = cluster_centers_array.shape[2]
        for feature in range(num_features):
            for cluster in range(cluster_centers_array.shape[1]):
                plt.plot(cluster_centers_array[:, cluster, feature], label=f'Feature {feature} for Cluster {cluster}')
        plt.xlabel('Iteration')
        plt.ylabel('Cluster Center Value')
        plt.title('Evolution of Cluster Center Values')
        plt.legend()
        plt.show()

    return cluster_changes, cluster_centers_list

# Now you can call this function with your data:


In [None]:
cluster_changes, cluster_centers_list = process_images_with_kmeans(img_arr, ref)

In [None]:
# Re-draw the two summary plots from the arrays returned by process_images_with_kmeans.
# First plot: per-cluster change values, one line per cluster, on an implicitly created figure.
for i in range(cluster_changes.shape[1]):
    plt.plot(cluster_changes[:, i], label=f'Cluster {i}')
plt.xlabel('Iteration')
plt.ylabel('Change in Cluster Center')
plt.title('Evolution of Cluster Centers')
plt.legend()
plt.show()

# Plot the cluster centers themselves
# A 20x20 inch figure is created first; `ax` is not used directly, the plt.* calls
# below draw on it as the current axes. One line per (feature, cluster) pair.
fig, ax = plt.subplots(figsize = (20,20))
cluster_centers_array = np.array(cluster_centers_list)
num_features = cluster_centers_array.shape[2]
for i in range(num_features):
    for j in range(cluster_centers_array.shape[1]):
        plt.plot(cluster_centers_array[:, j, i], label=f'Feature {i} for Cluster {j}')
plt.xlabel('Iteration')
plt.ylabel('Cluster Center Value')
plt.title('Evolution of Cluster Center Values')
plt.legend()
plt.show()

In [None]:
def plot_changes(cluster_changes, cluster_centers_list, clusters_to_plot=(8,), bands=(4, 3, 2, 8)):
    """
    Plot centre movement and centre values for selected clusters.

    Two figures are shown: the per-iteration change of each selected cluster's
    centre, and the value of every feature of each selected cluster's centre.

    Parameters:
    - cluster_changes: 2-D array-like, indexed as [iteration, cluster]
    - cluster_centers_list: 3-D array-like, indexed as [iteration, cluster, feature]
    - clusters_to_plot: Zero-based cluster indices to draw (default: only cluster 8)
    - bands: Band number used in the legend for each feature position; needs at
      least as many entries as there are features
    """
    # Convert lists to numpy arrays for easier manipulation
    cluster_changes = np.array(cluster_changes)
    cluster_centers_list = np.array(cluster_centers_list)

    # Change in cluster centre for each selected cluster (labels are zero-based indices)
    for cluster in clusters_to_plot:
        plt.plot(cluster_changes[:, cluster], label=f'Cluster {cluster}')
    plt.xlabel('Iteration')
    plt.ylabel('Change in Cluster Center')
    plt.title('Evolution of Cluster Centers')
    plt.legend()
    plt.show()

    # Centre value of every feature for each selected cluster
    num_features = cluster_centers_list.shape[2]
    for cluster in clusters_to_plot:
        for feature in range(num_features):
            plt.plot(cluster_centers_list[:, cluster, feature], label=f'Band {bands[feature]} for Cluster {cluster}')
    plt.xlabel('Iteration')
    plt.ylabel('Cluster Center Value')
    plt.title('Evolution of Cluster Center Values')
    plt.legend()
    plt.show()

# Now, just call the function with your data:
plot_changes(cluster_changes, cluster_centers_list)

In [None]:
# 'Date' values of the rows whose ID is '6832' (same expression as in the cell that
# defines process_images_with_kmeans).
date_ = data_frame[data_frame.ID == '6832'].Date.values


In [None]:
# Dates collected by process_images_with_kmeans for the images it processed.
final_dates

In [None]:
# Convert the collected dates to a pandas datetime index for plotting.
dates_2 = pd.to_datetime(final_dates)

In [None]:
dates_2

In [None]:
# The two lengths below need to match for the date-based plots that follow.
len(dates_2)

In [None]:
len(cluster_centers_list)

In [None]:
def plot_changes_with_dates(cluster_changes, cluster_centers_list, dates, clusters_to_plot=(8,)):
    """
    Plot the centre values of selected clusters against dates as lines.

    NOTE: a later cell defines another function with this same name; once that
    cell has run, the name refers to the later definition.

    Parameters:
    - cluster_changes: Accepted for signature compatibility; not used here
    - cluster_centers_list: 3-D array-like, indexed as [iteration, cluster, feature]
    - dates: One date per iteration, used as the x axis
    - clusters_to_plot: Zero-based cluster indices to draw (default: only index 8)

    Raises:
    - ValueError: if ``dates`` and ``cluster_centers_list`` differ in length.
    """
    cluster_centers_list = np.array(cluster_centers_list)

    # There is one set of centres per date, so these two must line up. (The change
    # array is one row shorter, so it is not the right thing to compare against.)
    if len(dates) != len(cluster_centers_list):
        raise ValueError("Length of dates must match number of rows in cluster_centers_list")

    # Centre value of every feature for each selected cluster.
    # Legend labels are one-based (index 8 is shown as "Cluster 9").
    num_features = cluster_centers_list.shape[2]
    for cluster in clusters_to_plot:
        for feature in range(num_features):
            # Use the `dates` argument, not the notebook-level `dates_2` variable.
            plt.plot(dates, cluster_centers_list[:, cluster, feature], label=f'Feature {feature + 1} for Cluster {cluster + 1}')
    plt.xlabel('Date')
    plt.ylabel('Cluster Center Value')
    plt.title('Evolution of Cluster Center Values Over Time')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Now, call the function with your data:
plot_changes_with_dates(cluster_changes, cluster_centers_list, dates_2)

In [None]:
def plot_changes_with_dates(cluster_changes, cluster_centers_list, dates_2, cluster_no=1, bands=(4, 3, 2, 8)):
    """
    Scatter-plot the centre values of one cluster against dates on a large figure.

    Parameters:
    - cluster_changes: Accepted for signature compatibility; not used here
    - cluster_centers_list: 3-D array-like, indexed as [iteration, cluster, feature]
    - dates_2: One date per iteration, used as the x axis
    - cluster_no: Zero-based index of the cluster to draw (default: 1)
    - bands: Band number used in the legend for each feature position; needs at
      least as many entries as there are features
    """
    # New 20x20 inch figure; the plt.* calls below draw on it as the current axes.
    plt.subplots(figsize=(20, 20))
    cluster_centers_list = np.array(cluster_centers_list)

    # One scatter series per feature of the selected cluster's centre
    num_features = cluster_centers_list.shape[2]
    for feature in range(num_features):
        plt.scatter(dates_2, cluster_centers_list[:, cluster_no, feature], label=f'Band {bands[feature]} for Cluster {cluster_no}')
    plt.xlabel('Date')
    plt.ylabel('Cluster Center Value')
    plt.title(f'Evolution of Cluster {cluster_no} Center Values Over Time')
    plt.legend(loc= 'upper right')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Now, call the function with your data:
plot_changes_with_dates(cluster_changes, cluster_centers_list, dates_2)