# Import Libraries

In [1]:
import rioxarray
import glob
import numpy as np
import tqdm
import json
import pandas as pd
import plotly.express as px

# Load Data

In [2]:
# Find all the Level 4 chips in the specified directory
level_4_chips = glob.glob("/data/projects/punim1932/Data/old/Level4_chipss/*")

# Create a dictionary of yearly chips, where each year is mapped to a list of chips for that year
yearly_chips = {}
for year in range(1988, 2021):
    yearly_chips[year] = [chip for chip in level_4_chips if str(year) in chip]

# Data Wrangling

In [3]:
# Define a function to calculate yearly distributions for given chips
def get_yearly_distributions(yearly_chips, split_percent):
    # Initialize a dictionary to store yearly distributions
    yearly_distributions = {}
    # Iterate over the years in the yearly_chips dictionary
    for year in tqdm.tqdm(yearly_chips.keys()):
        # Create an empty list to store flattened chip data for the current year
        savannah_chips_year = []
        # Iterate over the chips for the current year, taking only a percentage defined by split_percent
        for chip in yearly_chips[year][:int(len(yearly_chips[year]) * split_percent)]:
            # Load the chip data using rioxarray
            chip_data = rioxarray.open_rasterio(chip).load()
            # Flatten the chip data and append it to the savannah_chips_year list
            flattened_chip_data = chip_data.data[0].flatten()
            savannah_chips_year.append(flattened_chip_data)
        # Concatenate the flattened chip data for the current year
        savannah_chips_year = np.concatenate(savannah_chips_year)
        # Calculate the unique values and their counts in the savannah_chips_year array
        unique, counts = np.unique(savannah_chips_year, return_counts=True)
        # Store the unique values and their counts in the yearly_distributions dictionary for the current year
        yearly_distributions[year] = dict(zip(unique, counts))
    # Return the yearly_distributions dictionary
    return yearly_distributions

In [None]:
# Call the get_yearly_distributions function for the yearly_chips dictionary with a split_percent of 0.5 to get two sets of yearly distributions
yearly_distributions = get_yearly_distributions(yearly_chips, 0.5)
yearly_distributions_2 = get_yearly_distributions(yearly_chips, 0.5)

In [None]:
# Combine the yearly distributions from both sets using a dictionary comprehension
yearly_distribution_all = {}
for year in yearly_distributions:
    year_1 = yearly_distributions[year]
    year_2 = yearly_distributions_2[year]
    yearly_distribution_all[int(year)] = {int(k): year_1.get(k, 0) + year_2.get(k, 0) for k in set(year_1) | set(year_2)}

In [54]:
with open("/data/projects/punim1932/Data/old/yearly_L4_data.json") as json_file: 
    yearly_distribution_all=json.load(json_file)    

In [55]:
# Initialize a dictionary to store yearly distribution percentages
yearly_distributions_precentage = {}

# Loop over the yearly distribution data
for year in yearly_distribution_all:
    # Calculate the sum of all values for the year
    yearly_values = sum(yearly_distribution_all[year].values())
    
    # Initialize a dictionary to store the percentage distribution for the year
    year_percent_dict = {}
    
    # Loop over the TIF IDs for the year and calculate their percentage distribution
    for tif_id in yearly_distribution_all[year]:
        year_percent_dict[tif_id] = (yearly_distribution_all[year][tif_id] / yearly_values) * 100
    
    # Add the percentage distribution data for the year to the yearly_distributions_precentage dictionary
    yearly_distributions_precentage[year] = year_percent_dict

In [56]:
df_hierarchy=pd.read_excel("/data/projects/punim1932/Data/old/DEALandCover_to_LCNS.xlsx",header=1)
df_hierarchy.head()

# Read the Excel file into a DataFrame
df_hierarchy = pd.read_excel("/data/projects/punim1932/Data/old/DEALandCover_to_LCNS.xlsx", header=1)

# Display the first few rows of the DataFrame
df_hierarchy.head()

# Create a dictionary to store hierarchical categories
hierarchical_categories = {}

# Iterate over the columns of the DataFrame and populate the hierarchical_categories dictionary
for id, label, category in zip(df_hierarchy["level4"], df_hierarchy["level4_label"], df_hierarchy["LCNS_label"]):
    # Check if the category is already present in the dictionary
    if category not in hierarchical_categories:
        hierarchical_categories[category] = {label.replace('\xa0', ''): int(id.replace('\xa0', ''))}
    else:
        hierarchical_categories[category][label.replace('\xa0', '')] = int(id.replace('\xa0', ''))

# Convert the dictionary to a pandas DataFrame
df = pd.DataFrame.from_dict(yearly_distributions_precentage, orient='columns')

# Transpose the dataframe to make the years as rows and tif_ids as columns
df = df.transpose()

# Reset the index of the DataFrame and rename the column
df = df.reset_index().rename(columns={"index": "year"})

# Convert the 'year' column to integers
df['year'] = [int(year) for year in df['year']]

# Create a list to store yearly hierarchical distribution
yearly_hierarchical_distribution = []

# Iterate over the records in the DataFrame and calculate the yearly hierarchical distribution
for record in df.to_dict("records"):
    yearly_info = {
        "year": record['year']
    }
    # Calculate the sum for each category using hierarchical_categories
    for category in hierarchical_categories:
        yearly_info[category] = sum([record[str(tif_id)] for tif_id in hierarchical_categories[category].values() if str(tif_id) in record])
    # Add the yearly information to the list
    yearly_hierarchical_distribution.append(yearly_info)

# Create a new DataFrame from the yearly_hierarchical_distribution list
df_hierarchical = pd.DataFrame(yearly_hierarchical_distribution)

In [57]:
df_hierarchical

Unnamed: 0,year,No data,Cultivated closed,Cultivated open 40,Cultivated open 15,Cultivated sparse,Woody closed,Woody open 40,Woody open 15,Woody sparse,...,Herbaceous sparse,Aquatic Woody closed,Aquatic Woody open,Aquatic Woody sparse to open,Aquatic Herbaceous,Artificial Surface,Natural Bare ground,Natural Herbaceous Sparse,Water,Water 1 to 6 months
0,1988,2.002524,0.031815,0.062493,0.246725,1.828195,1.2903,2.423721,0.445345,0.034386,...,17.752641,0.152214,0.065424,0.011332,0.017801,0.001203,8.372029,14.521,18.610318,0.10366
1,1989,1.855639,0.134067,0.128025,0.37616,1.294546,1.693857,3.807317,0.504266,0.023868,...,12.913344,0.214767,0.080424,0.012793,0.024571,0.002658,6.286643,15.253244,17.925702,0.134851
2,1990,1.356222,0.089303,0.156074,0.504917,1.633603,1.113932,3.484339,0.52976,0.043429,...,16.158206,0.147932,0.091496,0.017402,0.032607,0.005079,6.867545,13.454941,21.96934,0.113495
3,1991,1.340865,0.128487,0.153923,0.382802,0.981957,0.838382,2.1688,0.407299,0.017564,...,16.535605,0.122373,0.07789,0.013079,,0.001974,5.710274,10.944512,19.930142,0.169515
4,1992,1.889962,0.057561,0.100588,0.289753,2.094129,0.456015,2.256965,0.416815,0.028158,...,23.737013,0.117369,0.073958,0.011098,0.027188,0.003677,8.209308,11.127347,18.433158,0.110291
5,1993,1.477504,0.179336,0.14907,0.464813,2.032463,1.162122,2.692603,0.459562,0.058821,...,17.441577,0.204038,0.094557,0.008139,0.020146,0.003069,4.905016,12.056943,18.498944,0.191315
6,1994,1.641618,0.026347,0.112668,0.331639,1.094173,0.805784,3.416197,0.527827,0.066437,...,22.814125,0.153096,0.104945,0.014557,0.029597,0.001509,4.825469,10.167549,18.10187,0.108507
7,1995,2.004522,0.108683,0.121416,0.429155,2.158861,1.738593,4.531802,0.508798,0.064722,...,18.524943,0.219818,0.102418,0.019612,0.02322,0.001654,4.562121,10.582191,19.871891,0.149742
8,1996,1.582654,0.076038,0.094075,0.229808,2.091378,1.687713,3.637183,0.724382,0.11311,...,17.49314,0.19545,0.09758,0.020823,0.028138,0.00266,5.955617,10.961522,21.13376,0.114976
9,1997,1.773305,0.104563,0.073252,0.451019,1.814915,1.028511,3.427713,0.717055,0.078947,...,16.616751,0.171841,0.076876,0.011996,0.021458,0.001579,2.769683,11.097206,17.684972,0.134996


In [58]:
def fit_linear_model(df, category):
    # Create a scatter plot with a lowess trendline using Plotly Express
    fig = px.scatter(df, x="year", y=category, trendline="lowess")
    fig.show()

    # Create a scatter plot with a lowess trendline and customized options using Plotly Express
    fig = px.scatter(df, x="year", y=category, trendline="lowess", trendline_options=dict(frac=0.15))
    fig.show()


In [59]:
categories=hierarchical_categories.keys()
for category in categories:
    print(category)
    fit_linear_model(df_hierarchical,category)
    print()

No data



Cultivated closed



Cultivated open 40



Cultivated open 15



Cultivated sparse



Woody closed



Woody open 40



Woody open 15



Woody sparse



Herbaceous closed



Herbaceous open 40



Herbaceous open 15



Herbaceous sparse



Aquatic Woody closed



Aquatic Woody open



Aquatic Woody sparse to open



Aquatic Herbaceous



Artificial Surface



Natural Bare ground



Natural Herbaceous Sparse



Water



Water 1 to 6 months



