In [25]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# CONSTANTS
# Human-readable label interpolated into the plot titles.
# Alternative (multi-country) label kept for reference:
# title_master_dataset = "Countries: Belize, El Salvador, Honduras, Nicaragua"
title_master_dataset = "Mexico"
# Dataset names; each one is substituted into the input CSV path when loading.
datasets = ['Mexico']
# Base name (no extension) of the combined CSV written under ./data/soil/.
dataset_name = 'master_dataset_Mexico'
# NOTE(review): not referenced in the cells visible here; presumably the base
# name of a model-ready export produced later — confirm.
model_dataset_name = 'master_dataset_Mexico_model'
# Missing-value threshold, expressed as a fraction of the row count, used by
# the column-dropping step.
threshold_of_missing_values = 0.3 # 30%


# ---- do not modify below this line ----

# Visualise the dataset

In [None]:
# Read one merged CSV per configured dataset name
load_data = []
for dataset in datasets:
    merged_csv_path = f'./outputs/20241216/{dataset}_wosis/{dataset}_wosis_merged.csv'
    df = pd.read_csv(merged_csv_path)
    load_data.append(df)

# Stack the per-dataset frames into a single frame with a fresh 0..n-1 index
master_dataset = pd.concat(load_data, ignore_index=True)

# Persist the combined frame (without the index column)
master_csv_path = f'./data/soil/{dataset_name}.csv'
master_dataset.to_csv(master_csv_path, index=False)

# Preview the first rows as the cell output
master_dataset.head()

# Visualise the missing values

In [None]:
# Drop columns in which more than `threshold_of_missing_values` (30%) of the
# rows are missing.
# `DataFrame.dropna(thresh=...)` is the minimum number of NON-missing values a
# column needs to be kept, so `thresh=threshold * len(df)` would only remove
# columns that are more than 70% empty. Filtering on the per-column missing
# fraction applies the threshold in the intended direction.
missing_fraction = master_dataset.isnull().mean()
# `~(x > t)` instead of `x <= t`: on a frame with no rows the fractions are NaN
# and the columns are then kept rather than all dropped.
# `.to_numpy()` makes the mask positional, so duplicate column labels are safe.
keep_column_mask = ~(missing_fraction > threshold_of_missing_values)
master_dataset = master_dataset.loc[:, keep_column_mask.to_numpy()]

# plot missing values (one heatmap cell per row/column; missing cells stand out)
plt.figure(figsize=(10, 6))
sns.heatmap(master_dataset.isnull(), cbar=False, cmap='viridis')
plt.title(f'Missing values in the {title_master_dataset} dataset')
plt.show()

# Visualise the data distribution

In [None]:
# Visualising data distribution
import math
def plot_distribution_charts(numeric_columns, dataset, title=""):
    """Plot a histogram with a KDE overlay for each listed column of `dataset`.

    The charts are laid out on a grid three panels wide with as many rows as
    needed; panels left over in the last row are hidden.

    Parameters
    ----------
    numeric_columns : sequence of column labels
        Columns to plot, one panel each. Anything that supports `len()` and
        iteration works (list, `pandas.Index`, ...).
    dataset : object indexable by column label
        `dataset[col]` must yield the values for `col` (e.g. a DataFrame).
    title : str, optional
        Text appended to the figure-level title.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`. When `numeric_columns` is
        empty nothing is drawn and a short notice is printed instead.
    """
    num_cols = len(numeric_columns)
    if num_cols == 0:
        # A grid with zero rows is rejected by plt.subplots(), so bail out
        # before creating a figure.
        print(f"No columns to plot {title}")
        return

    # Define the number of rows and columns dynamically
    chart_cols = 3  # Fixed number of columns
    chart_rows = math.ceil(num_cols / chart_cols)  # Calculate required rows based on columns

    # squeeze=False always returns a 2-D array of Axes, whatever the grid shape
    fig, axes = plt.subplots(
        chart_rows,
        chart_cols,
        figsize=(15, 3 * chart_rows),
        constrained_layout=True,
        squeeze=False,
    )
    axes = axes.flatten()  # Flatten the axes for easy iteration

    for ax, col in zip(axes, numeric_columns):
        sns.histplot(dataset[col], kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')

    # Hide unused subplots (slice by count rather than a leaked loop index)
    for ax in axes[num_cols:]:
        ax.axis('off')

    fig.suptitle(f"Column distribution {title}")
    plt.show()


# Pick the float64/int64 columns and draw one distribution panel per column
numeric_columns_dataset = master_dataset.select_dtypes(
    include=['float64', 'int64']
).columns
plot_distribution_charts(
    numeric_columns=numeric_columns_dataset,
    dataset=master_dataset,
    title=title_master_dataset,
)