In [145]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw dataset from a hard-coded absolute path.
# NOTE(review): this path only resolves on a machine with the same D: drive layout;
# consider a configurable data directory.
dataset = pd.read_csv("D:/tierra/data/mexico_combined_data.csv")
# Coordinate column names. Later cells drop these from the numeric feature list
# and append them again when the modelling frame is built.
lat_long_cols = ['latitude', 'longitude']

In [146]:
# handle pre-processing
def get_numeric_cols(dataset, exclude=None):
    """Return the labels of the numeric columns of ``dataset``, minus excluded ones.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are inspected.
    exclude : list-like of str, optional
        Column labels to remove from the result. When omitted, the
        notebook-level ``lat_long_cols`` list is used, which keeps the
        original behaviour of dropping the coordinate columns.

    Returns
    -------
    pandas.Index
        Numeric column labels, in their original column order.

    Raises
    ------
    KeyError
        If a label in ``exclude`` is not among the numeric columns
        (propagated from ``Index.drop``).
    """
    if exclude is None:
        # Fall back to the coordinate columns defined at notebook level.
        exclude = lat_long_cols
    numeric_cols = dataset.select_dtypes(include=['number']).columns
    return numeric_cols.drop(exclude)

# Numeric feature columns (coordinates excluded) of the raw dataset.
numeric_cols = get_numeric_cols(dataset)
# Share of missing values per numeric column. Despite the name, `.isnull().mean()`
# yields a fraction between 0 and 1, not a 0-100 percentage.
missing_pct = dataset[numeric_cols].isnull().mean()

# Disabled: bar chart of the missing-value share per column.
# NOTE(review): the labels below say "Percentage" but the plotted values are fractions.
# plt.figure(figsize=(10, 6))
# missing_pct.sort_values(ascending=True).plot(kind='bar')
# plt.title('Percentage of Missing Values by Column')
# plt.xlabel('Columns')
# plt.ylabel('Percentage Missing')
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# plt.show()

In [147]:
# Parse the 'date' column as YYYY-MM-DD. With errors='coerce', values that do not
# match the format become NaT instead of raising.
dataset['date'] = pd.to_datetime(dataset['date'].values, format='%Y-%m-%d', errors='coerce')

# Drop columns with more than 70% missing values: `thresh` is the minimum number of
# non-null entries a column needs to be kept, here 30% of the row count.
# Note that `dataset` is rebound, so the raw frame is no longer reachable afterwards.
dataset = dataset.dropna(thresh=0.3 * len(dataset), axis=1)

In [148]:
#%pip install geopy

In [None]:
import seaborn as sns
from geopy.distance import geodesic
import numpy as np

def fill_missing_values_by_distance(df, feature_col, lat="latitude", long="longitude"):
    """Fill missing values of ``feature_col`` from the geographically nearest valid row.

    For every row where ``feature_col`` is null, the geodesic distance (km) to each
    row holding a non-null value is computed and the value of the closest one is
    copied. Ties go to the first valid row in frame order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    feature_col : str
        Column whose missing values are filled.
    lat, long : str
        Names of the coordinate columns; each pair is handed to ``geodesic`` in
        (lat, long) order.

    Returns
    -------
    pandas.Series
        Copy of ``df[feature_col]`` with missing entries replaced. The column is
        returned unchanged when nothing is missing, or when no row has a valid
        value to borrow from.

    Notes
    -----
    The cost is one ``geodesic`` call per (missing row, valid row) pair, which is
    slow on large frames.
    """
    # Work on a copy of the column so the caller's frame is never touched.
    filled = df[feature_col].copy()
    missing_mask = filled.isnull()

    # Nothing to fill, or no donor rows at all. The second case would otherwise
    # fail inside argmin on an empty distance list.
    if not missing_mask.any() or missing_mask.all():
        return filled

    missing_coords = df.loc[missing_mask, [lat, long]].values
    valid_coords = df.loc[~missing_mask, [lat, long]].values
    valid_values = filled[~missing_mask].values

    # One pass per missing row; only that row's distances are held in memory.
    # The result is always one index per missing row, so a single missing point
    # needs no special case.
    nearest_indices = [
        int(np.argmin([geodesic(m, v).km for v in valid_coords]))
        for m in missing_coords
    ]
    filled.loc[missing_mask] = valid_values[nearest_indices]

    return filled

# Recompute the numeric column list, since sparse columns were dropped from `dataset`.
numeric_cols = get_numeric_cols(dataset)
# Disabled alternative: nearest-neighbour (geodesic) imputation, one column at a time.
# dataset[numeric_cols] = dataset[numeric_cols].apply(
#     lambda col: fill_missing_values_by_distance(dataset, col.name)
# )

# Fill the remaining gaps in every numeric column with that column's mean.
# NOTE(review): this also imputes 'orgc' when it is among the numeric columns, and the
# means come from the whole dataset rather than a training subset - confirm this is intended.
dataset[numeric_cols] = dataset[numeric_cols].fillna(dataset[numeric_cols].mean())

# Modelling frame: imputed numeric features followed by the coordinate columns.
# `.copy()` detaches it from `dataset`.
df = dataset[numeric_cols.tolist() + lat_long_cols].copy()

# Disabled: histogram of the 'orgc' distribution.
# plt.figure(figsize=(10, 6))
# sns.histplot(df['orgc'], bins=30, kde=True)
# plt.title(f'Distribution of orgc')
# plt.show()
# Preview the first ten rows as the cell output.
df.head(10)

In [None]:
# Derived soil features. Each step is skipped when its source columns are absent.

# Organic matter and bulk density are both computed straight from 'orgc':
#   organic_matter = 1.724 * orgc
#   bulk_density   = 1.62 - 0.06 * organic_matter
if 'orgc' in df.columns:
    organic_matter = df['orgc'] * 1.724
    df = df.assign(**{
        'organic_matter': organic_matter,
        'bulk_density': 1.62 - organic_matter * 0.06,
    })

# Collapse silt and clay into one combined column and discard the two originals.
if 'silt' in df.columns and 'clay' in df.columns:
    df = df.assign(
        silt_plus_clay=df[['silt', 'clay']].sum(axis=1, skipna=True)
    ).drop(columns=['silt', 'clay'])


# Report the resulting shape and column labels.
print(df.shape, df.columns, sep="\n")

In [None]:
import geopandas as gpd
import folium

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# introduce geospatial features
def load_grids(file_path):
    """Read the vector file at ``file_path`` with GeoPandas and return the resulting frame."""
    return gpd.read_file(file_path)

def load_folium_map(grid_gdf):
    """Build a folium map centred on the grid, with the grid cells drawn as an overlay.

    The map centre is the mean of the cell centroids' coordinates and the initial
    zoom level is 8. Cells are outlined thinly in blue without fill; the highlight
    style thickens the outline and adds a half-opaque fill.

    Parameters
    ----------
    grid_gdf : geopandas.GeoDataFrame
        Grid polygons to display.

    Returns
    -------
    folium.Map
        The map with a "10km Grid" GeoJson layer added.
    """
    def _outline_style(_feature):
        return {'color': 'blue', 'weight': 0.2, 'fillOpacity': 0}

    def _highlight_style(_feature):
        return {'weight': 3, 'fillOpacity': 0.5}

    centroids = grid_gdf.geometry.centroid
    map_center = [centroids.y.mean(), centroids.x.mean()]

    m = folium.Map(location=map_center, zoom_start=8)

    grid_layer = folium.GeoJson(
        grid_gdf,
        name="10km Grid",
        style_function=_outline_style,
        highlight_function=_highlight_style,
    )
    grid_layer.add_to(m)

    return m

grid_gdf = load_grids("D:/tierra/data/grids/mexico_grid_10km.shp")
# Create GeoDataFrame with geometry and sequential grid_id
gdf = gpd.GeoDataFrame(
    {'grid_id': range(1, len(grid_gdf) + 1)},
    geometry=grid_gdf.geometry
)

# Convert df to a point GeoDataFrame using latitude/longitude.
# A copy is passed so `df` itself never gains a geometry column (some pandas
# versions share storage between `df` and a frame constructed from it).
# NOTE(review): the coordinates are assumed to be WGS84 degrees (EPSG:4326) - confirm.
df_gdf = gpd.GeoDataFrame(
    df.copy(),
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    crs="EPSG:4326",
)
# Bring the points into the grid's CRS so the containment test compares like with
# like. Without this, a projected grid would match no points at all.
if gdf.crs is not None:
    df_gdf = df_gdf.to_crs(gdf.crs)

# Perform spatial join between grid polygons and points (one row per cell/point pair;
# cells without any point are kept once with missing attributes).
gdf = gpd.sjoin(gdf, df_gdf, how="left", predicate="contains")

# Drop unnecessary columns from the spatial join
gdf = gdf.drop(columns=['index_right'], errors='ignore')

print(gdf.shape)
print(gdf.columns)

# The left join keeps empty grid cells as rows with missing attributes. Plot only
# rows that carry a sample, otherwise every empty cell is counted as one point.
sampled_cells = gdf[gdf['latitude'].notna()]
cell_centroids = sampled_cells.geometry.centroid

# create a figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# plot frequency of grid_id
ax1.hist(sampled_cells['grid_id'], bins=100)
ax1.set_title('Frequency of Grid ID')
ax1.set_xlabel('Grid ID')
ax1.set_ylabel('Frequency')

# plot heatmap of sample counts, located at the centroid of the containing cell
# NOTE(review): the axis labels assume the grid CRS is geographic (degrees) - confirm.
hist2d = ax2.hist2d(cell_centroids.x, cell_centroids.y, bins=50, cmap='YlOrRd')
ax2.set_title('Spatial Distribution of Sample Points')
ax2.set_xlabel('Longitude')
ax2.set_ylabel('Latitude')

# adjust layout and display
plt.tight_layout()
plt.show()


In [None]:
# introduce spatial features in the dataset


In [176]:
import tensorflow as tf
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Input # type: ignore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# deep learning for organic carbon prediction

# Seed TensorFlow so weight initialisation and shuffling are repeatable across runs.
tf.random.set_seed(42)

# 'organic_matter' and 'bulk_density' are computed directly from 'orgc' earlier in the
# notebook. Leaving them in X would hand the model the target, so they are removed
# together with 'orgc'. errors='ignore' covers the case where they were never created.
leaky_cols = ['organic_matter', 'bulk_density']
X = df.drop(columns=['orgc']).drop(columns=leaky_cols, errors='ignore')
y = df['orgc']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardize the data (scaler statistics come from the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# build the model
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'), # first hidden layer with 64 neurons
    Dense(32, activation='relu'), # second hidden layer with 32 neurons
    Dense(1) # output layer with 1 neuron
])

# compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Fit the network for 100 epochs in batches of 32, with validation_split=0.2 of the
# training data set aside for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
)

# Loss curves: training against validation, one point per epoch.
plt.figure(figsize=(10, 5))
for metric_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Training History Organic Carbon')
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.show()

In [None]:
# Score the fitted model on the held-out test split. The compiled loss is MSE.
mse = model.evaluate(X_test, y_test)
print(f'Mean Squared Error: {mse}')

# Predictions for the test split.
y_pred = model.predict(X_test)

# Scatter of actual against predicted values. The dotted red diagonal spans the
# range of the actual values and marks where a perfect prediction would fall.
actual_range = [y_test.min(), y_test.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(actual_range, actual_range, 'r:', label='Perfect Prediction')
plt.title('Actual vs Predicted Organic Carbon')
plt.xlabel('Actual Organic Carbon (g/Kg)')
plt.ylabel('Predicted Organic Carbon (g/Kg)')
plt.legend()
plt.show()