# DL on ZIM clinic data

In [None]:
import pandas as pd

# Load the raw clinic records from the exported CSV.
df = pd.read_csv('ZIM_clinic_data - ZIM_clinic_data.csv.csv')

# Preview the first 5 rows as a left-aligned markdown table.
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the dataset
df = pd.read_csv('ZIM_clinic_data - ZIM_clinic_data.csv.csv')

# Prepare data for classification: rows without a label cannot be used.
df_classification = df.dropna(subset=['mainlander_on_zb']).copy()

features = ['agegroup', 'sex', 'occupation', 'home_district', 'travel', 'travel_over_4_nights', 'travel_over_14_nights']
target = 'mainlander_on_zb'

categorical_features = ['agegroup', 'sex', 'occupation', 'home_district']
numerical_features = ['travel', 'travel_over_4_nights', 'travel_over_14_nights']

# Explicit copy: the imputation below assigns columns, which on a plain column
# selection triggers pandas' SettingWithCopyWarning.
X = df_classification[features].copy()
y = df_classification[target]

# Split BEFORE imputing/encoding so that every statistic (mean, mode, one-hot
# vocabulary) is learned from the training rows only and nothing leaks from
# the held-out 20%.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_raw = X_train_raw.copy()
X_test_raw = X_test_raw.copy()

# Impute missing values with training-set statistics.
for col in numerical_features:
    fill_value = X_train_raw[col].mean()
    X_train_raw[col] = X_train_raw[col].fillna(fill_value)
    X_test_raw[col] = X_test_raw[col].fillna(fill_value)
for col in categorical_features:
    fill_value = X_train_raw[col].mode()[0]
    X_train_raw[col] = X_train_raw[col].fillna(fill_value)
    X_test_raw[col] = X_test_raw[col].fillna(fill_value)

# Preprocessing: one-hot encode the categorical features and pass the numeric
# ones through unchanged. sparse_threshold=0 forces a dense NumPy result; a
# scipy sparse matrix is not reliably accepted by
# model.fit(..., validation_split=...), which is how these arrays are consumed.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
    sparse_threshold=0
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Build and train a deep learning model: two ReLU hidden layers feeding one
# sigmoid unit (binary classification).
n_inputs = X_train.shape[1]
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_inputs,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split=0.2 asks Keras to hold out part of the training data for
# per-epoch validation.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    verbose=1,
)

# Evaluate the model on the held-out test set; evaluate() yields the loss
# followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = test_metrics
print(f"Tabular Classification Model Accuracy: {accuracy:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# Load the dataset
df = pd.read_csv('ZIM_clinic_data - ZIM_clinic_data.csv.csv')

# Prepare data for time series analysis: parse the '%Y-%m' date strings on a
# new frame (df itself is left untouched), then total 'travel' per distinct date.
df_ts = df.assign(date=pd.to_datetime(df['date'], format='%Y-%m'))
travelers_by_month = df_ts.groupby('date', as_index=False)['travel'].sum()

# Prepare data for LSTM model
def create_sequences(data, n_steps):
    """Turn a series into sliding-window samples for next-step prediction.

    Window ``i`` holds ``data[i:i + n_steps]`` and its target is
    ``data[i + n_steps]``.

    Args:
        data: Array-like series, ordered along its first axis.
        n_steps: Number of consecutive observations in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(len(data) - n_steps, n_steps, ...)`` and ``y`` has shape
        ``(len(data) - n_steps, ...)``. When the series is not longer than
        ``n_steps`` both arrays are empty but keep that shape, so a caller that
        reads ``X.shape[1]`` (e.g. to reshape for an LSTM) still works instead
        of failing on a 1-D empty array.
    """
    series = np.asarray(data)
    n_samples = len(series) - n_steps
    if n_samples <= 0:
        # No complete (window, target) pair exists: return well-shaped empties.
        trailing = series.shape[1:]
        empty_X = np.empty((0, n_steps) + trailing, dtype=series.dtype)
        empty_y = np.empty((0,) + trailing, dtype=series.dtype)
        return empty_X, empty_y
    X = np.stack([series[start:start + n_steps] for start in range(n_samples)])
    # Targets are simply the series shifted by n_steps; copy to detach from the input.
    y = series[n_steps:].copy()
    return X, y

from sklearn.metrics import mean_squared_error

n_steps = 3  # Use the last 3 months to predict the next
X_ts, y_ts = create_sequences(travelers_by_month['travel'].values, n_steps)
# Add a trailing feature axis to match the LSTM's input_shape=(n_steps, 1).
X_ts = X_ts.reshape((X_ts.shape[0], X_ts.shape[1], 1))

# Split into train/test chronologically. shuffle=False keeps the last 20% of
# the windows as the test set; a shuffled split would train on windows that
# overlap (and post-date) the test targets, which leaks future values and
# overstates forecast quality.
X_train_ts, X_test_ts, y_train_ts, y_test_ts = train_test_split(X_ts, y_ts, test_size=0.2, shuffle=False)

# Build and train an LSTM model
model_ts = Sequential([
    LSTM(50, activation='relu', input_shape=(n_steps, 1)),
    Dense(1)
])
model_ts.compile(optimizer='adam', loss='mse')
model_ts.fit(X_train_ts, y_train_ts, epochs=200, verbose=1)

# Make predictions on the held-out (most recent) windows and score them.
y_pred_ts = model_ts.predict(X_test_ts, verbose=0)
mse = mean_squared_error(y_test_ts, y_pred_ts)
print(f"Time Series Forecasting Model MSE: {mse:.4f}")

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Load the dataset
df = pd.read_csv('ZIM_clinic_data - ZIM_clinic_data.csv.csv')

# Prepare data for entity embeddings: keep only rows where both the occupation
# and the label are present.
required_columns = ['occupation', 'mainlander_on_zb']
df_embeddings = df.dropna(subset=required_columns).copy()

# Map every distinct occupation to an integer id.
le = LabelEncoder()
le.fit(df_embeddings['occupation'])
df_embeddings['occupation_encoded'] = le.transform(df_embeddings['occupation'])

# One embedding row is needed per distinct occupation.
vocab_size = len(le.classes_)

y_emb = df_embeddings['mainlander_on_zb']
X_emb = df_embeddings['occupation_encoded']

# Build a model with an embedding layer: occupation id -> 5-d vector ->
# single sigmoid unit.
embedding_dim = 5
occupation_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=1,
    name='occupation_embedding',
)
model_emb = Sequential([
    occupation_layer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model_emb.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model_emb.fit(X_emb, y_emb, batch_size=32, epochs=10, verbose=1)

# Extract embeddings: the layer's first weight array, indexed below by
# occupation label (one row per encoder class).
occupation_embeddings = occupation_layer.get_weights()[0]
embeddings_df = pd.DataFrame(occupation_embeddings, index=le.classes_)
print("Entity Embeddings for 'occupation' (first 5 rows):\n")
print(embeddings_df.head())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Flatten
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data Preparation (Re-running from previous steps) ---
df = pd.read_csv('ZIM_clinic_data - ZIM_clinic_data.csv.csv')

# --- Tabular Classification Plots ---
# Prepare data: rows without a label cannot be used.
df_classification = df.dropna(subset=['mainlander_on_zb']).copy()
features = ['agegroup', 'sex', 'occupation', 'home_district', 'travel', 'travel_over_4_nights', 'travel_over_14_nights']
target = 'mainlander_on_zb'
categorical_features = ['agegroup', 'sex', 'occupation', 'home_district']
numerical_features = ['travel', 'travel_over_4_nights', 'travel_over_14_nights']

# Explicit copy: the imputation below assigns columns, which on a plain column
# selection triggers pandas' SettingWithCopyWarning.
X = df_classification[features].copy()
y = df_classification[target]

# Split first so the imputation values and the one-hot vocabulary are learned
# from the training rows only (no leakage from the held-out 20%).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_raw = X_train_raw.copy()
X_test_raw = X_test_raw.copy()
for col in numerical_features:
    fill_value = X_train_raw[col].mean()
    X_train_raw[col] = X_train_raw[col].fillna(fill_value)
    X_test_raw[col] = X_test_raw[col].fillna(fill_value)
for col in categorical_features:
    fill_value = X_train_raw[col].mode()[0]
    X_train_raw[col] = X_train_raw[col].fillna(fill_value)
    X_test_raw[col] = X_test_raw[col].fillna(fill_value)

# sparse_threshold=0 forces a dense NumPy result; a scipy sparse matrix is not
# reliably accepted by model.fit(..., validation_split=...).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
    sparse_threshold=0
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Build and train model (same architecture as the tabular classifier:
# 64 -> 32 -> 1 sigmoid); the fit history is kept for the accuracy curves.
n_inputs = X_train.shape[1]
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_inputs,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    verbose=0,
)

# Threshold the predicted probabilities at 0.5 to obtain hard class labels.
test_probabilities = model.predict(X_test, verbose=0)
y_pred_class = (test_probabilities > 0.5).astype("int32")
cm = confusion_matrix(y_test, y_pred_class)

# Plotting: training curves on the left, confusion matrix on the right.
fig, (ax_accuracy, ax_confusion) = plt.subplots(1, 2, figsize=(18, 6))

ax_accuracy.plot(history.history['accuracy'], label='Training Accuracy')
ax_accuracy.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax_accuracy.set_title('Model Accuracy over Epochs')
ax_accuracy.set_xlabel('Epoch')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.legend()
ax_accuracy.grid(True)

# The same two tick labels are used on both heatmap axes.
class_labels = ['Non-Mainlander', 'Mainlander']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax_confusion,
)
ax_confusion.set_title('Confusion Matrix')
ax_confusion.set_ylabel('Actual')
ax_confusion.set_xlabel('Predicted')

fig.tight_layout()
plt.show()

# --- Time Series Analysis Plots ---
# Prepare data: parse the '%Y-%m' date strings on a new frame (df itself is
# left untouched), then total 'travel' per distinct date.
df_ts = df.assign(date=pd.to_datetime(df['date'], format='%Y-%m'))
travelers_by_month = df_ts.groupby('date', as_index=False)['travel'].sum()
def create_sequences(data, n_steps):
    """Turn a series into sliding-window samples for next-step prediction.

    Window ``i`` holds ``data[i:i + n_steps]`` and its target is
    ``data[i + n_steps]``.

    Args:
        data: Array-like series, ordered along its first axis.
        n_steps: Number of consecutive observations in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(len(data) - n_steps, n_steps, ...)`` and ``y`` has shape
        ``(len(data) - n_steps, ...)``. When the series is not longer than
        ``n_steps`` both arrays are empty but keep that shape, so a caller that
        reads ``X.shape[1]`` (e.g. to reshape for an LSTM) still works instead
        of failing on a 1-D empty array.
    """
    series = np.asarray(data)
    n_samples = len(series) - n_steps
    if n_samples <= 0:
        # No complete (window, target) pair exists: return well-shaped empties.
        trailing = series.shape[1:]
        empty_X = np.empty((0, n_steps) + trailing, dtype=series.dtype)
        empty_y = np.empty((0,) + trailing, dtype=series.dtype)
        return empty_X, empty_y
    X = np.stack([series[start:start + n_steps] for start in range(n_samples)])
    # Targets are simply the series shifted by n_steps; copy to detach from the input.
    y = series[n_steps:].copy()
    return X, y
n_steps = 3
X_ts, y_ts = create_sequences(travelers_by_month['travel'].values, n_steps)
# Add a trailing feature axis to match the LSTM's input_shape=(n_steps, 1).
X_ts = X_ts.reshape((X_ts.shape[0], X_ts.shape[1], 1))

# Chronological split: shuffle=False makes the test set the final 20% of the
# windows. This matters twice here: it avoids training on windows that
# post-date the test targets, and it makes the test targets line up with the
# last len(y_test_ts) dates, which is what the plot below assumes. A shuffled
# split would pair randomly sampled values with those dates.
X_train_ts, X_test_ts, y_train_ts, y_test_ts = train_test_split(X_ts, y_ts, test_size=0.2, shuffle=False)

# Build and train model
model_ts = Sequential([
    LSTM(50, activation='relu', input_shape=(n_steps, 1)),
    Dense(1)
])
model_ts.compile(optimizer='adam', loss='mse')
model_ts.fit(X_train_ts, y_train_ts, epochs=200, verbose=0)
y_pred_ts = model_ts.predict(X_test_ts, verbose=0)

# Plotting: dates of the test targets, i.e. the tail of the monthly series.
test_dates = travelers_by_month['date'].iloc[len(travelers_by_month) - len(y_test_ts):]
plt.figure(figsize=(10, 6))
plt.plot(test_dates, y_test_ts, label='Actual Travelers', marker='o')
plt.plot(test_dates, y_pred_ts, label='Predicted Travelers', marker='x')
plt.title('Time Series Forecasting: Actual vs. Predicted')
plt.xlabel('Date')
plt.ylabel('Number of Travelers')
plt.legend()
plt.grid(True)
plt.show()

# --- Entity Embeddings Plots ---
# Prepare data: keep rows with both an occupation and a label, then map each
# distinct occupation to an integer id.
required_columns = ['occupation', 'mainlander_on_zb']
df_embeddings = df.dropna(subset=required_columns).copy()
le = LabelEncoder()
le.fit(df_embeddings['occupation'])
df_embeddings['occupation_encoded'] = le.transform(df_embeddings['occupation'])
vocab_size = len(le.classes_)
y_emb = df_embeddings['mainlander_on_zb']
X_emb = df_embeddings['occupation_encoded']

# Build and train model: occupation id -> 5-d embedding -> sigmoid unit.
embedding_dim = 5
occupation_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=1,
    name='occupation_embedding',
)
model_emb = Sequential([
    occupation_layer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model_emb.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_emb.fit(X_emb, y_emb, batch_size=32, epochs=10, verbose=0)

# The layer's first weight array holds the learned embedding vectors.
occupation_embeddings = occupation_layer.get_weights()[0]

# Plotting with PCA: project the embedding vectors onto their first two
# principal components and label every point with its occupation.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(occupation_embeddings)
embeddings_df = pd.DataFrame(embeddings_2d, columns=['PC1', 'PC2']).assign(occupation=le.classes_)

plt.figure(figsize=(12, 10))
plt.scatter(embeddings_df['PC1'], embeddings_df['PC2'])
# Offset each label slightly so the text does not sit on top of its marker.
for txt, pc1, pc2 in zip(embeddings_df['occupation'], embeddings_df['PC1'], embeddings_df['PC2']):
    plt.annotate(txt, (pc1 + 0.01, pc2 + 0.01), fontsize=8)
plt.title('2D PCA of Occupation Embeddings')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()

# DL on Africa

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the dataset
df = pd.read_csv('AfricaDataset.csv.csv')

# Drop columns that are not used as features.
# NOTE(review): the 'Unnamed: N' columns are presumably empty export
# artifacts — confirm against the CSV.
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = df.drop(columns=columns_to_drop)

# Features and target
target = 'PfPR2-10'
features = ['COUNTRY', 'Lat', 'Long', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
X_reg, y_reg = df[features], df[target]

# Columns to one-hot encode; every other feature is passed through as-is.
categorical_features_reg = ['COUNTRY', 'METHOD']

# Split data into training and testing sets
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Create a preprocessor for one-hot encoding
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_reg)
preprocessor = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')

# Apply preprocessing: fit on the training rows, reuse the fitted encoder on test.
X_train_processed_reg = preprocessor.fit_transform(X_train_reg)
X_test_processed_reg = preprocessor.transform(X_test_reg)

# Build a feedforward neural network: 128 -> 64 -> 1, with a linear output
# unit because the target is a continuous value.
n_inputs_reg = X_train_processed_reg.shape[1]
model_reg = Sequential()
model_reg.add(Dense(128, activation='relu', input_shape=(n_inputs_reg,)))
model_reg.add(Dense(64, activation='relu'))
model_reg.add(Dense(1, activation='linear'))

model_reg.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

# Train the model
model_reg.fit(
    X_train_processed_reg,
    y_train_reg,
    batch_size=64,
    epochs=10,
    verbose=1,
)

# Evaluate the model: evaluate() yields the MSE loss followed by the MAE metric.
test_metrics_reg = model_reg.evaluate(X_test_processed_reg, y_test_reg, verbose=0)
loss_reg, mae_reg = test_metrics_reg
print(f"Regression Model Mean Absolute Error (MAE): {mae_reg:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# Load the dataset
df = pd.read_csv('AfricaDataset.csv.csv')

# Drop irrelevant columns
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = df.drop(columns=columns_to_drop)

# Filter data for one country and aggregate by month
is_angola = df['COUNTRY'] == 'Angola'
df_ts = df.loc[is_angola].copy()
# Build 'YY-MM' strings and let pandas parse them.
# NOTE(review): assumes YY and MM render as plain integers; float columns
# would produce strings like '1995.0-3.0' — confirm the dtypes.
year_month = df_ts['YY'].astype(str) + '-' + df_ts['MM'].astype(str)
df_ts['date'] = pd.to_datetime(year_month)
monthly_data = df_ts.groupby('date', as_index=False)['PfPR2-10'].mean()

# Prepare data for LSTM model
def create_sequences(data, n_steps):
    """Turn a series into sliding-window samples for next-step prediction.

    Window ``i`` holds ``data[i:i + n_steps]`` and its target is
    ``data[i + n_steps]``.

    Args:
        data: Array-like series, ordered along its first axis.
        n_steps: Number of consecutive observations in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(len(data) - n_steps, n_steps, ...)`` and ``y`` has shape
        ``(len(data) - n_steps, ...)``. When the series is not longer than
        ``n_steps`` both arrays are empty but keep that shape, so a caller that
        reads ``X.shape[1]`` (e.g. to reshape for an LSTM) still works instead
        of failing on a 1-D empty array.
    """
    series = np.asarray(data)
    n_samples = len(series) - n_steps
    if n_samples <= 0:
        # No complete (window, target) pair exists: return well-shaped empties.
        trailing = series.shape[1:]
        empty_X = np.empty((0, n_steps) + trailing, dtype=series.dtype)
        empty_y = np.empty((0,) + trailing, dtype=series.dtype)
        return empty_X, empty_y
    X = np.stack([series[start:start + n_steps] for start in range(n_samples)])
    # Targets are simply the series shifted by n_steps; copy to detach from the input.
    y = series[n_steps:].copy()
    return X, y

n_steps = 12  # Use the last 12 months to predict the next
X_ts, y_ts = create_sequences(monthly_data['PfPR2-10'].values, n_steps)
# Add a trailing feature axis to match the LSTM's input_shape=(n_steps, 1).
X_ts = X_ts.reshape((X_ts.shape[0], X_ts.shape[1], 1))

# Split into train/test chronologically. shuffle=False keeps the last 20% of
# the windows as the test set; with 12-step overlapping windows a shuffled
# split would put most test targets inside training windows and so leak
# future values into training.
X_train_ts, X_test_ts, y_train_ts, y_test_ts = train_test_split(X_ts, y_ts, test_size=0.2, shuffle=False)

# Build and train an LSTM model
model_ts = Sequential([
    LSTM(50, activation='relu', input_shape=(n_steps, 1)),
    Dense(1)
])
model_ts.compile(optimizer='adam', loss='mse')
model_ts.fit(X_train_ts, y_train_ts, epochs=100, verbose=0)

# Make predictions on the held-out (most recent) windows and evaluate.
y_pred_ts = model_ts.predict(X_test_ts, verbose=0)
mse_ts = mean_squared_error(y_test_ts, y_pred_ts)
print(f"Time Series Forecasting Model MSE: {mse_ts:.4f}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the dataset
df = pd.read_csv('AfricaDataset.csv.csv')

# Drop columns that are not used as features.
# NOTE(review): the 'Unnamed: N' columns are presumably empty export
# artifacts — confirm against the CSV.
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = df.drop(columns=columns_to_drop)

# Features including geographical data (COUNTRY is not part of this set).
target = 'PfPR2-10'
features_geo = ['Lat', 'Long', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
X_geo, y_geo = df[features_geo], df[target]

# Identify categorical features for preprocessing
categorical_features_geo = ['METHOD']

# Hold out 20% of the rows for testing.
X_train_geo, X_test_geo, y_train_geo, y_test_geo = train_test_split(
    X_geo, y_geo, test_size=0.2, random_state=42
)

# One-hot encode METHOD, pass the remaining columns through; fit on the
# training rows and reuse the fitted encoder on the test rows.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
preprocessor_geo = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Build a feedforward neural network: 128 -> 64 -> 1 with a linear output.
n_inputs_geo = X_train_processed_geo.shape[1]
model_geo = Sequential()
model_geo.add(Dense(128, activation='relu', input_shape=(n_inputs_geo,)))
model_geo.add(Dense(64, activation='relu'))
model_geo.add(Dense(1, activation='linear'))

model_geo.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

# Train the model
model_geo.fit(
    X_train_processed_geo,
    y_train_geo,
    batch_size=64,
    epochs=10,
    verbose=1,
)

# Evaluate the model: evaluate() yields the MSE loss followed by the MAE metric.
test_metrics_geo = model_geo.evaluate(X_test_processed_geo, y_test_geo, verbose=0)
loss_geo, mae_geo = test_metrics_geo
print(f"Geospatial Regression Model Mean Absolute Error (MAE): {mae_geo:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# --- 1. Visualize Predicted PfPR2-10 on a Map ---

# Load and preprocess the data
df = pd.read_csv('AfricaDataset.csv.csv')
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = df.drop(columns=columns_to_drop)

target = 'PfPR2-10'
features_geo = ['Lat', 'Long', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
categorical_features_geo = ['METHOD']
X_geo, y_geo = df[features_geo], df[target]

# Hold out 20% of the rows for testing.
X_train_geo, X_test_geo, y_train_geo, y_test_geo = train_test_split(
    X_geo, y_geo, test_size=0.2, random_state=42
)

# One-hot encode METHOD, pass the remaining columns through; fit on train only.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
preprocessor_geo = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Re-run the geospatial regression model to get predictions
n_inputs_geo = X_train_processed_geo.shape[1]
model_geo = Sequential()
model_geo.add(Dense(128, activation='relu', input_shape=(n_inputs_geo,)))
model_geo.add(Dense(64, activation='relu'))
model_geo.add(Dense(1, activation='linear'))
model_geo.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model_geo.fit(
    X_train_processed_geo,
    y_train_geo,
    batch_size=64,
    epochs=10,
    verbose=0,
)
y_pred_geo = model_geo.predict(X_test_processed_geo, verbose=0)

# Create a DataFrame for plotting: test-set coordinates with the observed and
# predicted target values. ravel() flattens the (n, 1) prediction array into
# one value per row.
plot_df = X_test_geo[['Lat', 'Long']].copy()
plot_df['Actual'] = y_test_geo.values
plot_df['Predicted'] = y_pred_geo.ravel()

# Create the map plot. Only the predictions are drawn: the earlier version
# also scattered the actual values at the same coordinates, which painted over
# the predicted points, and its colorbar (labelled "Predicted") was bound to
# that last, actual-valued scatter. The colorbar is now built explicitly from
# the predicted scatter so that colours, label and title agree.
fig, ax = plt.subplots(figsize=(12, 10))
predicted_points = ax.scatter(x=plot_df['Long'], y=plot_df['Lat'], c=plot_df['Predicted'], cmap='viridis', s=10, alpha=0.7)
fig.colorbar(predicted_points, ax=ax, label='Predicted PfPR2-10')
ax.set_title('Predicted PfPR2-10 Values on a Map')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(True)
plt.show()


# --- 2. Spatial Feature Engineering and New Model ---

print("\n### Geospatial Regression with Advanced Spatial Features\n")

# Add new spatial features to the dataset: squared coordinates and their
# product, giving the network explicit second-order terms in Lat/Long.
df['Lat_sq'] = df['Lat'].pow(2)
df['Long_sq'] = df['Long'].pow(2)
df['Lat_Long_inter'] = df['Lat'].mul(df['Long'])

# Define new features
features_geo_adv = ['Lat', 'Long', 'Lat_sq', 'Long_sq', 'Lat_Long_inter', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
categorical_features_geo_adv = ['METHOD']
X_geo_adv, y_geo_adv = df[features_geo_adv], df[target]

# Hold out 20% of the rows for testing.
X_train_geo_adv, X_test_geo_adv, y_train_geo_adv, y_test_geo_adv = train_test_split(
    X_geo_adv, y_geo_adv, test_size=0.2, random_state=42
)

# One-hot encode METHOD, pass the remaining columns through; fit on train only.
one_hot_step_adv = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo_adv)
preprocessor_geo_adv = ColumnTransformer(transformers=[one_hot_step_adv], remainder='passthrough')
X_train_processed_geo_adv = preprocessor_geo_adv.fit_transform(X_train_geo_adv)
X_test_processed_geo_adv = preprocessor_geo_adv.transform(X_test_geo_adv)

# Build a new feedforward neural network (same 128 -> 64 -> 1 layout).
n_inputs_geo_adv = X_train_processed_geo_adv.shape[1]
model_geo_adv = Sequential()
model_geo_adv.add(Dense(128, activation='relu', input_shape=(n_inputs_geo_adv,)))
model_geo_adv.add(Dense(64, activation='relu'))
model_geo_adv.add(Dense(1, activation='linear'))

model_geo_adv.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

# Train the new model
model_geo_adv.fit(
    X_train_processed_geo_adv,
    y_train_geo_adv,
    batch_size=64,
    epochs=10,
    verbose=0,
)

# Evaluate the new model: evaluate() yields the MSE loss followed by the MAE metric.
test_metrics_geo_adv = model_geo_adv.evaluate(X_test_processed_geo_adv, y_test_geo_adv, verbose=0)
loss_geo_adv, mae_geo_adv = test_metrics_geo_adv
print(f"Geospatial Regression Model with Advanced Features Mean Absolute Error (MAE): {mae_geo_adv:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Load and preprocess the data
df = pd.read_csv('AfricaDataset.csv.csv')
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = df.drop(columns=columns_to_drop)

target = 'PfPR2-10'
features_geo = ['Lat', 'Long', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
categorical_features_geo = ['METHOD']
X_geo, y_geo = df[features_geo], df[target]

# Hold out 20% of the rows for testing.
X_train_geo, X_test_geo, y_train_geo, y_test_geo = train_test_split(
    X_geo, y_geo, test_size=0.2, random_state=42
)

# One-hot encode METHOD, pass the remaining columns through; fit on train only.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
preprocessor_geo = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Re-run the geospatial regression model to get predictions
n_inputs_geo = X_train_processed_geo.shape[1]
model_geo = Sequential()
model_geo.add(Dense(128, activation='relu', input_shape=(n_inputs_geo,)))
model_geo.add(Dense(64, activation='relu'))
model_geo.add(Dense(1, activation='linear'))
model_geo.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model_geo.fit(
    X_train_processed_geo,
    y_train_geo,
    batch_size=64,
    epochs=10,
    verbose=0,
)
y_pred_geo = model_geo.predict(X_test_processed_geo, verbose=0)


# Create a DataFrame for plotting: test-set coordinates plus the observed and
# predicted target values.
plot_df = X_test_geo[['Lat', 'Long']].copy()
plot_df['Actual'] = y_test_geo.values
plot_df['Predicted'] = y_pred_geo

# Create a single plot with two subplots for comparison
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
ax_actual, ax_predicted = axes

# Plot actual values (left panel, own colorbar).
scatter_actual = ax_actual.scatter(x=plot_df['Long'], y=plot_df['Lat'], c=plot_df['Actual'], cmap='viridis', s=10, alpha=0.7)
ax_actual.set(title='Actual PfPR2-10 Values', xlabel='Longitude', ylabel='Latitude')
ax_actual.grid(True)
fig.colorbar(scatter_actual, ax=ax_actual, label='Actual PfPR2-10')

# Plot predicted values (right panel, own colorbar).
scatter_predicted = ax_predicted.scatter(x=plot_df['Long'], y=plot_df['Lat'], c=plot_df['Predicted'], cmap='viridis', s=10, alpha=0.7)
ax_predicted.set(title='Predicted PfPR2-10 Values', xlabel='Longitude', ylabel='Latitude')
ax_predicted.grid(True)
fig.colorbar(scatter_predicted, ax=ax_predicted, label='Predicted PfPR2-10')

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Load and preprocess the data
df = pd.read_csv('AfricaDataset.csv.csv')
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = df.drop(columns=columns_to_drop)

target = 'PfPR2-10'
features_geo = ['Lat', 'Long', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
categorical_features_geo = ['METHOD']
X_geo, y_geo = df[features_geo], df[target]

# Hold out 20% of the rows for testing.
X_train_geo, X_test_geo, y_train_geo, y_test_geo = train_test_split(
    X_geo, y_geo, test_size=0.2, random_state=42
)

# One-hot encode METHOD, pass the remaining columns through; fit on train only.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
preprocessor_geo = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Re-run the geospatial regression model to get predictions
n_inputs_geo = X_train_processed_geo.shape[1]
model_geo = Sequential()
model_geo.add(Dense(128, activation='relu', input_shape=(n_inputs_geo,)))
model_geo.add(Dense(64, activation='relu'))
model_geo.add(Dense(1, activation='linear'))
model_geo.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model_geo.fit(
    X_train_processed_geo,
    y_train_geo,
    batch_size=64,
    epochs=10,
    verbose=0,
)
y_pred_geo = model_geo.predict(X_test_processed_geo, verbose=0)

# Import the metric functions right where they are used: this cell's import
# lines do not include sklearn.metrics, so on a freshly restarted kernel the
# calls below raised NameError for mean_absolute_error / r2_score.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the test-set predictions with three complementary regression metrics.
mae = mean_absolute_error(y_test_geo, y_pred_geo)
r2 = r2_score(y_test_geo, y_pred_geo)
# RMSE is the square root of the MSE, i.e. in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_test_geo, y_pred_geo))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
# Create a DataFrame for plotting: test-set coordinates plus the observed and
# predicted target values.
plot_df = X_test_geo[['Lat', 'Long']].copy()
plot_df['Actual'] = y_test_geo.values
plot_df['Predicted'] = y_pred_geo

# Create a single plot with two subplots for comparison
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
ax_actual, ax_predicted = axes

# Plot actual values (left panel, own colorbar).
scatter_actual = ax_actual.scatter(x=plot_df['Long'], y=plot_df['Lat'], c=plot_df['Actual'], cmap='viridis', s=10, alpha=0.7)
ax_actual.set(title='Actual PfPR2-10 Values', xlabel='Longitude', ylabel='Latitude')
ax_actual.grid(True)
fig.colorbar(scatter_actual, ax=ax_actual, label='Actual PfPR2-10')

# Plot predicted values (right panel, own colorbar).
scatter_predicted = ax_predicted.scatter(x=plot_df['Long'], y=plot_df['Lat'], c=plot_df['Predicted'], cmap='viridis', s=10, alpha=0.7)
ax_predicted.set(title='Predicted PfPR2-10 Values', xlabel='Longitude', ylabel='Latitude')
ax_predicted.grid(True)
fig.colorbar(scatter_predicted, ax=ax_predicted, label='Predicted PfPR2-10')

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Load and preprocess the data
df = pd.read_csv('AfricaDataset.csv.csv')
columns_to_drop = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = df.drop(columns=columns_to_drop)

# Create new feature: distance to the equator, expressed as absolute latitude.
df['dist_to_equator'] = df['Lat'].abs()

# Features and target
target = 'PfPR2-10'
features = ['COUNTRY', 'Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
X_geo, y_geo = df[features], df[target]

# Identify categorical and numerical features for the pipeline
numerical_features_geo = ['Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf']
categorical_features_geo = ['COUNTRY', 'METHOD']

# Split data into training and testing sets
X_train_geo, X_test_geo, y_train_geo, y_test_geo = train_test_split(
    X_geo, y_geo, test_size=0.2, random_state=42
)

# Create a preprocessing pipeline with scaling and one-hot encoding: numeric
# columns are standardised, categorical columns are one-hot encoded.
scaling_step = ('num', StandardScaler(), numerical_features_geo)
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
preprocessor_geo = ColumnTransformer(transformers=[scaling_step, one_hot_step])

# Apply preprocessing to training and testing sets (fit on train only).
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Build a new feedforward neural network with increased capacity:
# 256 -> 128 -> 64 -> 1 with a linear output.
n_inputs_geo = X_train_processed_geo.shape[1]
model_geo_tuned = Sequential()
model_geo_tuned.add(Dense(256, activation='relu', input_shape=(n_inputs_geo,)))
model_geo_tuned.add(Dense(128, activation='relu'))
model_geo_tuned.add(Dense(64, activation='relu'))
model_geo_tuned.add(Dense(1, activation='linear'))

# Compile the model with MAE as the loss function (MAE is also tracked as a metric).
model_geo_tuned.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae'])

# Train the new model
history = model_geo_tuned.fit(
    X_train_processed_geo,
    y_train_geo,
    batch_size=64,
    epochs=20,
    verbose=1,
)

# Evaluate the new model: evaluate() yields the loss followed by the MAE metric.
test_metrics_tuned = model_geo_tuned.evaluate(X_test_processed_geo, y_test_geo, verbose=0)
loss_geo_tuned, mae_geo_tuned = test_metrics_tuned
print(f"Geospatial Regression Model with Tuned Features and Architecture Mean Absolute Error (MAE): {mae_geo_tuned:.4f}")

# Generate predictions for plotting
y_pred_geo_tuned = model_geo_tuned.predict(X_test_processed_geo, verbose=0)


# Create a DataFrame for plotting: test-set coordinates plus the observed and
# predicted target values of the tuned model.
plot_df_tuned = X_test_geo[['Lat', 'Long']].copy()
plot_df_tuned['Actual'] = y_test_geo.values
plot_df_tuned['Predicted'] = y_pred_geo_tuned

# Create a single plot with two subplots for comparison
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
ax_actual, ax_predicted = axes

# Plot actual values (left panel, own colorbar).
scatter_actual = ax_actual.scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Actual'], cmap='viridis', s=10, alpha=0.7)
ax_actual.set(title='Actual PfPR2-10 Values', xlabel='Longitude', ylabel='Latitude')
ax_actual.grid(True)
fig.colorbar(scatter_actual, ax=ax_actual, label='Actual PfPR2-10')

# Plot predicted values (right panel, own colorbar).
scatter_predicted = ax_predicted.scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Predicted'], cmap='viridis', s=10, alpha=0.7)
ax_predicted.set(title='Predicted PfPR2-10 Values', xlabel='Longitude', ylabel='Latitude')
ax_predicted.grid(True)
fig.colorbar(scatter_predicted, ax=ax_predicted, label='Predicted PfPR2-10')

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Read the survey table and discard the columns that are not used as model inputs
unused_columns = ['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE']
df = pd.read_csv('AfricaDataset.csv.csv').drop(columns=unused_columns)

# Engineered feature: absolute latitude, i.e. distance to the equator in degrees
df['dist_to_equator'] = df['Lat'].abs()

# Column roles used by the preprocessing pipeline
numerical_features_geo = ['Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf']
categorical_features_geo = ['COUNTRY', 'METHOD']

# Model inputs and regression target
target = 'PfPR2-10'
features = ['COUNTRY', 'Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD']
X_geo, y_geo = df[features], df[target]

# Standardise the numeric columns and one-hot encode the categorical ones;
# categories unseen during fitting are encoded as all zeros.
geo_transformers = [
    ('num', StandardScaler(), numerical_features_geo),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo),
]
preprocessor_geo = ColumnTransformer(transformers=geo_transformers)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
geo_split = train_test_split(X_geo, y_geo, test_size=0.2, random_state=42)
X_train_geo, X_test_geo, y_train_geo, y_test_geo = geo_split

# Fit the preprocessing on the training rows only, then reuse it on the test rows
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so that weight initialisation and batch shuffling repeat across runs
# and the reported metrics are comparable between executions.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Build a new feedforward neural network with increased capacity
model_geo_tuned = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_processed_geo.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # single unbounded output for regression
])

# Compile the model with MAE as the loss function
model_geo_tuned.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])

# Train the new model
history = model_geo_tuned.fit(X_train_processed_geo, y_train_geo, epochs=20, batch_size=64, verbose=1)

# Evaluate the new model on the held-out test set
loss_geo_tuned, mae_geo_tuned = model_geo_tuned.evaluate(X_test_processed_geo, y_test_geo, verbose=0)
print(f"Geospatial Regression Model with Tuned Features and Architecture Mean Absolute Error (MAE): {mae_geo_tuned:.4f}")

# Generate predictions for plotting
y_pred_geo_tuned = model_geo_tuned.predict(X_test_processed_geo, verbose=0)

# Calculate and print the metrics with scikit-learn on the same test predictions
mae = mean_absolute_error(y_test_geo, y_pred_geo_tuned)
r2 = r2_score(y_test_geo, y_pred_geo_tuned)
rmse = np.sqrt(mean_squared_error(y_test_geo, y_pred_geo_tuned))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Create a DataFrame for plotting: test-set coordinates plus observed and predicted targets
plot_df_tuned = X_test_geo[['Lat', 'Long']].copy()
plot_df_tuned['Actual'] = y_test_geo.values
# Flatten so a column-vector prediction array becomes a plain 1-D column
plot_df_tuned['Predicted'] = y_pred_geo_tuned.flatten()

# Use one colour range for both panels; with independent autoscaling the same
# colour would stand for different values in the left and right maps.
color_min = plot_df_tuned[['Actual', 'Predicted']].min().min()
color_max = plot_df_tuned[['Actual', 'Predicted']].max().max()

# Create a single plot with two subplots for comparison
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

# Plot actual values
scatter_actual = axes[0].scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Actual'], cmap='viridis', vmin=color_min, vmax=color_max, s=10, alpha=0.7)
axes[0].set_title('Actual PfPR2-10 Values')
axes[0].set_xlabel('Longitude')
axes[0].set_ylabel('Latitude')
axes[0].grid(True)
fig.colorbar(scatter_actual, ax=axes[0], label='Actual PfPR2-10')

# Plot predicted values (same colour limits as the actual-value panel)
scatter_predicted = axes[1].scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Predicted'], cmap='viridis', vmin=color_min, vmax=color_max, s=10, alpha=0.7)
axes[1].set_title('Predicted PfPR2-10 Values')
axes[1].set_xlabel('Longitude')
axes[1].set_ylabel('Latitude')
axes[1].grid(True)
fig.colorbar(scatter_predicted, ax=axes[1], label='Predicted PfPR2-10')

plt.tight_layout()
plt.show()

In [None]:
# Create a DataFrame for plotting errors (signed residual: actual minus predicted)
plot_df_errors = X_test_geo[['Lat', 'Long']].copy()
plot_df_errors['Error'] = y_test_geo.values - y_pred_geo_tuned.flatten()

# Centre the diverging colormap on zero: with symmetric limits the neutral
# midpoint of 'coolwarm' corresponds to zero error. Without this the midpoint
# sits halfway between the smallest and largest error, and the colours no
# longer separate over- from under-prediction.
max_abs_error = plot_df_errors['Error'].abs().max()

# Create the error plot
plt.figure(figsize=(12, 10))
plt.scatter(x=plot_df_errors['Long'], y=plot_df_errors['Lat'], c=plot_df_errors['Error'], cmap='coolwarm', vmin=-max_abs_error, vmax=max_abs_error, s=10, alpha=0.7)
plt.colorbar(label='Prediction Error (Actual - Predicted)')
plt.title('Prediction Error on a Map')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
plt.show()

# Interpretation of the Error Plot (the colour scale is symmetric around zero):
# - Blue points indicate areas where the model is over-predicting (Predicted > Actual).
# - Red points indicate areas where the model is under-predicting (Predicted < Actual).
# - White/light points indicate areas where the error is close to zero.
# Look for clusters of red or blue points to identify regions where the model has a systematic bias.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import MiniBatchKMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Load and preprocess the data
df = pd.read_csv('AfricaDataset.csv.csv')
df = df.drop(columns=['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE'])
df['dist_to_equator'] = np.abs(df['Lat'])

# Fix the train/test partition up front (by row position) so that every fitted
# preprocessing step, including the clustering below, only learns from training
# rows. Splitting positions with the same test_size and random_state selects
# the same rows, in the same order, as splitting X_geo/y_geo directly.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# --- New Step: Create a new feature using K-Means clustering ---

# Select Lat and Long for clustering
geo_data = df[['Lat', 'Long']].values

# Use MiniBatchKMeans for faster computation, choosing 5 clusters as a starting point.
# The centroids are fitted on the training rows only; the test rows are then
# assigned to the nearest learned centroid, so they do not influence the clusters.
kmeans = MiniBatchKMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(geo_data[train_rows])
df['geo_cluster'] = kmeans.predict(geo_data)

# Features and target (now including the new 'geo_cluster' feature)
features = ['COUNTRY', 'Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD', 'geo_cluster']
target = 'PfPR2-10'
X_geo = df[features]
y_geo = df[target]

# 'geo_cluster' is a label, not a quantity, so it is one-hot encoded with the other categoricals
categorical_features_geo = ['COUNTRY', 'METHOD', 'geo_cluster']
numerical_features_geo = ['Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf']

preprocessor_geo = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_geo),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
    ])

# Apply the partition chosen above, then fit the preprocessing on the training rows only
X_train_geo, X_test_geo = X_geo.iloc[train_rows], X_geo.iloc[test_rows]
y_train_geo, y_test_geo = y_geo.iloc[train_rows], y_geo.iloc[test_rows]
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so that a change in MAE reflects the new feature rather than a
# different random weight initialisation or batch order.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Re-run the tuned model with the new feature
model_geo_tuned = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_processed_geo.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # single unbounded output for regression
])
model_geo_tuned.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])
model_geo_tuned.fit(X_train_processed_geo, y_train_geo, epochs=20, batch_size=64, verbose=0)

# Score on the held-out test set; evaluate returns the loss followed by the compiled metric
loss_geo_tuned, mae_geo_tuned = model_geo_tuned.evaluate(X_test_processed_geo, y_test_geo, verbose=0)

print(f"Geospatial Regression Model with K-Means Clusters MAE: {mae_geo_tuned:.4f}")

In [None]:
# Generate predictions for plotting
y_pred_geo_tuned = model_geo_tuned.predict(X_test_processed_geo, verbose=0)
# Create a DataFrame for plotting: test-set coordinates plus observed and predicted targets
plot_df_tuned = X_test_geo[['Lat', 'Long']].copy()
plot_df_tuned['Actual'] = y_test_geo.values
# Flatten so a column-vector prediction array becomes a plain 1-D column
plot_df_tuned['Predicted'] = y_pred_geo_tuned.flatten()

# Use one colour range for both panels; with independent autoscaling the same
# colour would stand for different values in the left and right maps.
color_min = plot_df_tuned[['Actual', 'Predicted']].min().min()
color_max = plot_df_tuned[['Actual', 'Predicted']].max().max()

# Create a single plot with two subplots for comparison
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

# Plot actual values
scatter_actual = axes[0].scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Actual'], cmap='viridis', vmin=color_min, vmax=color_max, s=10, alpha=0.7)
axes[0].set_title('Actual PfPR2-10 Values')
axes[0].set_xlabel('Longitude')
axes[0].set_ylabel('Latitude')
axes[0].grid(True)
fig.colorbar(scatter_actual, ax=axes[0], label='Actual PfPR2-10')

# Plot predicted values (same colour limits as the actual-value panel)
scatter_predicted = axes[1].scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Predicted'], cmap='viridis', vmin=color_min, vmax=color_max, s=10, alpha=0.7)
axes[1].set_title('Predicted PfPR2-10 Values')
axes[1].set_xlabel('Longitude')
axes[1].set_ylabel('Latitude')
axes[1].grid(True)
fig.colorbar(scatter_predicted, ax=axes[1], label='Predicted PfPR2-10')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import MiniBatchKMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
import shap

# --- Load, preprocess, and train the final model ---

# Load and preprocess the data
df = pd.read_csv('AfricaDataset.csv.csv')
df = df.drop(columns=['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE'])
df['dist_to_equator'] = np.abs(df['Lat'])

# Fix the train/test partition up front (by row position) so that every fitted
# preprocessing step, including the clustering below, only learns from training
# rows. Splitting positions with the same test_size and random_state selects
# the same rows, in the same order, as splitting X_geo/y_geo directly.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Create a new feature using K-Means clustering.
# The centroids are fitted on the training rows only; the test rows are then
# assigned to the nearest learned centroid, so they do not influence the clusters.
geo_data = df[['Lat', 'Long']].values
kmeans = MiniBatchKMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(geo_data[train_rows])
df['geo_cluster'] = kmeans.predict(geo_data)

features = ['COUNTRY', 'Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD', 'geo_cluster']
target = 'PfPR2-10'
X_geo = df[features]
y_geo = df[target]

# 'geo_cluster' is a label, not a quantity, so it is one-hot encoded with the other categoricals
categorical_features_geo = ['COUNTRY', 'METHOD', 'geo_cluster']
numerical_features_geo = ['Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf']

preprocessor_geo = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_geo),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
    ])

# Apply the partition chosen above, then fit the preprocessing on the training rows only
X_train_geo, X_test_geo = X_geo.iloc[train_rows], X_geo.iloc[test_rows]
y_train_geo, y_test_geo = y_geo.iloc[train_rows], y_geo.iloc[test_rows]
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so that weight initialisation and batch shuffling repeat across runs
# and the error map below is reproducible.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Feedforward regressor: three ReLU hidden layers and a single linear output
model_geo_tuned = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_processed_geo.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')
])
model_geo_tuned.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])
model_geo_tuned.fit(X_train_processed_geo, y_train_geo, epochs=20, batch_size=64, verbose=0)

# Test-set predictions used by the error visualisation
y_pred_geo_tuned = model_geo_tuned.predict(X_test_processed_geo, verbose=0)

# --- 1. Visualize the New Model's Errors ---

print("### Visualizing the New Model's Errors\n")

# Signed residual per test point: actual minus predicted
plot_df_errors = X_test_geo[['Lat', 'Long']].copy()
plot_df_errors['Error'] = y_test_geo.values - y_pred_geo_tuned.flatten()

# Centre the diverging colormap on zero: with symmetric limits the neutral
# midpoint of 'coolwarm' corresponds to zero error, blue to over-prediction
# (negative error) and red to under-prediction (positive error).
max_abs_error = plot_df_errors['Error'].abs().max()

plt.figure(figsize=(12, 10))
plt.scatter(x=plot_df_errors['Long'], y=plot_df_errors['Lat'], c=plot_df_errors['Error'], cmap='coolwarm', vmin=-max_abs_error, vmax=max_abs_error, s=10, alpha=0.7)
plt.colorbar(label='Prediction Error (Actual - Predicted)')
plt.title('Prediction Error on a Map with K-Means Clusters')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
plt.show()



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import MiniBatchKMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Load and preprocess the data
df = pd.read_csv('AfricaDataset.csv.csv')
df = df.drop(columns=['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'ID', 'AFR ADMIN2 Code', 'AFR Admin name', 'AREA_TYPE'])
df['dist_to_equator'] = np.abs(df['Lat'])

# Fix the train/test partition up front (by row position) so that every fitted
# preprocessing step, including the clustering below, only learns from training
# rows. Splitting positions with the same test_size and random_state selects
# the same rows, in the same order, as splitting X_geo/y_geo directly.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# --- New Step: Create a new feature using K-Means clustering ---

# Select Lat and Long for clustering
geo_data = df[['Lat', 'Long']].values

# Use MiniBatchKMeans for faster computation, choosing 5 clusters as a starting point.
# The centroids are fitted on the training rows only; the test rows are then
# assigned to the nearest learned centroid, so they do not influence the clusters.
kmeans = MiniBatchKMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(geo_data[train_rows])
df['geo_cluster'] = kmeans.predict(geo_data)

# Features and target (now including the new 'geo_cluster' feature)
features = ['COUNTRY', 'Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'METHOD', 'geo_cluster']
target = 'PfPR2-10'
X_geo = df[features]
y_geo = df[target]

# 'geo_cluster' is a label, not a quantity, so it is one-hot encoded with the other categoricals
categorical_features_geo = ['COUNTRY', 'METHOD', 'geo_cluster']
numerical_features_geo = ['Lat', 'Long', 'dist_to_equator', 'MM', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf']

preprocessor_geo = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_geo),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_geo)
    ])

# Apply the partition chosen above, then fit the preprocessing on the training rows only
X_train_geo, X_test_geo = X_geo.iloc[train_rows], X_geo.iloc[test_rows]
y_train_geo, y_test_geo = y_geo.iloc[train_rows], y_geo.iloc[test_rows]
X_train_processed_geo = preprocessor_geo.fit_transform(X_train_geo)
X_test_processed_geo = preprocessor_geo.transform(X_test_geo)

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so that a change in the metrics reflects the new feature rather than a
# different random weight initialisation or batch order.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Re-run the tuned model with the new feature
model_geo_tuned = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_processed_geo.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # single unbounded output for regression
])
model_geo_tuned.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])
model_geo_tuned.fit(X_train_processed_geo, y_train_geo, epochs=20, batch_size=64, verbose=0)

# Score on the held-out test set; evaluate returns the loss followed by the compiled metric
loss_geo_tuned, mae_geo_tuned = model_geo_tuned.evaluate(X_test_processed_geo, y_test_geo, verbose=0)

print(f"Geospatial Regression Model with K-Means Clusters MAE: {mae_geo_tuned:.4f}")

# Generate predictions for plotting
y_pred_geo_tuned = model_geo_tuned.predict(X_test_processed_geo, verbose=0)

# Compute the regression metrics with scikit-learn on the same test predictions
mae = mean_absolute_error(y_test_geo, y_pred_geo_tuned)
r2 = r2_score(y_test_geo, y_pred_geo_tuned)
rmse = np.sqrt(mean_squared_error(y_test_geo, y_pred_geo_tuned))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Create a DataFrame for plotting: test-set coordinates plus observed and predicted targets
plot_df_tuned = X_test_geo[['Lat', 'Long']].copy()
plot_df_tuned['Actual'] = y_test_geo.values
# Flatten so a column-vector prediction array becomes a plain 1-D column
plot_df_tuned['Predicted'] = y_pred_geo_tuned.flatten()

# Use one colour range for both panels; with independent autoscaling the same
# colour would stand for different values in the left and right maps.
color_min = plot_df_tuned[['Actual', 'Predicted']].min().min()
color_max = plot_df_tuned[['Actual', 'Predicted']].max().max()

# Create a single plot with two subplots for comparison
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

# Plot actual values
scatter_actual = axes[0].scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Actual'], cmap='viridis', vmin=color_min, vmax=color_max, s=10, alpha=0.7)
axes[0].set_title('Actual PfPR2-10 Values')
axes[0].set_xlabel('Longitude')
axes[0].set_ylabel('Latitude')
axes[0].grid(True)
fig.colorbar(scatter_actual, ax=axes[0], label='Actual PfPR2-10')

# Plot predicted values (same colour limits as the actual-value panel)
scatter_predicted = axes[1].scatter(x=plot_df_tuned['Long'], y=plot_df_tuned['Lat'], c=plot_df_tuned['Predicted'], cmap='viridis', vmin=color_min, vmax=color_max, s=10, alpha=0.7)
axes[1].set_title('Predicted PfPR2-10 Values')
axes[1].set_xlabel('Longitude')
axes[1].set_ylabel('Latitude')
axes[1].grid(True)
fig.colorbar(scatter_predicted, ax=axes[1], label='Predicted PfPR2-10')

plt.tight_layout()
plt.show()