In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from datetime import datetime

# --- 1. Data Loading and Initial Preparation ---
def load_and_prepare_data(file_path='/content/AfricaDataset.csv.csv'):
    """
    Load the survey CSV, build a month-level ``Date`` column and derive time features.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The prepared DataFrame with ``Date``, ``Year``, ``Month`` and
        ``DayOfYear`` columns added (rows whose year/month cannot be turned
        into a date are removed), or ``None`` when the file does not exist.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found. Please ensure it's in the same directory.")
        return None

    # Drop fully null columns
    df = df.drop(columns=['Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17'], errors='ignore')

    # Build the date from integer year/month parts. Going through numbers first
    # matters: a single missing value makes pandas read the column as float, and
    # text such as "2005.0-7.0-01" is not a parseable date.
    year = pd.to_numeric(df['YY'], errors='coerce')
    month = pd.to_numeric(df['MM'], errors='coerce')
    whole = year.notna() & month.notna() & (year % 1 == 0) & (month % 1 == 0)
    date_text = (
        year[whole].astype('int64').astype(str).str.zfill(4)
        + '-'
        + month[whole].astype('int64').astype(str).str.zfill(2)
        + '-01'
    )
    # Impossible dates (e.g. MM=0, YY out of range) become NaT; rows missing from
    # `date_text` are filled with NaT by index alignment.
    df['Date'] = pd.to_datetime(date_text, format='%Y-%m-%d', errors='coerce')

    # Drop rows where Date could not be parsed
    df.dropna(subset=['Date'], inplace=True)

    # Feature Engineering for Prediction
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    # Date is always the 1st of the month, so this is the day-of-year of that first day.
    df['DayOfYear'] = df['Date'].dt.dayofyear

    print("Data loaded and prepared. First 5 rows:")
    print(df.head())
    print("\nData Info:")
    # DataFrame.info() writes its report itself and returns None, so it is not wrapped in print().
    df.info()
    return df

# --- 2. Predictive Model for Malaria Prevalence (PfPR2-10) ---
def train_and_predict_malaria(df):
    """
    Train a RandomForestRegressor on location and time features to predict PfPR2-10.

    Prints hold-out MAE and R-squared, then prints one illustrative prediction
    for a hypothetical location in July 2025.

    Args:
        df: DataFrame containing ``Lat``, ``Long``, ``Year``, ``Month``,
            ``DayOfYear`` and ``PfPR2-10`` columns.

    Returns:
        ``(model, features)`` — the fitted regressor and the list of feature
        column names — or ``(None, None)`` when fewer than two complete rows
        are available.
    """
    print("\n--- Training Predictive Model ---")

    # Select features for prediction
    features = ['Lat', 'Long', 'Year', 'Month', 'DayOfYear']
    target = 'PfPR2-10'

    # Filter out rows with NaN in features or target
    df_model = df.dropna(subset=features + [target])

    # An 80/20 split needs at least one row on each side.
    if len(df_model) < 2:
        print("Not enough data to train the model after dropping NaNs.")
        return None, None

    X = df_model[features]
    y = df_model[target]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Model Training Complete.")
    print(f"Mean Absolute Error on Test Set: {mae:.2f}")
    print(f"R-squared Score on Test Set: {r2:.2f}")

    # Example of making a future prediction for a hypothetical point in July 2025.
    # This is a simplification; a real forecast would need location-specific time series.
    example_lat = -8.0
    example_long = 13.0
    future_year = 2025
    future_month = 7
    # Use the 1st of the month so DayOfYear is encoded the same way as in the
    # training rows, whose dates are always the first day of the month.
    future_date = pd.Timestamp(future_year, future_month, 1)
    future_dayofyear = future_date.dayofyear

    future_data = pd.DataFrame([[example_lat, example_long, future_year, future_month, future_dayofyear]],
                               columns=features)
    future_prediction = model.predict(future_data)[0]
    print(f"\nExample Future Prediction (Hypothetical Point at Lat {example_lat}, Long {example_long}, July {future_year}):")
    print(f"Predicted PfPR2-10: {future_prediction:.2f}%")

    return model, features

# --- 3. Basic Question Answering from Dataset ---
def answer_basic_questions(df):
    """
    Print answers to four example questions computed directly from the DataFrame.

    Args:
        df: DataFrame with ``COUNTRY``, ``PfPR2-10``, ``METHOD``,
            ``AFR Admin name``, ``Lat``, ``Long``, ``Date`` and ``Year`` columns.

    Returns:
        None. All results are written to standard output.
    """
    print("\n--- Answering Basic Questions from Dataset ---")

    # Q1: What is the average PfPR2-10 in a specific country (e.g., Angola)?
    country_name = 'Angola'
    angola_data = df[df['COUNTRY'] == country_name]
    if not angola_data.empty:
        avg_pfpr = angola_data['PfPR2-10'].mean()
        print(f"1. Average PfPR2-10 in {country_name}: {avg_pfpr:.2f}%")
    else:
        print(f"1. No data found for {country_name}.")

    # Q2: Which diagnostic methods are present in the dataset and their counts?
    method_counts = df['METHOD'].value_counts()
    print("\n2. Diagnostic Methods and their Counts:")
    print(method_counts)

    # Q3: What is the highest PfPR2-10 recorded and where/when?
    print("\n3. Highest PfPR2-10 Recorded:")
    pfpr = df['PfPR2-10']
    if pfpr.notna().any():
        max_pfpr_row = df.loc[pfpr.idxmax()]
        recorded_on = max_pfpr_row['Date']
        # NaT has no usable strftime, so fall back to a plain label.
        date_text = recorded_on.strftime('%Y-%m') if pd.notna(recorded_on) else 'unknown'
        print(f"   Value: {max_pfpr_row['PfPR2-10']:.2f}%")
        print(f"   Country: {max_pfpr_row['COUNTRY']}")
        print(f"   Location: {max_pfpr_row['AFR Admin name']} (Lat: {max_pfpr_row['Lat']:.2f}, Long: {max_pfpr_row['Long']:.2f})")
        print(f"   Date: {date_text}")
    else:
        # idxmax() cannot pick a row when there is no non-missing value.
        print("   No PfPR2-10 values available.")

    # Q4: How has the average PfPR2-10 changed over time (yearly)?
    yearly_avg_pfpr = df.groupby('Year')['PfPR2-10'].mean().sort_index()
    print("\n4. Average PfPR2-10 over the years:")
    print(yearly_avg_pfpr.to_string())


# --- Main Execution ---
if __name__ == "__main__":
    # Run the full pipeline: load, train/predict, answer questions, then print the guide.
    df = load_and_prepare_data()

    if df is not None:
        model, features = train_and_predict_malaria(df)
        answer_basic_questions(df)

        # Guide text, emitted one entry per print call in this exact order.
        guide_lines = (
            "\n--- Next Steps for Building the LLM RAG Chatbot in PyCharm ---",
            "To create a full LLM RAG chatbot, you will combine the above functionalities with an LLM.",
            "\n**Conceptual Architecture:**",
            "1.  **User Query:** The user asks a question (e.g., 'What's the predicted malaria prevalence in Angola next year?' or 'What are the diagnostic methods?').",
            "2.  **Intent Recognition (Optional but Recommended):** An initial classifier or a small LLM can determine if the query is for prediction or factual Q&A.",
            "3.  **Retrieval System (RAG Component):**",
            "    * **For Factual Q&A:** If it's a factual question, the system queries the pandas DataFrame (as shown in `answer_basic_questions`) or a more sophisticated knowledge base (e.g., a vector store of external malaria facts).",
            "    * **For Prediction:** If it's a prediction query, extract parameters (e.g., location, date) and feed them to the trained `model` (from `train_and_predict_malaria`).",
            "4.  **Context Construction:** The retrieved data (e.g., DataFrame query results, prediction output, or retrieved facts from a vector store) is formatted into a context string.",
            "5.  **LLM Prompt Engineering:** A prompt is crafted for the LLM, incorporating the user's original query and the retrieved context.",
            "    Example Prompt: 'Based on the following data: [RETRIEVED_CONTEXT], answer the question: [USER_QUERY]'",
            "6.  **LLM Generation:** The LLM generates a natural language answer based on the prompt.",
            "7.  **Chatbot Interface:** Present the LLM's answer to the user.",
            "\n**Key Libraries for PyCharm Implementation:**",
            "   * **Data Handling:** `pandas`, `numpy` (already used)",
            "   * **Machine Learning:** `scikit-learn` (for prediction, already used)",
            "   * **LLM Integration:** `langchain`, `llama_index` (these frameworks help build RAG pipelines). You'll also need a client library for your chosen LLM (e.g., `openai`, `google-generativeai`, `huggingface_hub`).",
            "   * **Vector Databases (for external knowledge):** `chromadb`, `faiss-cpu`, `pinecone-client` (if you augment with external malaria documents).",
            "   * **Chatbot Interface:** `streamlit`, `gradio`, or a simple `input()` loop for console-based.",
            "\n**PyCharm Setup Advice:**",
            "1.  **Create a New Project:** Open PyCharm, select 'New Project'. Choose 'Pure Python' or 'Poetry Environment' for dependency management.",
            "2.  **Install Libraries:** Open PyCharm's Terminal (View -> Tool Windows -> Terminal) and install necessary packages:",
            "    `pip install pandas scikit-learn`",
            "    `pip install langchain openai # or google-generativeai, huggingface_hub, etc.`",
            "    `pip install streamlit # or gradio for UI`",
            "3.  **Place Dataset:** Ensure `AfricaDataset.csv` is in your project's root directory or provide the full path.",
            "4.  **Copy this Code:** Paste the provided Python code into a `.py` file (e.g., `malaria_chatbot.py`).",
            "5.  **API Keys:** For LLMs, you'll need to set up environment variables for your API keys (e.g., `OPENAI_API_KEY`). PyCharm allows you to configure environment variables for run configurations.",
            "6.  **Develop Iteratively:** Start with the data loading, then the prediction, then basic Q&A. Once these parts work, begin integrating `langchain` or `llama_index` to connect them with an LLM.",
        )
        for guide_line in guide_lines:
            print(guide_line)

# Statistical methods

In [None]:
import pandas as pd

# List every column label of the working DataFrame
print(list(df.columns))


In [None]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Binomial GLM: model malaria positives as a proportion of those examined.
# The response is Pf / Ex and each survey is weighted by its number examined.
# A log(Ex) offset belongs to Poisson rate models (log link); under the
# Binomial family's logit link it does not express "positives out of examined".
glm_columns = ['Pf', 'Ex', 'Lat', 'Long', 'AREA_TYPE', 'Year']
glm_data = df.dropna(subset=glm_columns)
# A proportion only exists for surveys that examined someone and report
# between 0 and Ex positives.
glm_data = glm_data[
    (glm_data['Ex'] > 0) & (glm_data['Pf'] >= 0) & (glm_data['Pf'] <= glm_data['Ex'])
].copy()
glm_data['prop_positive'] = glm_data['Pf'] / glm_data['Ex']

model = smf.glm(
    formula="prop_positive ~ Lat + Long + C(AREA_TYPE) + Year",
    data=glm_data,
    family=sm.families.Binomial(),
    var_weights=glm_data['Ex'],  # Ex = Examined count (number of binomial trials)
)
result = model.fit()
print(result.summary())


In [None]:
# Use logit model for proportion (make sure values are in (0,1), not 0 or 1 exactly)
if 'prop_positive' not in df.columns:
    # Derive the proportion positive here when no earlier step has created it;
    # it is only defined where at least one person was examined.
    df = df[df['Ex'] > 0].copy()
    df['prop_positive'] = df['Pf'] / df['Ex']
df = df[(df['prop_positive'] > 0) & (df['prop_positive'] < 1)]

# Fit on complete rows so the per-survey weights line up with the model rows.
logit_columns = ['prop_positive', 'Ex', 'Lat', 'Long', 'AREA_TYPE', 'Year']
logit_data = df.dropna(subset=logit_columns)

model = smf.glm(
    formula="prop_positive ~ Lat + Long + C(AREA_TYPE) + Year",
    data=logit_data,
    family=sm.families.Binomial(),
    # Weight each proportion by its number of trials; otherwise a survey of
    # 10 people counts as much as one of 1,000.
    var_weights=logit_data['Ex'],
)
result = model.fit()
print(result.summary())


In [None]:
import statsmodels.api as sm
from patsy import dmatrix

# Nonlinear spline for Year: build a degree-3 B-spline basis with df=5 from the
# Year column and return it as a DataFrame.
spline = dmatrix("bs(Year, df=5, degree=3)", {"Year": df['Year']}, return_type='dataframe')
# Attach the basis columns to df (joined on the index).
# NOTE(review): re-running this cell presumably fails because the spline
# columns are then already present in df — confirm.
df = df.join(spline)

# Fit GAM-like GLM with spline: Binomial family on prop_positive, using the
# spline columns plus Lat and Long as predictors.
# NOTE(review): no constant is added here; presumably the dmatrix output already
# carries an intercept column — confirm. Assumes df has a 'prop_positive' column.
model = sm.GLM(df["prop_positive"], df[spline.columns.tolist() + ['Lat', 'Long']], family=sm.families.Binomial())
result = model.fit()
print(result.summary())


In [None]:
from statsmodels.discrete.discrete_model import NegativeBinomial

# Poisson GLM for positive counts with log(number examined) as offset
poisson_formula = "Pf ~ Lat + Long + C(AREA_TYPE) + Year"
log_examined = np.log(df['Ex'])
model = smf.glm(
    poisson_formula,
    data=df,
    family=sm.families.Poisson(),
    offset=log_examined,
)
result = model.fit()
print(result.summary())

In [None]:
import matplotlib.pyplot as plt

# Use the index-labelled fitted values so rows the fit dropped simply get NaN;
# assigning the bare array from result.predict() fails when lengths differ.
df['predicted'] = result.fittedvalues
plt.scatter(df['Year'], df['predicted'], alpha=0.3)
plt.xlabel("Year")
# Fitted values are on the response scale (the model mean), not the logit scale.
plt.ylabel("Predicted value (response scale)")
plt.title("Predicted Malaria Prevalence Over Time")
plt.show()


In [None]:
# Keep the modelling columns and discard rows with any missing value among them
model_columns = ['COUNTRY', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf']
df_model = df.loc[:, model_columns].dropna()

# Store COUNTRY as a categorical column
df_model = df_model.assign(COUNTRY=df_model['COUNTRY'].astype('category'))

In [None]:
# Expand COUNTRY into indicator columns, dropping the first level as the baseline
df_encoded = pd.get_dummies(
    data=df_model,
    columns=['COUNTRY'],
    drop_first=True,
)


In [None]:
# Target is the positive count; every remaining column is a predictor
y = df_encoded['Pf']
X = df_encoded.drop('Pf', axis=1)


In [None]:
from sklearn.model_selection import train_test_split

# Reproducible 80/20 hold-out split
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
import statsmodels.api as sm

# The one-hot COUNTRY columns can be boolean; mixed bool/int frames reach
# statsmodels as object arrays, so cast the predictors to float first.
# has_constant='add' guarantees both design matrices get the same 'const' column.
X_train_design = sm.add_constant(X_train.astype(float), has_constant='add')
X_test_design = sm.add_constant(X_test.astype(float), has_constant='add')

poisson_model = sm.GLM(y_train, X_train_design, family=sm.families.Poisson())
poisson_result = poisson_model.fit()

# Predict and evaluate
y_pred_poisson = poisson_result.predict(X_test_design)


In [None]:
# Show the predictor dtypes, then push every feature column and both targets
# through pd.to_numeric
print(X_train.dtypes)
X_train, X_test = (frame.apply(pd.to_numeric) for frame in (X_train, X_test))
y_train, y_test = (pd.to_numeric(series) for series in (y_train, y_test))


In [None]:
# Cast every predictor column to float, so boolean columns become 0.0 / 1.0
# (the result is float, not integer).
X_train = X_train.astype(float)
X_test = X_test.astype(float)


In [None]:
import statsmodels.api as sm

import numpy as np
from sklearn.metrics import mean_squared_error

# Add constant column
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

# Fit Poisson regression
poisson_model = sm.GLM(y_train, X_train_const, family=sm.families.Poisson())
poisson_result = poisson_model.fit()

# Predict
y_pred_poisson = poisson_result.predict(X_test_const)

# Evaluate: mean_squared_error returns the MSE, so take the square root to
# report the RMSE that the label promises.
print("Poisson RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_poisson)))


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# ---------------------
# 1. Load & Prepare Data
# ---------------------
df = pd.read_csv("/content/AfricaDataset.csv.csv")

# Restrict to the modelling columns and discard incomplete rows
model_columns = ['COUNTRY', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf']
df_model = df.loc[:, model_columns].dropna()

# Keep surveys that examined someone and report no more positives than examined
is_valid_survey = (df_model['Ex'] > 0) & (df_model['Pf'] <= df_model['Ex'])
df_model = df_model[is_valid_survey]

# Dummy-code COUNTRY (first level dropped) after forcing it to plain strings
df_model = df_model.assign(COUNTRY=df_model['COUNTRY'].astype(str))
df_encoded = pd.get_dummies(df_model, columns=['COUNTRY'], drop_first=True)

# Count targets, and predictors made of every column except Pf
y_success = df_encoded['Pf']
y_trials = df_encoded['Ex']
y_fail = y_trials - y_success
X = df_encoded.drop(columns=['Pf'])

# Reproducible 80/20 split shared by predictors, positives and trials
(X_train, X_test,
 y_success_train, y_success_test,
 y_trials_train, y_trials_test) = train_test_split(
    X, y_success, y_trials, test_size=0.2, random_state=42
)

# Float design matrices with an intercept column
X_train_const = sm.add_constant(X_train.astype(float))
X_test_const = sm.add_constant(X_test.astype(float))

# ---------------------
# 2. Fit Models
# ---------------------

# --- Poisson Regression on the positive counts ---
poisson_family = sm.families.Poisson()
poisson_model = sm.GLM(y_success_train, X_train_const, family=poisson_family)
poisson_result = poisson_model.fit()
y_pred_poisson = poisson_result.predict(X_test_const)

# --- Binomial Regression on (successes, failures) pairs ---
y_fail_train = y_trials_train - y_success_train
y_binom_train = np.column_stack((y_success_train, y_fail_train))

binom_family = sm.families.Binomial()
binom_model = sm.GLM(y_binom_train, X_train_const, family=binom_family)
binom_result = binom_model.fit()
# Predicted probability times trials gives the expected positive count
test_probabilities = binom_result.predict(X_test_const)
y_pred_binom = test_probabilities * y_trials_test

# --- Linear Regression baseline ---
lr = LinearRegression()
lr.fit(X_train, y_success_train)
y_pred_lr = lr.predict(X_test)

# ---------------------
# 3. Evaluate Models
# ---------------------
def evaluate_model(name, y_true, y_pred, result=None):
    """Print RMSE and MAE for one model, plus AIC/BIC when a fitted result is given."""
    report = [
        f"\n {name} Model:",
        f"  RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}",
        f"  MAE:  {mean_absolute_error(y_true, y_pred):.4f}",
    ]
    if result is not None:
        report.append(f"  AIC:  {result.aic:.2f}")
        report.append(f"  BIC:  {result.bic:.2f}")
    print("\n".join(report))

# Report metrics for the three models (the linear model has no fitted GLM result)
for model_label, model_prediction, fitted_result in (
    ("Poisson", y_pred_poisson, poisson_result),
    ("Binomial", y_pred_binom, binom_result),
    ("Linear Regression", y_pred_lr, None),
):
    evaluate_model(model_label, y_success_test, model_prediction, fitted_result)

# ---------------------
# 4. Plot Actual vs Predicted
# ---------------------
plt.figure(figsize=(12, 4))

# One panel per model; None keeps matplotlib's default marker colour
panels = (
    ("Poisson Regression", y_pred_poisson, None),
    ("Binomial Regression", y_pred_binom, 'green'),
    ("Linear Regression", y_pred_lr, 'red'),
)
for panel_position, (panel_title, panel_prediction, point_color) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_position)
    plt.scatter(y_success_test, panel_prediction, alpha=0.3, color=point_color)
    plt.title(panel_title)
    plt.xlabel("Actual Pf")
    plt.ylabel("Predicted Pf")

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt


# Keep relevant columns; drop incomplete rows and surveys where the counts are
# inconsistent (nobody examined, or more positives than examined)
df_model = df[['Lat', 'Long', 'Ex', 'Pf']].dropna()
df_model = df_model[df_model['Ex'] > 0]
df_model = df_model[df_model['Pf'] <= df_model['Ex']]

# Create features and target
X = df_model[['Lat', 'Long']].astype(float)
y_success = df_model['Pf'].astype(float)
y_trials = df_model['Ex'].astype(float)

# Train-test split
X_train, X_test, y_success_train, y_success_test, y_trials_train, y_trials_test = train_test_split(
    X, y_success, y_trials, test_size=0.2, random_state=42
)

# Add constant term
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

# ---------------------
# 2. Fit Models
# ---------------------

# --- Poisson Regression ---
# The positive count scales with how many people were examined, so the number
# examined enters as exposure (log(Ex) is added to the linear predictor).
# Without it the model predicts counts from location alone.
poisson_model = sm.GLM(
    y_success_train, X_train_const, family=sm.families.Poisson(), exposure=y_trials_train
)
poisson_result = poisson_model.fit()
y_pred_poisson = poisson_result.predict(X_test_const, exposure=y_trials_test)

# --- Binomial Regression ---
# Two-column response: (successes, failures)
y_binom_train = np.column_stack((y_success_train, y_trials_train - y_success_train))

binom_model = sm.GLM(y_binom_train, X_train_const, family=sm.families.Binomial())
binom_result = binom_model.fit()
y_pred_binom = binom_result.predict(X_test_const) * y_trials_test  # Convert probability to expected counts

# --- Linear Regression ---
lr = LinearRegression()
lr.fit(X_train, y_success_train)
y_pred_lr = lr.predict(X_test)

# ---------------------
# 3. Evaluate Models
# ---------------------
def evaluate_model(name, y_true, y_pred, result=None):
    """Print RMSE and MAE for one model, plus AIC/BIC when a fitted result is given."""
    report = [
        f"\n {name} Model:",
        f"  RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}",
        f"  MAE:  {mean_absolute_error(y_true, y_pred):.4f}",
    ]
    if result is not None:
        report.append(f"  AIC:  {result.aic:.2f}")
        report.append(f"  BIC:  {result.bic:.2f}")
    print("\n".join(report))

# Report metrics for the three models (the linear model has no fitted GLM result)
for model_label, model_prediction, fitted_result in (
    ("Poisson", y_pred_poisson, poisson_result),
    ("Binomial", y_pred_binom, binom_result),
    ("Linear Regression", y_pred_lr, None),
):
    evaluate_model(model_label, y_success_test, model_prediction, fitted_result)

# ---------------------
# 4. Plot Actual vs Predicted
# ---------------------
plt.figure(figsize=(12, 4))

# One panel per model; None keeps matplotlib's default marker colour
panels = (
    ("Poisson Regression", y_pred_poisson, None),
    ("Binomial Regression", y_pred_binom, 'green'),
    ("Linear Regression", y_pred_lr, 'red'),
)
for panel_position, (panel_title, panel_prediction, point_color) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_position)
    plt.scatter(y_success_test, panel_prediction, alpha=0.3, color=point_color)
    plt.title(panel_title)
    plt.xlabel("Actual Pf")
    plt.ylabel("Predicted Pf")

plt.tight_layout()
plt.show()


In [None]:

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

df = pd.read_csv("/content/AfricaDataset.csv.csv")

# Keep relevant columns; drop incomplete rows and surveys where the counts are
# inconsistent (nobody examined, or more positives than examined)
df_model = df[['Lat', 'Long', 'Ex', 'Pf']].dropna()
df_model = df_model[df_model['Ex'] > 0]
df_model = df_model[df_model['Pf'] <= df_model['Ex']]

# Create features and target
X = df_model[['Lat', 'Long']].astype(float)
y_success = df_model['Pf'].astype(float)
y_trials = df_model['Ex'].astype(float)

# Train-test split
X_train, X_test, y_success_train, y_success_test, y_trials_train, y_trials_test = train_test_split(
    X, y_success, y_trials, test_size=0.2, random_state=42
)

# Add constant term
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

# ---------------------
# 2. Fit Models
# ---------------------

# --- Poisson Regression ---
# The positive count scales with how many people were examined, so the number
# examined enters as exposure (log(Ex) is added to the linear predictor).
# Without it the model predicts counts from location alone.
poisson_model = sm.GLM(
    y_success_train, X_train_const, family=sm.families.Poisson(), exposure=y_trials_train
)
poisson_result = poisson_model.fit()
y_pred_poisson = poisson_result.predict(X_test_const, exposure=y_trials_test)

# --- Binomial Regression ---
# Two-column response: (successes, failures)
y_binom_train = np.column_stack((y_success_train, y_trials_train - y_success_train))

binom_model = sm.GLM(y_binom_train, X_train_const, family=sm.families.Binomial())
binom_result = binom_model.fit()
y_pred_binom = binom_result.predict(X_test_const) * y_trials_test  # Convert probability to expected counts

# --- Linear Regression ---
lr = LinearRegression()
lr.fit(X_train, y_success_train)
y_pred_lr = lr.predict(X_test)

# ---------------------
# 3. Evaluate Models
# ---------------------
def evaluate_model(name, y_true, y_pred, result=None):
    """
    Print RMSE, MAE and R2 for one model, plus AIC/BIC when a fitted result is given.

    The metrics are computed with numpy so the function does not depend on an
    ``r2_score`` import that this cell never performs.

    Args:
        name: Label printed in the report header.
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like, same order as ``y_true``).
        result: Optional fitted result exposing ``aic`` and ``bic``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    errors = actual - predicted

    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))

    # R2 = 1 - SS_res / SS_tot; a constant target has SS_tot == 0, so report
    # 1.0 for a perfect fit and 0.0 otherwise instead of dividing by zero.
    ss_res = np.sum(errors ** 2)
    ss_tot = np.sum((actual - actual.mean()) ** 2)
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        r2 = 1.0 if ss_res == 0 else 0.0

    print(f"\n {name} Model:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f" R2: {r2:.4f}")
    if result is not None:
        print(f"  AIC:  {result.aic:.2f}")
        print(f"  BIC:  {result.bic:.2f}")

# Report metrics for the three models (the linear model has no fitted GLM result)
for model_label, model_prediction, fitted_result in (
    ("Poisson", y_pred_poisson, poisson_result),
    ("Binomial", y_pred_binom, binom_result),
    ("Linear Regression", y_pred_lr, None),
):
    evaluate_model(model_label, y_success_test, model_prediction, fitted_result)

# ---------------------
# 4. Plot Actual vs Predicted
# ---------------------
plt.figure(figsize=(12, 4))

# One panel per model; None keeps matplotlib's default marker colour
panels = (
    ("Poisson Regression", y_pred_poisson, None),
    ("Binomial Regression", y_pred_binom, 'green'),
    ("Linear Regression", y_pred_lr, 'red'),
)
for panel_position, (panel_title, panel_prediction, point_color) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_position)
    plt.scatter(y_success_test, panel_prediction, alpha=0.3, color=point_color)
    plt.title(panel_title)
    plt.xlabel("Actual Pf")
    plt.ylabel("Predicted Pf")

plt.tight_layout()
plt.show()


In [None]:
def evaluate_country_models(country_name, df):
    """
    Compare Poisson, Binomial and linear models of ``Pf`` for a single country.

    Two feature sets are evaluated on an 80/20 split: "Temporal"
    (``YY``, ``LoAge``, ``UpAge``, ``Ex``) and "Spatial" (``Lat``, ``Long``).

    Args:
        country_name: Value of the ``COUNTRY`` column to select.
        df: DataFrame with ``COUNTRY``, ``YY``, ``LoAge``, ``UpAge``, ``Ex``,
            ``Pf``, ``Lat`` and ``Long`` columns.

    Returns:
        DataFrame with one row per model and the columns ``Model``, ``RMSE``,
        ``MAE``, ``AIC`` and ``BIC``; an empty DataFrame when the country has
        fewer than 100 usable rows.
    """
    import pandas as pd

    # Only the modelling columns decide whether a row is usable. A blanket
    # dropna() removes every row as soon as the frame carries an unrelated
    # all-null column.
    needed = ['YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'Lat', 'Long']
    df_c = df[df['COUNTRY'] == country_name].dropna(subset=needed)
    df_c = df_c[(df_c['Ex'] > 0) & (df_c['Pf'] <= df_c['Ex'])]

    if len(df_c) < 100:
        print(f"Skipping {country_name} due to insufficient data.")
        return pd.DataFrame()

    # Modelling libraries are imported only once there is something to fit.
    import numpy as np
    import statsmodels.api as sm
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, mean_absolute_error
    from sklearn.model_selection import train_test_split

    def evaluate_model(name, y_true, y_pred, result=None):
        """Return the metric row for one model; AIC/BIC are None without a fitted result."""
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        aic = result.aic if result is not None else None
        bic = result.bic if result is not None else None
        return {'Model': name, 'RMSE': rmse, 'MAE': mae, 'AIC': aic, 'BIC': bic}

    def fit_feature_set(label, columns):
        """Fit the three models on ``columns`` and return their metric rows."""
        X_train, X_test, y_train, y_test, trials_train, trials_test = train_test_split(
            df_c[columns], df_c['Pf'], df_c['Ex'], test_size=0.2, random_state=42)

        # has_constant='add' always appends the intercept. The default skips it
        # when a column happens to be constant, which can happen for only one
        # of the two splits and then breaks predict().
        X_train_const = sm.add_constant(X_train, has_constant='add')
        X_test_const = sm.add_constant(X_test, has_constant='add')

        rows = []

        # Poisson on the positive counts
        p_model = sm.GLM(y_train, X_train_const, family=sm.families.Poisson()).fit()
        rows.append(evaluate_model(f'Poisson ({label})', y_test,
                                   p_model.predict(X_test_const), p_model))

        # Binomial on (successes, failures); probability * trials = expected count
        y_bin = np.column_stack((y_train, trials_train - y_train))
        b_model = sm.GLM(y_bin, X_train_const, family=sm.families.Binomial()).fit()
        rows.append(evaluate_model(f'Binomial ({label})', y_test,
                                   b_model.predict(X_test_const) * trials_test, b_model))

        # Ordinary least squares baseline
        lr = LinearRegression().fit(X_train, y_train)
        rows.append(evaluate_model(f'Linear ({label})', y_test, lr.predict(X_test)))

        return rows

    results = (
        fit_feature_set('Temporal', ['YY', 'LoAge', 'UpAge', 'Ex'])
        + fit_feature_set('Spatial', ['Lat', 'Long'])
    )
    return pd.DataFrame(results)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
from statsmodels.genmod.generalized_linear_model import SET_USE_BIC_LLF

# Optional: use the log-likelihood-based BIC and silence FutureWarnings
SET_USE_BIC_LLF(True)
warnings.filterwarnings(action="ignore", category=FutureWarning)

# --- Helper Function: Model Evaluation for a Country ---
def evaluate_country_models(country_name, df):
    """
    Compare linear, Poisson and Binomial models of ``Pf`` for a single country.

    Two feature sets are evaluated on an 80/20 split: "Temporal"
    (``YY``, ``LoAge``, ``UpAge``, ``Ex``) and "Spatial" (``Lat``, ``Long``).

    Args:
        country_name: Value of the ``COUNTRY`` column to select.
        df: DataFrame with ``COUNTRY``, ``YY``, ``LoAge``, ``UpAge``, ``Ex``,
            ``Pf``, ``Lat`` and ``Long`` columns.

    Returns:
        DataFrame with one row per model and the columns ``Model``, ``RMSE``,
        ``MAE``, ``AIC`` and ``BIC``; an empty DataFrame when the country has
        fewer than 50 usable rows.
    """
    # Drop rows missing a modelling column, and surveys whose counts cannot
    # form a binomial response (nobody examined, or more positives than examined).
    required = ['YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'Lat', 'Long']
    df_country = df[df['COUNTRY'] == country_name].dropna(subset=required)
    df_country = df_country[(df_country['Ex'] > 0) & (df_country['Pf'] <= df_country['Ex'])]

    # Ensure enough data points
    if len(df_country) < 50:
        return pd.DataFrame()  # Skip small datasets

    def score(model_name, y_true, y_pred, fitted=None):
        """Return the metric row for one model; AIC/BIC are None without a fitted GLM."""
        return {
            'Model': model_name,
            # Root of the mean squared error, so the column really holds RMSE.
            'RMSE': float(np.sqrt(mean_squared_error(y_true, y_pred))),
            'MAE': mean_absolute_error(y_true, y_pred),
            'AIC': fitted.aic if fitted is not None else None,
            'BIC': fitted.bic_llf if fitted is not None else None,
        }

    def evaluate_feature_set(label, columns):
        """Fit the three models on ``columns`` and return their metric rows."""
        X_train, X_test, y_train, y_test, trials_train, trials_test = train_test_split(
            df_country[columns], df_country['Pf'], df_country['Ex'],
            test_size=0.2, random_state=42)

        # Always append the intercept. The default skips it when a column is
        # constant (e.g. identical age bounds), which may differ between splits.
        X_train_const = sm.add_constant(X_train, has_constant='add')
        X_test_const = sm.add_constant(X_test, has_constant='add')

        rows = []

        # Linear Regression
        lr = LinearRegression().fit(X_train, y_train)
        rows.append(score(f'Linear Regression ({label})', y_test, lr.predict(X_test)))

        # Poisson Regression
        poisson_result = sm.GLM(y_train, X_train_const, family=sm.families.Poisson()).fit()
        rows.append(score(f'Poisson ({label})', y_test,
                          poisson_result.predict(X_test_const), poisson_result))

        # Binomial Regression on (positives, negatives) out of the number
        # examined; predicted probability * examined = expected positives.
        successes_failures = np.column_stack((y_train, trials_train - y_train))
        binomial_result = sm.GLM(successes_failures, X_train_const,
                                 family=sm.families.Binomial()).fit()
        rows.append(score(f'Binomial ({label})', y_test,
                          binomial_result.predict(X_test_const) * trials_test,
                          binomial_result))

        return rows

    results = (
        evaluate_feature_set('Temporal', ['YY', 'LoAge', 'UpAge', 'Ex'])
        + evaluate_feature_set('Spatial', ['Lat', 'Long'])
    )
    return pd.DataFrame(results)


# --- Main Execution ---
df_all = df.copy()

# Keep only the columns the country models need, without missing values
df_all = df_all[['COUNTRY', 'YY', 'LoAge', 'UpAge', 'Ex', 'Pf', 'Lat', 'Long']]
df_all = df_all.dropna()

# List of countries to test, most surveyed first
countries = df_all['COUNTRY'].value_counts().index.tolist()

final_results = []

for country in countries:
    print(f"\n📍 Running for {country}")
    res = evaluate_country_models(country, df_all)
    if not res.empty:
        res['Country'] = country
        final_results.append(res)

if final_results:
    # Combine all results
    df_summary = pd.concat(final_results, ignore_index=True)

    # Optional: Save results
    # df_summary.to_csv("country_model_comparison.csv", index=False)

    # Display top results
    print("\n📊 Top 10 model performances:")
    display(df_summary.sort_values(by="RMSE").head(10))
else:
    # pd.concat raises on an empty list, so provide an empty summary with the
    # expected columns when every country was skipped.
    df_summary = pd.DataFrame(columns=['Model', 'RMSE', 'MAE', 'AIC', 'BIC', 'Country'])
    print("No country had enough data to evaluate any model.")


In [None]:
# Install plotly (imported as plotly.express in a later cell).
# NOTE(review): a bare `pip install` presumably relies on the notebook's
# automagic and is not valid in a plain .py script — confirm.
pip install plotly


In [None]:
import plotly.express as px

# RMSE per country for one specific model (Binomial Temporal)
is_binomial_temporal = df_summary['Model'] == 'Binomial (Temporal)'
df_heat = df_summary.loc[is_binomial_temporal, ['Country', 'RMSE']].copy()
df_heat['Country'] = df_heat['Country'].str.strip()

# Plot: countries matched by name and coloured by RMSE
choropleth_options = dict(
    locations='Country',
    locationmode='country names',
    color='RMSE',
    color_continuous_scale='YlOrRd',
    title=' RMSE for Binomial (Temporal) Model by Country',
    labels={'RMSE': 'RMSE'},
)
fig = px.choropleth(df_heat, **choropleth_options)
fig.update_geos(fitbounds="locations", visible=False)
fig.show()


In [None]:
import pymc as pm
import pandas as pd

# Assuming df has columns: Pf, Ex, COUNTRY

# The likelihood needs complete rows with 0 <= Pf <= Ex and Ex > 0; any other
# row has zero probability under a Binomial and stops the sampler.
bb_data = df.dropna(subset=['Pf', 'Ex'])
bb_data = bb_data[(bb_data['Ex'] > 0) & (bb_data['Pf'] >= 0) & (bb_data['Pf'] <= bb_data['Ex'])]

with pm.Model() as model:
    # Hyperpriors for group-level variation
    alpha = pm.HalfNormal("alpha", sigma=5)
    beta = pm.HalfNormal("beta", sigma=5)

    # One prevalence parameter per survey row
    theta = pm.Beta("theta", alpha=alpha, beta=beta, shape=bb_data.shape[0])

    # Observed data
    pf_obs = pm.Binomial("obs", n=bb_data['Ex'].values, p=theta, observed=bb_data['Pf'].values)

    trace = pm.sample(300, tune=300, target_accept=0.9, random_seed=42)

# Print the summary explicitly: a bare expression inside the `with` block is
# evaluated and thrown away. Restricted to the hyperparameters because `theta`
# has one entry per row.
print(pm.summary(trace, var_names=["alpha", "beta"]))


In [None]:
import pymc as pm
import pandas as pd
import numpy as np

# Assume df has columns: Pf, Ex, COUNTRY
# Work on an explicit copy so the new column is not assigned onto a derived frame.
df = df.dropna(subset=['Pf', 'Ex', 'COUNTRY']).copy()
# Rows outside 0 <= Pf <= Ex (or with Ex == 0) have zero Binomial probability
# and would stop the sampler.
df = df[(df['Ex'] > 0) & (df['Pf'] >= 0) & (df['Pf'] <= df['Ex'])].copy()
df['country_id'] = df['COUNTRY'].astype('category').cat.codes

n_countries = df['country_id'].nunique()

with pm.Model() as hierarchical_binomial:
    # Hyperpriors
    mu_a = pm.Normal('mu_a', 0, 2)
    sigma_a = pm.Exponential('sigma_a', 1)

    # Country-level random intercepts (logit-scale)
    a = pm.Normal('a', mu=mu_a, sigma=sigma_a, shape=n_countries)

    # Linear predictor (logit link): each row takes its country's intercept
    logit_p = a[df['country_id'].values]
    p = pm.Deterministic('p', pm.math.sigmoid(logit_p))

    # Likelihood
    y_obs = pm.Binomial('y_obs', n=df['Ex'].values, p=p, observed=df['Pf'].values)

    trace = pm.sample(300, tune=300, target_accept=0.9, random_seed=42)

# Print the summary explicitly: a bare expression inside the `with` block is
# evaluated and thrown away.
print(pm.summary(trace, var_names=['mu_a', 'sigma_a']))


In [None]:
 # Summarise the hyperparameters mu_a and sigma_a from `trace`.
 # NOTE(review): this line carries a stray leading space; a notebook presumably
 # tolerates it but a plain .py script would not — confirm.
 pm.summary(trace, var_names=['mu_a', 'sigma_a'])

In [None]:
import arviz as az

# Average the sampled `p` over chains and draws, then store the result on df
posterior_p = trace.posterior['p']
country_probs = posterior_p.mean(dim=["chain", "draw"]).values
df['posterior_mean_p'] = country_probs

# Show top countries: mean of the stored values per country, highest first
country_ranking = (
    df.groupby('COUNTRY')[['posterior_mean_p']]
    .mean()
    .sort_values(by='posterior_mean_p', ascending=False)
)
print(country_ranking)


In [None]:
# Plot the posterior of mu_a and sigma_a from `trace`, using a 95% HDI
az.plot_posterior(trace, var_names=["mu_a", "sigma_a"], hdi_prob=0.95)


In [None]:
import matplotlib.pyplot as plt

# Per-country mean of the stored posterior means, sorted ascending
mean_probs = df.groupby('COUNTRY')['posterior_mean_p'].mean().sort_values()

# Horizontal bar chart on a dedicated 10x6 figure
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
mean_probs.plot(kind='barh', ax=bar_ax)
bar_ax.set_xlabel("Estimated Malaria Prevalence")
bar_ax.set_title("Posterior Mean Prevalence by Country")
plt.show()


In [None]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np
import pymc as pm
import arviz as az

# Step 1: Filter for one country — say, Tanzania
df_tz = df[df["COUNTRY"] == "Tanzania"].copy()

# Step 2: Add fake climate features — Temperature and Humidity.
# These are random draws, not measurements; the fit below only demonstrates
# the model structure.
np.random.seed(42)  # for reproducibility
df_tz["Temperature"] = np.random.normal(loc=27, scale=2, size=len(df_tz))  # typical tropical temps
df_tz["Humidity"] = np.random.uniform(low=60, high=90, size=len(df_tz))    # typical high humidity

# Step 3: Clean the data — drop missing values and rows that cannot be a
# binomial outcome (nobody examined, or more positives than examined)
df_tz = df_tz.dropna(subset=["Pf", "Ex", "Temperature", "Humidity"])
df_tz = df_tz[(df_tz["Ex"] > 0) & (df_tz["Pf"] >= 0) & (df_tz["Pf"] <= df_tz["Ex"])]

# Standardise the covariates. With raw values near 27 and 75, unit-normal
# priors on the slopes put the logit tens of units from zero, which saturates
# the sigmoid and makes sampling slow. Slopes are therefore per standard deviation.
temperature_z = ((df_tz["Temperature"] - df_tz["Temperature"].mean()) / df_tz["Temperature"].std()).values
humidity_z = ((df_tz["Humidity"] - df_tz["Humidity"].mean()) / df_tz["Humidity"].std()).values

# Step 4: Fit a binomial model with climate covariates
with pm.Model() as climate_model:
    # Priors for coefficients (logit scale)
    intercept = pm.Normal("intercept", mu=0, sigma=2)
    beta_temp = pm.Normal("beta_temp", mu=0, sigma=1)
    beta_humid = pm.Normal("beta_humid", mu=0, sigma=1)

    # Linear model
    logit_p = intercept + beta_temp * temperature_z + beta_humid * humidity_z
    p = pm.Deterministic("p", pm.math.sigmoid(logit_p))

    # Likelihood
    y_obs = pm.Binomial("y_obs", n=df_tz["Ex"].values, p=p, observed=df_tz["Pf"].values)

    # Sample from posterior
    trace_climate = pm.sample(1000, tune=1000, target_accept=0.9, random_seed=42)


In [None]:
# Install pymc via a notebook shell escape.
# NOTE(review): this cell appears after the cells that import pymc; presumably
# it has to run before them in a fresh environment — confirm.
!pip install  pymc