In [1]:
# pip install pandas numpy scikit-learn joblib fastapi uvicorn xgboost
from pathlib import Path

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Read the cost-of-living records from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="nairobi_cost_of_living.csv")

# Show the first five rows as a quick sanity check of the loaded columns
print(df.head(n=5))

         Date        Area   Rent   Food  Transport  Utilities   Misc   Total
0  2019-01-31   Westlands  90341  25454      15634       6142  13468  151039
1  2019-01-31   Westlands  93902  24808      17194       6803  13608  156315
2  2019-01-31  Kileleshwa  78160  23302      13104       6454  12959  133979
3  2019-01-31   Westlands  87721  29735      18420       7934  14908  158719
4  2019-01-31       Ngong  32881  13471      10989       3062   8450   68853


In [2]:
# Convert date column to datetime
df["Date"] = pd.to_datetime(df["Date"])

# Derive the calendar features used as model inputs
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month

# Select features (X) and target variables (y); y holds one column per cost category
X = df[["Year", "Month", "Area"]]
y = df[["Rent", "Food", "Transport", "Utilities", "Misc"]]

# One-hot encode the "Area" column (drop_first=True omits the first category's indicator column)
X = pd.get_dummies(X, columns=["Area"], drop_first=True)

# Split the data into train/test
# NOTE(review): this is a random row-level split and several rows share the same Date and Area,
# so identical (Year, Month, Area) feature rows can end up in both train and test. If the goal
# is forecasting future months, a date-based hold-out may be more representative — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (y has five columns, so a single regressor is fitted against all targets)
model = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Persist the fitted model. The output directory is created first so the dump does not
# fail with FileNotFoundError when "models/" does not exist yet (e.g. on a fresh checkout).
model_path = Path("models/cost_of_living_forecaster.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

# Make predictions
y_pred = model.predict(X_test)

print(y_pred.shape)

# Wrap the raw prediction array in a DataFrame labelled with the target column names.
# The frame gets a fresh positional index (0..n-1), not y_test's index, so compare the
# two by position rather than by label.
y_pred_df = pd.DataFrame(y_pred, columns=y.columns)
print(y_pred_df.head())


(12000, 5)
           Rent          Food     Transport    Utilities          Misc
0  25976.716797  14913.912109   6643.485840  3323.874512   6027.980469
1  27603.443359  13456.106445   6212.402832  3657.362061   6162.377441
2  27821.339844  13822.772461   6201.808594  3735.425049   6153.890137
3  57160.945312  19093.380859  10886.983398  5373.492188  10133.825195
4  57160.945312  19093.380859  10886.983398  5373.492188  10133.825195


In [3]:
# Evaluation: score the held-out predictions with three standard regression metrics.
# Each call uses scikit-learn's default aggregation across the target columns.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)  # Mean Absolute Error (MAE)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)   # Mean Squared Error (MSE)
r2 = r2_score(y_true=y_test, y_pred=y_pred)              # R^2 (coefficient of determination)

# Report the metrics, rounded to two decimal places
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")

Mean Absolute Error: 1423.56
Mean Squared Error: 3033146.50
R2 Score: 0.83
