<a href="https://colab.research.google.com/github/syedshoaib14/Chennai-Geocoding-TB-data-/blob/main/Chennai_TB_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install packages
!pip install libpysal esda
!pip install pandas geopandas shapely scikit-learn folium

In [None]:
#@title geocoding
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.neighbors import KernelDensity
from libpysal.weights import DistanceBand
from esda.getisord import G_Local
import folium

# 1. Load the geocoded sheet
xl = pd.ExcelFile('chennai TB data.xlsx')
df = xl.parse('india_geo_coding_address_lat_an')

# 2. Strip whitespace and detect lat/lon columns (case-insensitive)
df.columns = df.columns.str.strip()
lat_cols = [c for c in df.columns if c.lower() == 'latitude']
lon_cols = [c for c in df.columns if c.lower() == 'longitude']

if not lat_cols or not lon_cols:
    raise KeyError(f"Latitude/Longitude columns not found. Available columns: {df.columns.tolist()}")

lat_col = lat_cols[0]
lon_col = lon_cols[0]

print(f"Using latitude column: '{lat_col}', longitude column: '{lon_col}'")

# Rows that have no coordinates cannot take part in any spatial statistic, and
# a single NaN makes KernelDensity.fit raise. Leave `df` untouched and carry
# only the located rows forward.
has_coords = df[[lat_col, lon_col]].notna().all(axis=1)
if not has_coords.all():
    print(f"Skipping {int((~has_coords).sum())} row(s) with missing coordinates")
located = df.loc[has_coords].copy()

# 3. Convert to GeoDataFrame (WGS84)
gdf = gpd.GeoDataFrame(
    located,
    geometry=gpd.points_from_xy(located[lon_col], located[lat_col]),
    crs="EPSG:4326"
)

# 4. Project to UTM zone 43N (meters) so bandwidths/thresholds below are metric.
# NOTE(review): Chennai (~80.3°E) appears to fall in UTM zone 44N (EPSG:32644);
# zone 43N is kept because the model cells use the same CRS - confirm and
# switch all cells together if a change is wanted.
gdf = gdf.to_crs(epsg=32643)
coords = np.vstack([gdf.geometry.x, gdf.geometry.y]).T

# 5. Compute KDE density (1 km bandwidth)
# score_samples returns log-density, hence the exp(); values are per m^2.
kde = KernelDensity(bandwidth=1000, kernel='gaussian')
kde.fit(coords)
gdf['kde_density'] = np.exp(kde.score_samples(coords))

# 6. Build spatial weights (2 km threshold)
w = DistanceBand.from_dataframe(gdf, threshold=2000, silence_warnings=True)

# 7. Compute Local Getis-Ord Gi*
# star=True includes each point in its own neighbourhood sum, which is what
# distinguishes Gi* from plain Gi (the default, star=False).
# seed fixes the conditional permutations so p_sim is the same on every run.
g_local = G_Local(gdf['kde_density'], w, transform='B', star=True, seed=42)
gdf['GiZ'] = g_local.Zs
gdf['GiP'] = g_local.p_sim
# 8. Classify Gi* results
def classify_gistar(z):
    """Label a Gi* z-score as a hot spot, a cold spot, or not significant.

    Args:
        z: z-score of the local statistic for one observation.

    Returns:
        'hot_spot' when z is strictly above 1.96, 'cold_spot' when z is
        strictly below -1.96, and 'non_significant' for everything else
        (including the boundary values themselves and NaN).
    """
    critical = 1.96  # two-sided 95% critical value of the standard normal
    if z > critical:
        return 'hot_spot'
    if z < -critical:
        return 'cold_spot'
    return 'non_significant'

# Turn every z-score into a hot/cold/non-significant label.
gdf['Gi_star'] = gdf['GiZ'].apply(classify_gistar)

# 9. Create and save Folium map
map_path = "kde_gistar_map.html"
center = [df[lat_col].mean(), df[lon_col].mean()]
m = folium.Map(location=center, zoom_start=12, tiles='CartoDB.Positron')
colors = {'hot_spot': 'red', 'cold_spot': 'blue', 'non_significant': 'gray'}

for _, row in gdf.iterrows():
    folium.CircleMarker(
        location=[row[lat_col], row[lon_col]],
        radius=4,
        color=colors[row['Gi_star']],
        fill=True,
        fill_opacity=0.7,
        popup=(
            # The density is a probability per square metre, so it is many
            # orders of magnitude below 1; fixed-point ".2f" always printed
            # "0.00". Scientific notation keeps the value readable.
            f"<b>KDE density:</b> {row['kde_density']:.3e}<br>"
            f"<b>Gi* Z-score:</b> {row['GiZ']:.2f}<br>"
            f"<b>Gi* p-value:</b> {row['GiP']:.3f}<br>"
            f"<b>Classification:</b> {row['Gi_star']}"
        )
    ).add_to(m)


legend_html = '''
<div style="
     position: fixed;
     bottom: 50px; left: 50px; max-width: 360px;
     z-index:9999; font-size:14px; line-height: 1.5;
     background-color: rgba(255,255,255,0.95);
     padding: 12px 15px; border: 2px solid gray;
     border-radius: 10px;
     box-shadow: 0 0 8px rgba(0,0,0,0.3);
     word-wrap: break-word;
     overflow-wrap: break-word;">
<b>Z-Score & P-Value</b><br><br>
<b>Z-Score:</b><br>
<span style="color:red;">Z &gt; 1.96</span>: Hotspot (95%+ confidence)<br>
<span style="color:blue;">Z &lt; -1.96</span>: Coldspot (95%+ confidence)<br>
-1.96 ≤ Z ≤ 1.96: Not significant<br><br>
<b>P-Value:</b><br>
p &lt; 0.01 → Very strong evidence<br>
p &lt; 0.05 → Statistically significant<br>
p ≥ 0.05 → Not significant
</div>
'''


# Attach the legend exactly once. Each folium.Element is a separate child, so
# adding a second one stacked a duplicate legend into the second saved file.
m.get_root().html.add_child(folium.Element(legend_html))
m.save(map_path)
print(f"Interactive KDE + Gi* hotspot map saved to {map_path}")

# Second copy of the same map under the alternative file name.
out_path = 'kde_gistar_map new .html'
m.save(out_path)
print(f"Map saved to {out_path}")



In [None]:
import pandas as pd
import folium
from IPython.display import display

from folium.plugins import MarkerCluster

# Load geocoded data
df = pd.read_excel('/content/chennai TB data.xlsx', sheet_name='india_geo_coding_address_lat_an')

# Define metrics and their column names
metrics = {
    'Infected rate': 'Infected rate',
    'Density Per Km2': 'Density Per Km2'
}

# Initialize summary storage
summary_rows = []

# Metric name -> HTML file that contains its layer
map_paths = {}

# One combined map with a separate, toggleable marker layer per metric.
# Previously the map was built after the loop and coloured by whatever
# `spot_col` the last iteration left behind, so only the final metric was
# ever drawn or registered in `map_paths`.
center = [df['Latitude'].mean(), df['Longitude'].mean()]
m = folium.Map(location=center, zoom_start=12)
colors = {'cold': 'blue', 'intermediate': 'orange', 'hot': 'red'}
path = '/content/infected rate and densty _map.html'

# folium refuses NaN locations, so rows without coordinates are left off the
# map (they still count in the quartiles and in the summary table).
has_coords = df[['Latitude', 'Longitude']].notna().all(axis=1)

for layer_no, (name, col) in enumerate(metrics.items()):
    # Compute quartiles
    q25, q75 = df[col].quantile([0.25, 0.75])

    # Classify spots: below Q1 is cold, above Q3 is hot, the rest intermediate
    spot_col = f'spot_{col.replace(" ", "_").lower()}'
    df[spot_col] = df[col].apply(
        lambda v: 'cold' if v < q25 else ('hot' if v > q75 else 'intermediate'))

    # Summarize counts
    counts = df[spot_col].value_counts()
    summary_rows.append({
        'metric': name,
        'cold count': counts.get('cold', 0),
        'intermediate count': counts.get('intermediate', 0),
        'hot count': counts.get('hot', 0)
    })

    # One cluster layer per metric; only the first is visible initially so the
    # two colourings do not sit on top of each other.
    marker_cluster = MarkerCluster(name=name, show=(layer_no == 0)).add_to(m)

    for _, row in df[has_coords].iterrows():
        tooltip_text = (
            f"<b>Area:</b> {row['Area']}<br>"
            f"<b>Pin Code:</b> {row['Pin Code']}<br>"
            f"<b>Infected rate:</b> {row['Infected rate']}<br>"
            f"<b>Density Per Km²:</b> {row['Density Per Km2']}"
        )

        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=5,
            color=colors[row[spot_col]],
            fill=True,
            fill_opacity=0.8,
            tooltip=folium.Tooltip(tooltip_text, sticky=True)
        ).add_to(marker_cluster)

    map_paths[name] = path

# Layer switcher for choosing which metric colours the markers
folium.LayerControl(collapsed=False).add_to(m)

# Save map
m.save(path)

# Display summary table
summary_df = pd.DataFrame(summary_rows)
print("Classification Summary by Metric")
display(summary_df)

# Print map locations
print("\nMaps saved at:")
for metric, path in map_paths.items():
    print(f"- {metric}: {path}")


In [None]:
import pandas as pd

# Re-read the geocoded worksheet into `df`
df = pd.read_excel(
    '/content/chennai TB data.xlsx',
    sheet_name='india_geo_coding_address_lat_an',
)

# Print the header names exactly as stored in the sheet (no stripping), which
# makes stray whitespace or unexpected capitalisation visible.
column_names = df.columns.tolist()
print("Column names in the Excel sheet:", column_names, sep="\n")

In [None]:
# @title cold count

from matplotlib import pyplot as plt
# Histogram of the per-metric 'cold count' values from the summary table;
# the plot call returns the axes it drew on, which is also the current axes.
hist_ax = summary_df['cold count'].plot(kind='hist', bins=20, title='cold count')
# Drop the top and right frame lines for a cleaner look.
hist_ax.spines[['top', 'right']].set_visible(False)

In [None]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from IPython.display import display

# Load geocoded data
df = pd.read_excel('/content/chennai TB data.xlsx', sheet_name='india_geo_coding_address_lat_an')

# Define metrics and their column names
metrics = {
    'Infected rate': 'Infected rate',
    'Density Per Km2': 'Density Per Km2'
}

# Initialize summary storage
summary_rows = []

# Create and save maps for each metric (metric name -> saved HTML path)
map_paths = {}

# folium refuses NaN locations, so rows without coordinates are left off the
# maps (they still count in the quartiles and in the summary table).
has_coords = df[['Latitude', 'Longitude']].notna().all(axis=1)

for name, col in metrics.items():
    # Compute quartiles
    q25, q75 = df[col].quantile([0.25, 0.75])

    # Classify spots: below Q1 is cold, above Q3 is hot, the rest intermediate
    slug = col.replace(" ", "_").lower()
    spot_col = f'spot_{slug}'
    df[spot_col] = df[col].apply(
        lambda v: 'cold' if v < q25 else ('hot' if v > q75 else 'intermediate'))

    # Summarize counts
    counts = df[spot_col].value_counts()
    summary_rows.append({
        'metric': name,
        'cold count': counts.get('cold', 0),
        'intermediate count': counts.get('intermediate', 0),
        'hot count': counts.get('hot', 0)
    })

    # Create Folium map
    center = [df['Latitude'].mean(), df['Longitude'].mean()]
    m = folium.Map(location=center, zoom_start=12)
    colors = {'cold': 'blue', 'intermediate': 'orange', 'hot': 'red'}

    # Add marker cluster
    marker_cluster = MarkerCluster().add_to(m)

    for _, row in df[has_coords].iterrows():
        classification = row[spot_col].capitalize() + " Spot"
        tooltip_text = (
            f"<b>Area:</b> {row['Area']}<br>"
            f"<b>Pin Code:</b> {row['Pin Code']}<br>"
            f"<b>Infected rate:</b> {row['Infected rate']}<br>"
            f"<b>Density Per Km²:</b> {row['Density Per Km2']}<br>"
            f"<b>Classification:</b> {classification}"
        )

        # Exactly one marker per row. The marker used to be added twice, which
        # doubled every cluster count and stacked identical circles.
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=5,
            color=colors[row[spot_col]],
            fill=True,
            fill_opacity=0.8,
            tooltip=folium.Tooltip(tooltip_text, sticky=True)
        ).add_to(marker_cluster)

    # Add legend for color coding, titled with the metric this map shows
    legend_html = f"""
    <div style="position: fixed;
                bottom: 50px; left: 50px; width: 150px; height: 100px;
                border:2px solid grey; background-color: white; z-index:9999; font-size:12px;
                padding: 10px;">
                <b>{name}</b><br>
                <i style="background:blue; width: 20px; height: 20px; float:left; margin-right:10px;"></i>Cold<br>
                <i style="background:orange; width: 20px; height: 20px; float:left; margin-right:10px;"></i>Intermediate<br>
                <i style="background:red; width: 20px; height: 20px; float:left; margin-right:10px;"></i>Hot<br>
    </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))

    # Save map under a metric-specific name. The previous f-string had no
    # placeholder, so every metric overwrote the same '/content/_map.html'.
    path = f'/content/{slug}_map.html'
    m.save(path)
    map_paths[name] = path

# Display summary table
summary_df = pd.DataFrame(summary_rows)
print("Classification Summary by Metric")
display(summary_df)

# Print map locations
print("\nMaps saved at:")
for metric, path in map_paths.items():
    print(f"- {metric}: {path}")


In [None]:
#@title Aimodel
!pip install libpysal esda
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from sklearn.neighbors import KernelDensity
from libpysal.weights import DistanceBand
from esda.getisord import G_Local

# 1. Load and prepare geocoded TB data
xl = pd.ExcelFile('chennai TB data.xlsx')
df = xl.parse('india_geo_coding_address_lat_an')
df.columns = df.columns.str.strip()

# Identify coordinate columns (case-insensitive match on the header)
lat_col = [c for c in df.columns if c.lower() == 'latitude'][0]
lon_col = [c for c in df.columns if c.lower() == 'longitude'][0]

# Work on a copy that keeps only rows with both coordinates; a NaN coordinate
# would make KernelDensity.fit raise further down.
tmp = df.dropna(subset=[lat_col, lon_col]).copy()

gdf = gpd.GeoDataFrame(
    tmp,
    geometry=gpd.points_from_xy(tmp[lon_col], tmp[lat_col]),
    crs="EPSG:4326"
)
# Project to metric CRS for distance-based features.
# The x/y features depend on this CRS, so the prediction cell must use the
# same one.
gdf = gdf.to_crs(epsg=32643)
coords = np.vstack([gdf.geometry.x, gdf.geometry.y]).T

# 2. Compute KDE density (bandwidth in meters)
# score_samples returns log-density, hence the exp().
kde = KernelDensity(bandwidth=1000, kernel='gaussian')
kde.fit(coords)
gdf['kde_density'] = np.exp(kde.score_samples(coords))

# 3. Build spatial weights and compute Local Getis-Ord Gi*
# star=True includes each point in its own neighbourhood sum (Gi*); the
# default star=False computes plain Gi. seed makes the permutation inference
# repeatable between runs.
w = DistanceBand.from_dataframe(gdf, threshold=2000, silence_warnings=True)
g_local = G_Local(gdf['kde_density'], w, transform='B', star=True, seed=42)
gdf['GiZ'] = g_local.Zs
gdf['Gi_star'] = gdf['GiZ'].apply(lambda z: 'hot_spot' if z > 1.96 else ('cold_spot' if z < -1.96 else 'non_significant'))

# 4. Prepare features and target for classification
# NOTE(review): the target is derived from kde_density (through Gi*), and
# kde_density is also a feature, so the classifier is largely re-learning the
# Gi* threshold rule; treat the scores below with that in mind.
gdf['x'] = gdf.geometry.x
gdf['y'] = gdf.geometry.y
feature_cols = ['x', 'y', 'kde_density']
X = gdf[feature_cols]
y = (gdf['Gi_star'] == 'hot_spot').astype(int)  # 1 = hot spot, 0 = everything else

# 5. Split data (stratified so both splits keep the hot-spot proportion)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. Build preprocessing + model pipeline
numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, feature_cols)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# 7. Hyperparameter tuning (5-fold CV, selecting on F1 of the hot-spot class)
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

# 8. Evaluate on test set
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 9. Save the trained pipeline (scaler + best forest) for the prediction cell
model_path = 'tb_hotspot_classifier.pkl'
joblib.dump(grid.best_estimator_, model_path)
print(f"Trained model saved to {model_path}")


In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import joblib
import folium

# 1. Load the trained model
# Same relative path the training cell writes to, so loading works from
# whatever working directory the model was saved in (not only /content).
# NOTE: joblib.load unpickles the file and can execute arbitrary code - only
# load model files produced by this notebook or another trusted source.
model = joblib.load('tb_hotspot_classifier.pkl')

# 2. Load and prepare new TB data (geocoded)
xl = pd.ExcelFile('chennai TB data.xlsx')
df = xl.parse('india_geo_coding_address_lat_an')
df.columns = df.columns.str.strip()

# Identify lat/lon columns (case-insensitive match on the header)
lat_col = [c for c in df.columns if c.lower() == 'latitude'][0]
lon_col = [c for c in df.columns if c.lower() == 'longitude'][0]

# Keep only rows with both coordinates; NaN would break the KDE fit and the
# map markers.
tmp = df.dropna(subset=[lat_col, lon_col]).copy()

gdf = gpd.GeoDataFrame(
    tmp,
    geometry=gpd.points_from_xy(tmp[lon_col], tmp[lat_col]),
    crs="EPSG:4326"
)
# Project to metric CRS matching training
gdf = gdf.to_crs(epsg=32643)
# Extract features: x, y, and compute KDE density
coords = np.vstack([gdf.geometry.x, gdf.geometry.y]).T
from sklearn.neighbors import KernelDensity
kde = KernelDensity(bandwidth=1000, kernel='gaussian')
kde.fit(coords)
gdf['kde_density'] = np.exp(kde.score_samples(coords))
gdf['x'] = gdf.geometry.x
gdf['y'] = gdf.geometry.y

# 3. Prepare feature matrix (same columns, same order as in training)
feature_cols = ['x', 'y', 'kde_density']
X_new = gdf[feature_cols]

# 4. Predict hotspot probability and class
# Look the hot-spot column up through classes_ instead of assuming it is
# column 1: a model fitted on data without any hot spot has a single class
# and predict_proba then returns only one column.
proba = model.predict_proba(X_new)
known_classes = list(model.classes_)
if 1 in known_classes:
    gdf['hotspot_prob'] = proba[:, known_classes.index(1)]
else:
    gdf['hotspot_prob'] = 0.0
gdf['hotspot_pred'] = model.predict(X_new).astype(int)

gdf['hotspot_label'] = gdf['hotspot_pred'].map({1: 'hot_spot', 0: 'non_hotspot'})

# 5. Create Folium map showing predicted hotspots
center = [df[lat_col].mean(), df[lon_col].mean()]
m = folium.Map(location=center, zoom_start=12, tiles='CartoDB.Positron')
colors = {'hot_spot': 'red', 'non_hotspot': 'blue'}

for _, row in gdf.iterrows():
    folium.CircleMarker(
        location=[row[lat_col], row[lon_col]],
        radius=5,
        color=colors[row['hotspot_label']],
        fill=True,
        fill_opacity=0.6,
        popup=(
            f"<b>Probability:</b> {row['hotspot_prob']:.2f}<br>"
            f"<b>Prediction:</b> {row['hotspot_label']}"
        )
    ).add_to(m)

# Save map
out_map = 'tb_hotspot_predictions_map.html'
m.save(out_map)
print(f"Prediction map saved to {out_map}")
