# Sri Lanka 2024 Census - Complete Data Analysis

This notebook analyzes population data from the 2024 Sri Lanka Census at the Grama Niladhari (GN) division level, the country's smallest administrative unit.

## Table of Contents
1. Data Loading & Overview
2. Sex Ratio Analysis
3. Dependency Ratio Analysis
4. District Summary
5. K-Means Clustering
6. Resource Allocation Index
7. Anomaly Detection

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Notebook-wide display and plotting defaults
pd.options.display.max_columns = None   # show every column of wide frames
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))       # default figure size for all plots

## 1. Data Loading & Overview

In [None]:
# Read the cleaned GN-level extract from the working directory.
df = pd.read_csv('GN_population_cleaned.csv')

# Header cells may contain embedded line breaks; flatten them to underscores
# so the substring-based column lookups in later cells can match.
df = df.rename(columns=lambda name: name.replace('\n', '_'))

dataset_shape = df.shape
column_names = df.columns.tolist()
print(f"Dataset Shape: {dataset_shape}")
print(f"\nColumns: {column_names}")
df.head()

In [None]:
# Schema and completeness overview: dtype and null count for every column.
column_dtypes = df.dtypes
missing_per_column = df.isna().sum()

print("Data Types:")
print(column_dtypes)
print("\nMissing Values:")
print(missing_per_column)

## 2. Sex Ratio Analysis

**Sex Ratio** = (Number of Males / Number of Females) * 100

A ratio of 100 means equal numbers of males and females; values above 100 indicate more males, and values below 100 indicate more females.

In [None]:
# Identify correct column names
def _find_column(*required, excluded=()):
    """Return the first column of `df` whose name contains every `required`
    substring and none of the `excluded` substrings.

    Raises:
        IndexError: when no column matches. The message names the pattern and
            lists the available columns, instead of the bare
            "list index out of range" a `[...][0]` lookup would produce.
    """
    for name in df.columns:
        has_required = all(token in name for token in required)
        has_excluded = any(token in name for token in excluded)
        if has_required and not has_excluded:
            return name
    raise IndexError(
        f"No column contains {list(required)} (excluding {list(excluded)}). "
        f"Available columns: {df.columns.tolist()}"
    )


# 'Female' is excluded explicitly so the male lookup can never pick a female column.
male_col = _find_column('Male', excluded=('Female',))
female_col = _find_column('Female')
gn_name_col = _find_column('GN_Division', 'Name')
district_col = _find_column('District', 'Name')
province_col = _find_column('Province', 'Name')

print(f"Male: {male_col}, Female: {female_col}")

# Calculate Sex Ratio (males per 100 females).
# A zero female count is turned into NaN so the ratio is missing rather than inf.
df['Sex_Ratio'] = (df[male_col] / df[female_col].replace(0, np.nan)) * 100
print("\nSex Ratio Statistics:")
print(df['Sex_Ratio'].describe())

In [None]:
# Flag GN divisions whose sex ratio is far from parity:
# above 120 (strong male majority) or below 80 (strong female majority).
male_heavy_mask = df['Sex_Ratio'] > 120
female_heavy_mask = df['Sex_Ratio'] < 80

extreme_male = df.loc[male_heavy_mask].sort_values('Sex_Ratio', ascending=False)
extreme_female = df.loc[female_heavy_mask].sort_values('Sex_Ratio')

sex_ratio_report_cols = [gn_name_col, district_col, 'Sex_Ratio']

print(f"GN Divisions with Male Majority (Ratio > 120): {len(extreme_male)}")
print(extreme_male[sex_ratio_report_cols].head(10))

print(f"\nGN Divisions with Female Majority (Ratio < 80): {len(extreme_female)}")
print(extreme_female[sex_ratio_report_cols].head(10))

In [None]:
# Visualize Sex Ratio Distribution
# Left panel: histogram over all GN divisions; right panel: box plot per province.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].hist(df['Sex_Ratio'].dropna(), bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(100, color='red', linestyle='--', label='Equal Ratio (100)')
axes[0].set_xlabel('Sex Ratio')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Sex Ratio Across GN Divisions')
axes[0].legend()

df.boxplot(column='Sex_Ratio', by=province_col, ax=axes[1], rot=45)
# With `by=`, pandas adds a figure-level "Boxplot grouped by ..." suptitle.
# It would sit above both panels and mislabel the histogram, so clear it.
fig.suptitle('')
axes[1].set_title('Sex Ratio by Province')
axes[1].set_xlabel('Province')
axes[1].set_ylabel('Sex Ratio')

plt.tight_layout()
plt.savefig('sex_ratio_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Dependency Ratio Analysis

**Child Dependency Ratio** = (Population aged 0-14 / Population aged 15-59) * 100

**Old-Age Dependency Ratio** = (Population aged 60+ / Population aged 15-59) * 100

In [None]:
# Locate the age-band columns by substring, tolerating the separator variants
# "_to_", "-" and "_" in the header names.
def _columns_with_any(tags):
    """Return every column of `df` whose name contains at least one of `tags`."""
    return [c for c in df.columns if any(tag in c for tag in tags)]


child_col = _columns_with_any(('0_to_14', '0-14', '0_14'))
working_col = _columns_with_any(('15_to_59', '15-59', '15_59'))
elderly_60_64 = _columns_with_any(('60_to_64', '60-64', '60_64'))
# The 65+ band is matched on "65" plus a case-insensitive "above".
elderly_65_plus = [c for c in df.columns if '65' in c and 'above' in c.lower()]

for band_label, band_matches in (
    ("Child", child_col),
    ("Working", working_col),
    ("Elderly 60-64", elderly_60_64),
    ("Elderly 65+", elderly_65_plus),
):
    print(f"{band_label}: {band_matches}")

In [None]:
# Calculate Dependency Ratios
#
# Each ratio is dependents per 100 working-age (15-59) residents. The
# working-age column is mandatory: without it no ratio is defined, so fail
# here with a clear message. Printing the error and continuing would leave
# the ratio columns missing and break later cells with a confusing KeyError.
if not working_col:
    raise ValueError(
        "No working-age (15-59) column was found, so dependency ratios cannot "
        f"be computed. Available columns: {df.columns.tolist()}"
    )


def _band_or_zero(label, matches):
    """Return the first matched column as a Series, or 0 when nothing matched.

    The 0 fallback keeps the analysis running on extracts that lack a band,
    but it is reported so an undercounted ratio is never silent.
    """
    if matches:
        return df[matches[0]]
    print(f"Warning: no column found for {label}; treating it as 0.")
    return 0


child_pop = _band_or_zero('children (0-14)', child_col)
elderly_60 = _band_or_zero('elderly (60-64)', elderly_60_64)
elderly_65 = _band_or_zero('elderly (65+)', elderly_65_plus)

# A zero working-age count becomes NaN so the ratio is missing rather than inf.
working_pop = df[working_col[0]].replace(0, np.nan)

df['Child_Dependency_Ratio'] = (child_pop / working_pop) * 100
df['Old_Age_Dependency_Ratio'] = ((elderly_60 + elderly_65) / working_pop) * 100
df['Total_Dependency_Ratio'] = df['Child_Dependency_Ratio'] + df['Old_Age_Dependency_Ratio']

print("Child Dependency Ratio Statistics:")
print(df['Child_Dependency_Ratio'].describe())
print("\nOld-Age Dependency Ratio Statistics:")
print(df['Old_Age_Dependency_Ratio'].describe())

In [None]:
# List the ten GN divisions with the highest value of each dependency ratio.
dependency_reports = (
    ("Top 10 GN Divisions with Highest Old-Age Dependency (Elderly Care Priority):",
     'Old_Age_Dependency_Ratio'),
    ("\nTop 10 GN Divisions with Highest Child Dependency (School Priority):",
     'Child_Dependency_Ratio'),
)

for heading, ratio_col in dependency_reports:
    print(heading)
    top_ten = df.nlargest(10, ratio_col)
    print(top_ten[[gn_name_col, district_col, ratio_col]])

In [None]:
# Three panels: a histogram per dependency ratio, then one against the other.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
child_hist_ax, old_age_hist_ax, scatter_ax = axes

histogram_specs = (
    (child_hist_ax, 'Child_Dependency_Ratio', 'Child Dependency Ratio', 'skyblue'),
    (old_age_hist_ax, 'Old_Age_Dependency_Ratio', 'Old-Age Dependency Ratio', 'salmon'),
)
for ax, ratio_col, axis_label, fill_colour in histogram_specs:
    ax.hist(df[ratio_col].dropna(), bins=50, edgecolor='black', alpha=0.7, color=fill_colour)
    ax.set_xlabel(axis_label)
    ax.set_title(f'{axis_label} Distribution')

scatter_ax.scatter(df['Child_Dependency_Ratio'], df['Old_Age_Dependency_Ratio'], alpha=0.3)
scatter_ax.set_xlabel('Child Dependency Ratio')
scatter_ax.set_ylabel('Old-Age Dependency Ratio')
scatter_ax.set_title('Child vs Old-Age Dependency')

plt.tight_layout()
plt.savefig('dependency_ratio_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. District Summary

In [None]:
# Per-district roll-up. Each ratio is the unweighted mean over the district's
# GN divisions, so every GN counts equally regardless of population.
# GN_Count is the number of non-null GN names in the district.
district_summary = (
    df.groupby(district_col)
    .agg(
        Sex_Ratio=('Sex_Ratio', 'mean'),
        Child_Dependency_Ratio=('Child_Dependency_Ratio', 'mean'),
        Old_Age_Dependency_Ratio=('Old_Age_Dependency_Ratio', 'mean'),
        GN_Count=(gn_name_col, 'count'),
    )
    .round(2)
    .sort_values('Old_Age_Dependency_Ratio', ascending=False)
)

print("District-Level Summary (Sorted by Old-Age Dependency):")
print(district_summary)

In [None]:
# Horizontal bar chart of mean old-age dependency per district, sorted
# ascending so the highest-dependency district appears at the top.
fig, ax = plt.subplots(figsize=(14, 8))
old_age_by_district = district_summary['Old_Age_Dependency_Ratio'].sort_values()
old_age_by_district.plot(kind='barh', color='coral', ax=ax)
ax.set_xlabel('Average Old-Age Dependency Ratio')
ax.set_ylabel('District')
ax.set_title('Average Old-Age Dependency Ratio by District')
fig.tight_layout()
fig.savefig('district_old_age_dependency.png', dpi=150, bbox_inches='tight')
plt.show()

---
# Phase 5: Advanced Analytics & Modeling

## 5. K-Means Clustering (Demographic Profiles)

In [None]:
# Build the clustering matrix from the three ratios. Rows missing any ratio
# are excluded, and df_cluster keeps the original index so labels can be
# written back to df later.
features = ['Sex_Ratio', 'Child_Dependency_Ratio', 'Old_Age_Dependency_Ratio']
df_cluster = df.dropna(subset=features)[features]

# Standardise each feature so none dominates the distance metric by scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cluster)

n_clustered = len(df_cluster)
print(f"Clustering {n_clustered} GN divisions...")

In [None]:
# Elbow method: fit K-Means for K = 2..9 and record the inertia of each fit.
K_range = range(2, 10)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
    for k in K_range
]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(K_range, inertias, 'bo-')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal K')
fig.savefig('elbow_method.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Apply K-Means with K=4
K = 4
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)

# Assign the whole column from an index-aligned Series, so rows that were
# dropped for clustering become NaN and a re-run cannot leave stale ids
# behind. The dtype stays float because NaN marks "not clustered".
df['Cluster'] = pd.Series(kmeans.fit_predict(X_scaled), index=df_cluster.index, dtype=float)

# Cluster Profile Analysis
cluster_profiles = df.groupby('Cluster')[features].mean().round(2)
print("Cluster Profiles (Mean Values):")
print(cluster_profiles)

# Assign Profile Names
# One cluster can be the extreme on more than one metric. A dict literal keyed
# by cluster id would silently keep only the last label, so collect all labels
# per cluster and join them.
extreme_labels = [
    (cluster_profiles['Old_Age_Dependency_Ratio'].idxmax(), 'Aging Villages'),
    (cluster_profiles['Child_Dependency_Ratio'].idxmax(), 'Young Families'),
    (cluster_profiles['Sex_Ratio'].idxmax(), 'Male-Dominated'),
    (cluster_profiles['Sex_Ratio'].idxmin(), 'Female-Dominated'),
]
labels_by_cluster = {}
for cluster_id, label in extreme_labels:
    labels_by_cluster.setdefault(cluster_id, []).append(label)
profile_names = {
    cluster_id: ' / '.join(labels) for cluster_id, labels in labels_by_cluster.items()
}

# na_action='ignore' leaves unclustered rows as NaN instead of 'Profile_nan'.
# int(x) keeps the float id out of the fallback label ('Profile_2', not 'Profile_2.0').
df['Profile'] = df['Cluster'].map(
    lambda x: profile_names.get(x, f'Profile_{int(x)}'),
    na_action='ignore',
)

In [None]:
# Left: GN divisions in dependency space coloured by cluster id.
# Right: number of GN divisions per cluster.
fig, (scatter_ax, count_ax) = plt.subplots(1, 2, figsize=(14, 6))

scatter = scatter_ax.scatter(
    df['Child_Dependency_Ratio'],
    df['Old_Age_Dependency_Ratio'],
    c=df['Cluster'],
    cmap='viridis',
    alpha=0.5,
)
scatter_ax.set_xlabel('Child Dependency Ratio')
scatter_ax.set_ylabel('Old-Age Dependency Ratio')
scatter_ax.set_title('GN Clusters: Child vs Old-Age Dependency')
fig.colorbar(scatter, ax=scatter_ax, label='Cluster')

cluster_sizes = df['Cluster'].value_counts().sort_index()
cluster_sizes.plot(kind='bar', ax=count_ax, color='teal')
count_ax.set_xlabel('Cluster')
count_ax.set_ylabel('Number of GN Divisions')
count_ax.set_title('Cluster Distribution')

plt.tight_layout()
plt.savefig('cluster_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Resource Allocation Index

In [None]:
# Priority scores: each dependency ratio expressed as a percentage of the
# largest observed value, so the top GN division scores 100.
child_ratio = df['Child_Dependency_Ratio']
old_age_ratio = df['Old_Age_Dependency_Ratio']

df['School_Priority'] = child_ratio.div(child_ratio.max()).mul(100)
df['ElderCare_Priority'] = old_age_ratio.div(old_age_ratio.max()).mul(100)

school_cols = [gn_name_col, district_col, 'Child_Dependency_Ratio', 'School_Priority']
elder_cols = [gn_name_col, district_col, 'Old_Age_Dependency_Ratio', 'ElderCare_Priority']

print("Top 15 GN Divisions for NEW SCHOOL Allocation:")
top_school = df.nlargest(15, 'School_Priority')
print(top_school[school_cols].to_string(index=False))

print("\nTop 15 GN Divisions for NEW ELDER CARE FACILITY Allocation:")
top_elder = df.nlargest(15, 'ElderCare_Priority')
print(top_elder[elder_cols].to_string(index=False))

## 7. Anomaly Detection

In [None]:
# Calculate Z-scores for key metrics
def _abs_zscore(series):
    """Return |z| for `series`, using the mean and population standard
    deviation (ddof=0) of the observed values only.

    Missing inputs stay NaN. Imputing them (with 0 or the median) would shift
    the mean, distort the standard deviation, and could flag the imputed rows
    themselves as anomalies.
    """
    centred = series - series.mean()          # pandas skips NaN by default
    return (centred / series.std(ddof=0)).abs()


df['Sex_Ratio_Zscore'] = _abs_zscore(df['Sex_Ratio'])
df['Child_Dep_Zscore'] = _abs_zscore(df['Child_Dependency_Ratio'])
df['Old_Age_Dep_Zscore'] = _abs_zscore(df['Old_Age_Dependency_Ratio'])

# Anomalies: Z-score > 3 on any metric.
# A NaN z-score compares False, so rows with a missing ratio are never flagged.
threshold = 3
anomalies = df[(df['Sex_Ratio_Zscore'] > threshold) |
               (df['Child_Dep_Zscore'] > threshold) |
               (df['Old_Age_Dep_Zscore'] > threshold)]

print(f"Detected {len(anomalies)} Anomalous GN Divisions (Z-score > {threshold}):")
print(anomalies[[gn_name_col, district_col, 'Sex_Ratio', 'Child_Dependency_Ratio', 'Old_Age_Dependency_Ratio']].head(20))

In [None]:
# Write the enriched dataset, leaving out the intermediate *_Zscore columns.
is_zscore_col = df.columns.str.endswith('_Zscore')
final_cols = df.columns[~is_zscore_col].tolist()

df.loc[:, final_cols].to_csv('GN_population_final_analysis.csv', index=False)
print("Final dataset saved to 'GN_population_final_analysis.csv'")
print(f"Total columns: {len(final_cols)}")

---
## Summary

This analysis covered:
- **Sex Ratio Analysis**: Identified gender-imbalanced GNs
- **Dependency Ratios**: Child (0-14) and old-age (60+) dependents per 100 working-age (15-59) residents
- **K-Means Clustering**: Created demographic profiles (Aging Villages, Young Families, etc.)
- **Resource Allocation**: Ranked GNs for school and elder care prioritization
- **Anomaly Detection**: Flagged statistically unusual GN divisions

### Output Files:
- `GN_population_final_analysis.csv` - Full dataset with clusters and priorities
- PNG visualizations for reporting