## 1. Import Libraries and Load Data

In [9]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)

print("Libraries imported successfully!")

Libraries imported successfully!


In [10]:
# Read the income-classification CSV (path is relative to the notebook's directory)
df = pd.read_csv('../../data/raw/Income-category-wise-countries.csv')

# Report the frame's dimensions, then preview its first rows under a banner
n_rows, n_cols = df.shape
separator = "="*60
print(f"Dataset Shape: {n_rows:,} rows × {n_cols} columns")
print("\n" + separator)
print("First 5 rows of the dataset:")
print(separator)
df.head()

Dataset Shape: 265 rows × 6 columns

First 5 rows of the dataset:


Unnamed: 0,Country Code,Region,IncomeGroup,SpecialNotes,TableName,Unnamed: 5
0,ABW,Latin America & Caribbean,High income,,Aruba,
1,AFE,,,"26 countries, stretching from the Red Sea in t...",Africa Eastern and Southern,
2,AFG,Middle East & North Africa,Low income,The reporting period for national accounts dat...,Afghanistan,
3,AFW,,,"22 countries, stretching from the westernmost ...",Africa Western and Central,
4,AGO,Sub-Saharan Africa,Lower middle income,The World Bank systematically assesses the app...,Angola,


## 2. Data Cleaning and Preprocessing

In [11]:
# Check for missing values
print("Missing Values:")
print(df.isnull().sum())

# In the recorded run Region and IncomeGroup have different null counts (48 vs 50), so
# some rows carry a Region but no IncomeGroup. Those are presumably economies the source
# leaves unclassified rather than aggregates like 'World' or 'Arab World' (TODO confirm);
# list them so their exclusion from the analysis is visible instead of silent.
unclassified = df.loc[df['IncomeGroup'].isna() & df['Region'].notna(), 'TableName']
if not unclassified.empty:
    print(f"\nExcluded (Region present, IncomeGroup missing): {', '.join(unclassified.astype(str))}")

# Columns with no values at all in the raw data (e.g. 'Unnamed: 5', likely a
# trailing-delimiter artifact) add nothing to the analysis frame.
empty_columns = df.columns[df.isna().all()]

# Build the analysis frame without mutating the raw `df`:
# drop rows where IncomeGroup is missing, then drop the fully empty columns.
df_clean = df.dropna(subset=['IncomeGroup']).drop(columns=empty_columns)
print(f"\nCleaned Dataset Shape: {df_clean.shape[0]:,} rows")

Missing Values:
Country Code      0
Region           48
IncomeGroup      50
SpecialNotes    135
TableName         0
Unnamed: 5      265
dtype: int64

Cleaned Dataset Shape: 215 rows


## 3. Distribution Analysis

In [12]:
# Distribution of Countries by Income Group
income_counts = df_clean['IncomeGroup'].value_counts().reset_index()
# Overwrite the column names explicitly: the names produced by
# value_counts().reset_index() differ between pandas versions.
income_counts.columns = ['Income Group', 'Count']

# Income groups are ordinal, so show them from lowest to highest instead of by count,
# and reuse the colours of the choropleth so a group looks the same in every figure.
income_order = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
income_colors = {
    'High income': 'forestgreen',
    'Upper middle income': 'yellowgreen',
    'Lower middle income': 'orange',
    'Low income': 'firebrick',
}

fig = px.bar(income_counts, x='Income Group', y='Count', color='Income Group',
             title='Distribution of Countries by Income Group',
             text='Count',
             category_orders={'Income Group': income_order},
             color_discrete_map=income_colors,
             # Fallback palette for any group not listed in income_colors
             color_discrete_sequence=px.colors.qualitative.Set3)
fig.update_layout(height=500)
fig.show()

In [13]:
# Regional Distribution of Income Groups
# One row per (Region, IncomeGroup) pair with the number of entries in that pair
region_income = df_clean.groupby(['Region', 'IncomeGroup']).size().reset_index(name='Count')

# Stack the segments in ordinal order (lowest income at the bottom) rather than the
# alphabetical order groupby yields, and reuse the choropleth colours so a group
# looks the same in every figure.
income_order = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
income_colors = {
    'High income': 'forestgreen',
    'Upper middle income': 'yellowgreen',
    'Lower middle income': 'orange',
    'Low income': 'firebrick',
}

fig = px.bar(region_income, x='Region', y='Count', color='IncomeGroup',
             title='Income Group Distribution by Region',
             barmode='stack',
             category_orders={'IncomeGroup': income_order},
             color_discrete_map=income_colors,
             # Fallback palette for any group not listed in income_colors
             color_discrete_sequence=px.colors.qualitative.Set3,
             labels={'IncomeGroup': 'Income Group'})
# Order regions by their total count, largest first
fig.update_layout(height=600, xaxis={'categoryorder':'total descending'})
fig.show()

## 4. Global Map Visualization

In [14]:
# World map coloured by income classification.
# Each income group is pinned to a fixed colour (green for high, red for low).
income_palette = {
    'High income': 'forestgreen',
    'Upper middle income': 'yellowgreen',
    'Lower middle income': 'orange',
    'Low income': 'firebrick',
}

fig = px.choropleth(
    df_clean,
    locations='Country Code',   # column holding the location codes plotted on the map
    color='IncomeGroup',
    hover_name='TableName',     # shown as the tooltip heading
    title='Global Distribution of Income Groups',
    color_discrete_map=income_palette,
)
fig.update_layout(height=600)
fig.show()

---
## Summary of Findings

### Key Observations
1. **Distribution**: High-income countries are concentrated in Europe and North America, while low-income countries are concentrated in Africa.
2. **Regional Disparities**: Sub-Saharan Africa has the highest proportion of low-income countries.

### Implications
- **Health Resources**: Income level strongly dictates healthcare infrastructure.
- **Nutrition Transition**: Income growth drives dietary changes (more processed food).

### Next Steps
- Use Income Group as a categorical variable to segment Diabetes and Obesity analysis.