# Day 1: Project Setup

**WISE Workshop | Addis Ababa, Feb 2026**

In this notebook, you'll set up a reproducible analysis project and explore the workshop dataset.

## Part 1: Environment Setup

In [None]:
# Report which interpreter is running and whether this is a Colab session.
# Colab preloads the `google.colab` package, so its presence in sys.modules
# distinguishes a hosted session from a local one.
import sys
print("Python:", sys.version)
if 'google.colab' in sys.modules:
    print("Environment: Colab")
else:
    print("Environment: Local")

In [None]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Settings
pd.set_option('display.max_columns', 50)
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

print("Packages loaded successfully!")

## Part 2: Load the Workshop Dataset

In [None]:
# Load supply chain data from GitHub.
# If the download or parsing fails for any reason, fall back to a synthetic
# dataset with the same columns the rest of the notebook uses.
url = "https://raw.githubusercontent.com/ssylvia/ethiopia-ds-workshop-2026/main/data/supply-chain-sample.csv"

try:
    df = pd.read_csv(url)
    print(f"Data loaded successfully! Shape: {df.shape}")
except Exception as load_error:
    # Deliberately broad (network errors, HTTP errors, malformed CSV) so the
    # workshop can continue offline, but `Exception` rather than a bare
    # `except:` so that interrupting the kernel during a slow download
    # (KeyboardInterrupt) is not silently swallowed.
    print(f"Could not load data from URL ({type(load_error).__name__}: {load_error})")
    print("Creating sample data for demonstration...")
    np.random.seed(42)  # fixed seed: every run produces the same sample
    n_rows = 500

    df = pd.DataFrame({
        'facility_id': np.random.choice(['ETH001', 'ETH002', 'ETH003', 'ETH004', 'ETH005'], n_rows),
        'region': np.random.choice(['Addis Ababa', 'Oromia', 'Amhara', 'SNNP', 'Tigray'], n_rows),
        # Facility types are drawn with unequal weights (20% / 50% / 30%).
        'facility_type': np.random.choice(['Hospital', 'Health Center', 'Clinic'], n_rows, p=[0.2, 0.5, 0.3]),
        # One consecutive day per row starting 2023-01-01, stored as a 'YYYY-MM' string.
        'date': pd.date_range('2023-01-01', periods=n_rows, freq='D').strftime('%Y-%m'),
        'medication_class': np.random.choice(['Antibiotics', 'Antimalarials', 'Chronic Disease', 'Vaccines', 'Other'], n_rows),
        # Poisson(100) base demand plus a uniform integer offset in [0, 50).
        'demand': np.random.poisson(100, n_rows) + np.random.randint(0, 50, n_rows),
        'stock_level': np.random.poisson(150, n_rows),
        'lead_time_days': np.random.choice([7, 14, 21, 30], n_rows, p=[0.3, 0.4, 0.2, 0.1])
    })
    print(f"Sample data created! Shape: {df.shape}")

## Part 3: Data Exploration

In [None]:
# First look at the data: preview the first five rows (the default for head())
df.head()

In [None]:
# Data types and missing values: info() lists each column's dtype and
# non-null count, so columns with fewer non-null entries than rows have gaps
df.info()

In [None]:
# Summary statistics (for a frame with mixed dtypes, describe() summarises
# only the numeric columns by default)
df.describe()

In [None]:
# Print a frequency table for each categorical column
for col in ('region', 'facility_type', 'medication_class'):
    category_counts = df[col].value_counts()
    print(f"\n{col}:", category_counts, sep="\n")

## Part 4: Initial Visualizations

In [None]:
# Histogram of the demand column (30 bins, outlined bars)
fig, ax = plt.subplots(figsize=(10, 4))
df['demand'].hist(ax=ax, bins=30, edgecolor='black')
ax.set(
    xlabel='Demand (units)',
    ylabel='Frequency',
    title='Distribution of Demand',
)
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of mean demand per region, sorted ascending
fig, ax = plt.subplots(figsize=(10, 5))
(
    df.groupby('region')['demand']
    .mean()
    .sort_values()
    .plot.barh(ax=ax)
)
ax.set(
    xlabel='Average Demand',
    ylabel='Region',
    title='Average Demand by Region',
)
fig.tight_layout()
plt.show()

In [None]:
# Box plot comparing the demand distribution across facility types
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='facility_type', y='demand', data=df, ax=ax)
ax.set(
    xlabel='Facility Type',
    ylabel='Demand',
    title='Demand Distribution by Facility Type',
)
fig.tight_layout()
plt.show()

## Part 5: Save Your Work

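Don't forget to save a copy of this notebook to your Google Drive!

**File > Save a copy in Drive**

If you are running the notebook locally rather than in Colab, save it from Jupyter's **File** menu instead.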

## Summary

In this notebook, you:
- Set up your Python environment
- Loaded and explored the workshop dataset
- Created initial visualizations

**Next:** Day 2 - Machine Learning Pipeline