In [1]:
import pandas as pd
import numpy as np

In [2]:
#read data 

#sheet_url = "../../NESIP_FIELD_DATA/data.xlsx"

#sheet_ea_passed = "2. Energy Access(Passed)"

#ea_passed = pd.read_excel(sheet_url, sheet_ea_passed)


In [3]:
# Read data
sheet_url = "../../NESIP_FIELD_DATA/data.xlsx"

sheet_ea_passed = "2. Energy Access(Passed)"

tariff_df = "disco_information_lga"

# Load the sheet into a DataFrame
energy_access_data = pd.read_excel(sheet_url, sheet_name=sheet_ea_passed, dtype=str)

# Load the tariff sheet into a DataFrame
tariff_data = pd.read_excel(sheet_url, sheet_name=tariff_df)


#### 2. Availability

###### Daytime Availability

In [4]:
# Define mapping dictionary
tier_mapping = {
    "0 hour": "Tier 0",
    "1 hour": "Tier 1",
    "2 hours": "Tier 2",
    "3 hours": "Tier 3",
    "4 hours": "Tier 4",
    "> 4 hours": "Tier 5"
}

In [5]:
# Strip spaces and map values
energy_access_data["Daytime Availability"] = energy_access_data["2G. During the day (6 am–6 pm), how many hours of electricity do you get from the primary sources?"].str.strip().map(tier_mapping)

# Fill NaN values with "No Tier"
energy_access_data["Daytime Availability"].fillna("No Tier", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_access_data["Daytime Availability"].fillna("No Tier", inplace=True)


In [6]:
# Display result
print(energy_access_data[["Daytime Availability"]].head())

  Daytime Availability
0               Tier 5
1              No Tier
2               Tier 2
3               Tier 2
4               Tier 3


###### Evening Availability

In [7]:
# Define mapping dictionary
tier_mapping = {
    "0 hour": "Tier 0",
    "1 hour": "Tier 1",
    "2 hours": "Tier 2",
    "3 hours": "Tier 3",
    "4 hours": "Tier 4",
    "> 4 hours": "Tier 5"
}

In [8]:
# Strip spaces and map values
energy_access_data["Evening Availability"] = energy_access_data["2H. At Night (6 pm–6 am), how many hours of electricity do you get from the primary sources?"].str.strip().map(tier_mapping)

# Fill NaN values with "No Tier"
energy_access_data["Evening Availability"].fillna("No Tier", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_access_data["Evening Availability"].fillna("No Tier", inplace=True)


In [9]:
# Display result
print(energy_access_data[["Evening Availability"]].head())

  Evening Availability
0               Tier 5
1              No Tier
2               Tier 4
3               Tier 2
4               Tier 1


##### DAILY AVAILABILITY

In [10]:
# Define column names
day_col = "2G. During the day (6 am–6 pm), how many hours of electricity do you get from the primary sources?"
night_col = "2H. At Night (6 pm–6 am), how many hours of electricity do you get from the primary sources?"

# Remove "hour" and "hours" while converting to string safely
energy_access_data[[day_col, night_col]] = energy_access_data[[day_col, night_col]].astype(str).replace({"hour": "", "s": ""}, regex=True).apply(lambda x: x.str.strip())


In [11]:
# Remove "hour" and "hours" while converting to string safely
energy_access_data[[day_col, night_col]].head()

Unnamed: 0,"2G. During the day (6 am–6 pm), how many hours of electricity do you get from the primary sources?","2H. At Night (6 pm–6 am), how many hours of electricity do you get from the primary sources?"
0,> 4,> 4
1,,
2,2,4
3,2,2
4,3,1


disruptions_col = "2I. Do you experience disruptions from your primary source of electricity supply?"
outages_col = "2J. How often do you experience power outages?"
interruptions_col = "2K. On an average day, how many times does the electricity supply get interrupted?"
restoration_col = "2L. When the electricity is interrupted, how long does it typically take to be restored?"

# Function to clean and convert numeric values
def clean_numeric(value):
    if pd.isna(value) or value in ["", "None"]:
        return 0
    elif value == "> 4":
        return 8  # Assume > 4 means 8
    elif value == "0 - 1":
        return 1
    elif value == "> 10":
        return 12
    return float(value)

# Function to compute total hours of power supply
def compute_total_hours(row):
    day_val = clean_numeric(row[day_col])
    night_val = clean_numeric(row[night_col])

    # If both are ≤ 4, simply sum them
    if day_val <= 4 and night_val <= 4:
        return day_val + night_val

    # If either is > 4, apply escalation logic
    if day_val > 4 or night_val > 4:
        if not row[disruptions_col]:  # No disruptions
            return 24
        elif row[outages_col] == "Rarely (less than 4 times per week)":
            return 22
        elif row[outages_col] == "Occasionally (4–8 times per week)":
            if row[restoration_col] == "More than 4 hours":
                return 16
            elif row[restoration_col] == "Less than 30 minutes":
                return 23
            elif row[restoration_col] == "2 - 4 hours":
                return 21
            elif row[restoration_col] == "1–2 hours":
                return 22

    return day_val + night_val

# Apply function to compute total hours
energy_access_data["total_hours"] = energy_access_data.apply(compute_total_hours, axis=1)

# Function to adjust for frequent daily interruptions **AFTER total_hours is computed**
def adjust_for_interruptions(row):
    total_hours = row["total_hours"] if pd.notna(row["total_hours"]) else 0
    times_interrupted = clean_numeric(row.get(interruptions_col, 0))

    if row.get(outages_col, "") == "Frequently (daily interruptions)":
        if row.get(restoration_col, "") == "Less than 30 minutes":
            return max(0, total_hours - (0.5 * times_interrupted))
        elif row.get(restoration_col, "") == "30 minutes–1 hour":
            return max(0, total_hours - (1 * times_interrupted))
        elif row.get(restoration_col, "") == "1–2 hours":
            return max(0, total_hours - (2 * times_interrupted))
        elif row.get(restoration_col, "") == "2 - 4 hours":
            return max(0, total_hours - (2 * times_interrupted))
        elif row.get(restoration_col, "") == "More than 4 hours":
            return max(0, total_hours - (8 * times_interrupted))

    return total_hours

# Apply the adjustment function
energy_access_data["total_hours"] = energy_access_data.apply(adjust_for_interruptions, axis=1)

# Display final DataFrame
print(energy_access_data["total_hours"].head(15))


In [12]:
disruptions_col = "2I. Do you experience disruptions from your primary source of electricity supply?"
outages_col = "2J. How often do you experience power outages?"
interruptions_col = "2K. On an average day, how many times does the electricity supply get interrupted?"
restoration_col = "2L. When the electricity is interrupted, how long does it typically take to be restored?"

# Function to clean and convert numeric values
def clean_numeric(value):
    if pd.isna(value) or value in ["", "None"]:
        return 0
    value = str(value).replace("hours", "").replace("hour", "").strip()  # Remove "hours" from string
    if value == "> 4":
        return 8  # Assuming "> 4" means 8 hours (worst case)
    elif value == "0 - 1":
        return 1
    elif value == "> 10":
        return 12
    try:
        return float(value)
    except:
        return 0

# Function to compute total hours
def compute_total_hours(row):
    # Clean and convert numeric values
    day_val = clean_numeric(row[day_col])
    night_val = clean_numeric(row[night_col])
    disruptions = row[disruptions_col]
    outage_freq = row[outages_col]
    times_interrupted = str(row[interruptions_col]).strip()
    restoration_time = str(row[restoration_col]).strip()  # Ensure it's a string

    # Convert "> 4" to 8 for calculations
    day_val = 8 if row[day_col] == "> 4" else day_val
    night_val = 8 if row[night_col] == "> 4" else night_val

    # Convert times_interrupted values
    if times_interrupted == "0 - 1":
        times_interrupted = 1
    elif times_interrupted == "> 10":
        times_interrupted = 12
    else:
        times_interrupted = clean_numeric(times_interrupted)

    total_hours = 0  # Default value

    # **🔹 Step 1: If both values are ≤ 4, sum them up and return the result**
    if day_val <= 4 and night_val <= 4:
        return day_val + night_val

    # **🔹 Step 2: Handle cases where both day and night values are greater than 4**
    if day_val > 4 and night_val > 4:
        if not disruptions:  # Column c is FALSE
            return 24
        elif outage_freq == "Rarely (less than 4 times per week)":
            return 22
        elif outage_freq == "Occasionally (4–8 times per week)":
            if "More than 4 hours" in restoration_time:
                return 16
            elif "Less than 30 minutes" in restoration_time:
                return 23
            elif "2 - 4 hours" in restoration_time:
                return 21
            elif "1–2 hours" in restoration_time:
                return 22

    # **🔹 Step 3: Handle "Frequently (daily interruptions)" cases**
    if outage_freq == "Frequently (daily interruptions)":
        if "Less than 30 minutes" in restoration_time:
            total_hours = max(0, 24 - (0.5 * times_interrupted))
        elif "30 minutes–1 hour" in restoration_time:
            total_hours = max(0, 24 - (1 * times_interrupted))
        elif "1–2 hours" in restoration_time:
            total_hours = max(0, 24 - (2 * times_interrupted))
        elif "2 - 4 hours" in restoration_time:
            total_hours = max(0, 24 - (2 * times_interrupted))
        elif "More than 4 hours" in restoration_time:
            total_hours = max(0, 24 - (8 * times_interrupted))
        return total_hours

    # **🔹 Step 4: If neither special case applies, sum the values (normal behavior)**
    return day_val + night_val


# Apply function
energy_access_data["total_hours"] = energy_access_data.apply(compute_total_hours, axis=1)

# Display DataFrame
print(energy_access_data[["total_hours"]].head(15))


    total_hours
0          16.0
1           NaN
2           6.0
3           4.0
4           4.0
5           NaN
6           NaN
7          16.0
8           8.0
9           NaN
10          7.0
11          NaN
12          NaN
13          6.0
14          8.0


In [13]:
# Function to categorize into tiers
def categorize_tier(value):
    if value == "no connection" or pd.isna(value):
        return "Tier 0"
    elif value < 4:
        return "Tier 0"
    elif 4 <= value <= 7:
        return "Tier 1"
    elif 8 <= value <= 15:
        return "Tier 3"
    elif 16 <= value <= 22:
        return "Tier 4"
    elif 23 <= value <= 24:
        return "Tier 5"
    else:
        return "Unknown"

In [14]:
# Apply function to create tier_category column
energy_access_data["Daily Availability"] = energy_access_data["total_hours"].map(categorize_tier)

In [15]:
# Display result
print(energy_access_data["Daily Availability"].head(15))

0     Tier 4
1     Tier 0
2     Tier 1
3     Tier 1
4     Tier 1
5     Tier 0
6     Tier 0
7     Tier 4
8     Tier 3
9     Tier 0
10    Tier 1
11    Tier 0
12    Tier 0
13    Tier 1
14    Tier 3
Name: Daily Availability, dtype: object


#### 3. Reliability

In [16]:
# Mapping values for response categories
short_durations = ['Less than 30 minutes', '30 minutes–1 hour', '1–2 hours']
long_durations = ['3 - 4 hours', 'More than 4 hours']



# Function to determine reliability tier
def classify_tier(row):
    col_a = row['2J. How often do you experience power outages?']
    col_b = row['2K. On an average day, how many times does the electricity supply get interrupted?']
    col_c = row['2L. When the electricity is interrupted, how long does it typically take to be restored?']

    # Handling NaN values
    if pd.isna(col_a):
        return "Tier 0"
    
    # Rarely cases
    if col_a == 'Rarely (less than 4 times per week)':
        return "Tier 5" if col_c in ['Less than 30 minutes', '30 minutes–1 hour', '1–2 hours'] else "Tier 4"

    # Occasionally cases
    if col_a == 'Occasionally (4–8 times per week)':
        return "Tier 4" if col_c in ['Less than 30 minutes', '30 minutes–1 hour', '1–2 hours'] else "Tier 3"

    # Frequently cases
    if col_a == 'Frequently (daily interruptions)':
        # Normalize col_b values
        if str(col_b).strip() in ["0 - 1", "0-1"]:  # Handle both "0 - 1" and "0-1"
            col_b = 1
        elif str(col_b).strip() in [">10", "> 10"]:  # Handle both ">10" and "> 10"
            col_b = 12
        else:
            try:
                col_b = int(col_b)  # Convert numeric values
            except ValueError:
                return "Tier 1"  # Default case if conversion fails

        interruptions_per_week = col_b * 7

        return "Tier 3" if interruptions_per_week <= 14 else "Tier 1"

    return "Tier 1"


In [17]:
# Apply function to create new column
energy_access_data['Reliability'] = energy_access_data.apply(classify_tier, axis=1)

In [18]:
# Display result
print(energy_access_data['Reliability'].head(15))

0     Tier 3
1     Tier 0
2     Tier 4
3     Tier 3
4     Tier 3
5     Tier 0
6     Tier 0
7     Tier 3
8     Tier 3
9     Tier 0
10    Tier 3
11    Tier 0
12    Tier 0
13    Tier 1
14    Tier 1
Name: Reliability, dtype: object


#### 4. Quality

In [19]:
# Function to determine the tier
def map_quality(row):
    challenges = row["2R. If yes, what are the main challenges you face with your primary electricity source? (Outages, cost, low voltage, etc.)"]
    quality = row["2U. How would you describe the quality of energy supply to your household"]
    
    if pd.isna(challenges) or pd.isna(quality) or (challenges.strip() == "" and quality.strip() == ""):
        return "No Tier"
    
    if "Low Voltage" in challenges:
        if quality == "Good quality of energy supply, Voltage does not affect use of appliances":
            return "Tier 2"

    if quality == "Good quality of energy supply, Voltage does not affect use of appliances":
        return "Tier 5"
    
    if quality == "Poor Quality (Damages or cannot operate appliances)":
        return "Tier 1"

    return "No Tier"



In [20]:
# Apply transformation
energy_access_data["Quality"] = energy_access_data.apply(map_quality, axis=1)

# Display the first few rows to verify
print(energy_access_data[["Quality"]].head())


   Quality
0   Tier 5
1  No Tier
2   Tier 1
3   Tier 1
4   Tier 2


##### 5. Affordability

In [21]:
# Convert numerical column to numeric type, forcing errors to NaN for empty values
energy_access_data["2P. how much do you pay for the primary source of electricity for the period mentioned earlier? (Exact Amount Specified n ₦)"] = pd.to_numeric(
    energy_access_data["2P. how much do you pay for the primary source of electricity for the period mentioned earlier? (Exact Amount Specified n ₦)"], errors="coerce"
)

# Mapping of frequency to yearly multiplier
multipliers = {
    "Daily": 365,
    "Weekly": 52,
    "Bi-Weekly": 26,
    "Monthly": 12,
    "Quarterly": 4,
    "Yearly": 1
}

# Compute yearly and monthly payments, keeping NaNs
energy_access_data["yearly grid payment"] = energy_access_data["2N. How often do you pay for the primary source of energy?"].map(multipliers) * energy_access_data["2P. how much do you pay for the primary source of electricity for the period mentioned earlier? (Exact Amount Specified n ₦)"]
energy_access_data["monthly grid payment"] = energy_access_data["yearly grid payment"] / 12  # Convert yearly to monthly


In [22]:
energy_access_data["1L(iii). What is your total monthly household income?, Specify in ₦."] = pd.to_numeric(energy_access_data["1L(iii). What is your total monthly household income?, Specify in ₦."], errors="coerce")

energy_access_data["monthly grid payment"].head()

0     8666.666667
1             NaN
2    12000.000000
3     5000.000000
4     2000.000000
Name: monthly grid payment, dtype: float64

In [23]:
# Function to determine "Legality"
def determine_affordability(monthly_pay, monthly_allowance):
    if monthly_pay >= (0.05 * monthly_allowance):
        return "Tier 1"
    elif monthly_pay < (0.05 * monthly_allowance):
        return "Tier 5"
    else:
        return "No Tier"  # Return blank if none of the conditions match

In [24]:
monthly_grid_spend = "monthly grid payment"
household_income = "1L(iii). What is your total monthly household income?, Specify in ₦."

# Apply function to create "Legality" column
energy_access_data["Affordability"] = energy_access_data.apply(
    lambda row: determine_affordability(row[monthly_grid_spend], row[household_income]), axis=1
)

# Display the first few rows to verify
print(energy_access_data[["Affordability"]].head())

  Affordability
0        Tier 1
1       No Tier
2        Tier 1
3        Tier 5
4        Tier 5


#### 5. Legality

In [25]:
# Column names
metered_col = "5D. Are you metered?"
payment_col = "5E. If not metered, How do you pay for electricity?"


In [26]:
# Clean and normalize the data
energy_access_data[metered_col] = energy_access_data[metered_col].astype(str).str.strip().str.lower()
energy_access_data[payment_col] = energy_access_data[payment_col].astype(str).str.strip().str.lower()

In [27]:
# Function to determine "Legality"
def determine_legality(metered, payment):
    if metered in ["yes, with a prepaid meter", "yes, with a postpaid meter"]:
        return "Tier 5"
    elif payment in [
        "through community billing arrangements (e.g., shared connections in the neighborhood)",
        "through a flat rate or estimated billing system provided by the electricity company"
    ]:
        return "Tier 5"
    elif payment == "do not pay for electricity":
        return "Tier 0"
    else:
        return "No Tier"  # Return blank if none of the conditions match

In [28]:
# Apply function to create "Legality" column
energy_access_data["Legality"] = energy_access_data.apply(
    lambda row: determine_legality(row[metered_col], row[payment_col]), axis=1
)

# Display the first few rows to verify
print(energy_access_data[["Legality"]].head(20))


   Legality
0    Tier 5
1   No Tier
2    Tier 5
3    Tier 5
4    Tier 5
5   No Tier
6   No Tier
7    Tier 5
8    Tier 5
9   No Tier
10   Tier 5
11   Tier 5
12  No Tier
13   Tier 5
14   Tier 5
15   Tier 5
16  No Tier
17   Tier 5
18  No Tier
19   Tier 5


#### 6. Health and Safety

In [29]:
# Column name
col_name = "6B. Have you or anyone in your household experienced an incident caused by an electricity connection (e.g., shock, fire)?"

# Clean and normalize values
energy_access_data[col_name] = (
    energy_access_data[col_name]
    .str.strip()  # Remove leading/trailing spaces
    .str.lower()  # Convert to lowercase for consistency
)

In [30]:
# Mapping function
def map_health_safety(value):
    if value == "true":
        return "Tier 0"
    elif value == "false":
        return "Tier 5"
    else:
        return "No Tier"  # Keep NaN as an empty string

In [31]:
# Apply transformation
energy_access_data["Health and Safety"] = energy_access_data[col_name].apply(map_health_safety)

# Display the first few rows to verify
print(energy_access_data[["Health and Safety"]].head())


  Health and Safety
0            Tier 5
1           No Tier
2            Tier 5
3            Tier 5
4            Tier 5


#### 1. CAPACITY

In [32]:
# Merging df1 with df2 using 'State' and 'LGA' as keys
merged_df_for_capacity = energy_access_data.merge(tariff_data, on=['State', 'LGA'], how='left')

In [33]:
import numpy as np

# Define a function to determine the correct tariff band based on 'total_hours'
def select_tariff(row):
    if row['total_hours'] >= 20:
        return row['BAND A']
    elif 16 <= row['total_hours'] <= 19:
        return row['BAND B']
    elif 12 <= row['total_hours'] <= 15:
        return row['BAND C']
    elif 8 <= row['total_hours'] <= 11:
        return row['BAND D']
    elif 1 <= row['total_hours'] <= 7:
        return row['BAND E']
    else:
        return np.nan  # If total_hours is 0 or NaN, return NaN

# Apply the function to select the tariff
merged_df_for_capacity['selected_tariff'] = merged_df_for_capacity.apply(select_tariff, axis=1)

# Calculate Capacity in kWh
merged_df_for_capacity['Capacity_in_kwh'] = np.where(
    (merged_df_for_capacity['total_hours'] == 0) | (merged_df_for_capacity['total_hours'].isna()),
    0,
    merged_df_for_capacity['monthly grid payment'] / merged_df_for_capacity['selected_tariff']
)



In [34]:
# Display the final DataFrame
print(merged_df_for_capacity['Capacity_in_kwh'].head(50))

0      125.676721
1        0.000000
2      291.191458
3       90.203861
4       36.081544
5        0.000000
6        0.000000
7       58.004640
8      234.530038
9        0.000000
10      27.061158
11       0.000000
12       0.000000
13     132.479576
14      79.792539
15      97.063819
16       0.000000
17      62.052506
18       0.000000
19     188.797986
20     409.062303
21       0.000000
22     121.329774
23      72.797865
24      72.797865
25       0.000000
26     119.688809
27      97.063819
28      72.797865
29     171.526587
30     121.329774
31     184.632865
32     121.329774
33      72.797865
34      85.763293
35      42.881647
36     266.925504
37      21.440823
38      36.081544
39      14.319809
40      87.006961
41       0.000000
42       0.000000
43       0.000000
44       0.000000
45    6283.836040
46      27.061158
47     126.285405
48      90.203861
49       0.000000
Name: Capacity_in_kwh, dtype: float64


In [35]:
# Define tier bins and labels
bins = [-float('inf'), 0.012, 0.2, 1.0, 3.4, 8.2, float('inf')]
labels = ['Tier 0', 'Tier 1', 'Tier 2', 'Tier 3', 'Tier 4', 'Tier 5']

# Assign tiers based on Capacity_in_kWh
merged_df_for_capacity['Capacity'] = pd.cut(merged_df_for_capacity['Capacity_in_kwh'], bins=bins, labels=labels, right=False)



In [36]:
# Display the updated DataFrame
print(merged_df_for_capacity['Capacity'].head(15))

0     Tier 5
1     Tier 0
2     Tier 5
3     Tier 5
4     Tier 5
5     Tier 0
6     Tier 0
7     Tier 5
8     Tier 5
9     Tier 0
10    Tier 5
11    Tier 0
12    Tier 0
13    Tier 5
14    Tier 5
Name: Capacity, dtype: category
Categories (6, object): ['Tier 0' < 'Tier 1' < 'Tier 2' < 'Tier 3' < 'Tier 4' < 'Tier 5']


##### Dashboard Analysis - PER HOUSEHOLD

In [37]:
# Create an empty DataFrame
per_household_summary = pd.DataFrame()

##### 1. State and LGA

In [38]:
# Assign specific columns (with their values)
per_household_summary['State'] = merged_df_for_capacity['State']
per_household_summary['LGA'] = merged_df_for_capacity['LGA']

print(per_household_summary.head())


         State                LGA
0  Cross River  Calabar Municipal
1  Cross River  Calabar Municipal
2        Delta      Ughelli North
3      Bayelsa              Nembe
4  Cross River  Calabar Municipal


##### 2. Health and Safety

In [39]:
# Assign specific columns (with their values)
per_household_summary['Health and Safety'] = merged_df_for_capacity['Health and Safety']

print(per_household_summary.head())


         State                LGA Health and Safety
0  Cross River  Calabar Municipal            Tier 5
1  Cross River  Calabar Municipal           No Tier
2        Delta      Ughelli North            Tier 5
3      Bayelsa              Nembe            Tier 5
4  Cross River  Calabar Municipal            Tier 5


##### 3. Legality

In [40]:
# Assign specific columns (with their values)
per_household_summary['Legality'] = merged_df_for_capacity['Legality']

print(per_household_summary['Legality'].head())


0     Tier 5
1    No Tier
2     Tier 5
3     Tier 5
4     Tier 5
Name: Legality, dtype: object


##### 4. Affordability

In [41]:
# Assign specific columns (with their values)
per_household_summary['Affordability'] = merged_df_for_capacity['Quality']

print(per_household_summary['Affordability'].head())


0     Tier 5
1    No Tier
2     Tier 1
3     Tier 1
4     Tier 2
Name: Affordability, dtype: object


##### 5. Quality

In [42]:
# Assign specific columns (with their values)
per_household_summary['Quality'] = merged_df_for_capacity['Quality']

print(per_household_summary['Quality'].head())


0     Tier 5
1    No Tier
2     Tier 1
3     Tier 1
4     Tier 2
Name: Quality, dtype: object


##### 6. Reliability

In [43]:
# Assign specific columns (with their values)
per_household_summary['Reliability'] = merged_df_for_capacity['Reliability']

print(per_household_summary['Reliability'].head())


0    Tier 3
1    Tier 0
2    Tier 4
3    Tier 3
4    Tier 3
Name: Reliability, dtype: object


##### 7. Daily Availability

In [44]:
# Assign specific columns (with their values)
per_household_summary["Daily Availability"] = merged_df_for_capacity["Daily Availability"]

print(per_household_summary["Daily Availability"].head())


0    Tier 4
1    Tier 0
2    Tier 1
3    Tier 1
4    Tier 1
Name: Daily Availability, dtype: object


##### 8. Daytime Availability

In [45]:
# Assign specific columns (with their values)
per_household_summary['Daytime Availability'] = merged_df_for_capacity['Daytime Availability']

print(per_household_summary['Daytime Availability'].head())


0     Tier 5
1    No Tier
2     Tier 2
3     Tier 2
4     Tier 3
Name: Daytime Availability, dtype: object


##### 9. Night time Availability

In [46]:
# Assign specific columns (with their values)
per_household_summary['Evening Availability'] = merged_df_for_capacity['Evening Availability']

print(per_household_summary['Evening Availability'].head())


0     Tier 5
1    No Tier
2     Tier 4
3     Tier 2
4     Tier 1
Name: Evening Availability, dtype: object


##### 10. Capacity

In [55]:
# Assign specific columns (with their values)
per_household_summary['Capacity'] = merged_df_for_capacity['Capacity']

print(per_household_summary['Capacity'].head())


0    Tier 5
1    Tier 0
2    Tier 5
3    Tier 5
4    Tier 5
Name: Capacity, dtype: category
Categories (6, object): ['Tier 0' < 'Tier 1' < 'Tier 2' < 'Tier 3' < 'Tier 4' < 'Tier 5']


In [None]:
national_level_df = per_household_summary.copy()
state_level_df = per_household_summary.copy()

# write to csv

In [48]:
# Drop unnecessary index column if it exists
per_household_summary = per_household_summary.drop(columns=["Unnamed: 0"], errors='ignore')

# Trim spaces from column names
per_household_summary.columns = per_household_summary.columns.str.strip()

# Function to extract the highest tier from a given string
def extract_highest_tier(value):
    if isinstance(value, str):
        if "No Tier" in value:
            return 0  # Assign 0 for "No Tier"
        tiers = [int(s) for s in value.split() if s.isdigit()]  # Extract numbers
        return max(tiers) if tiers else np.nan  # Get the highest tier or NaN if empty
    return np.nan

# Define metric columns
metric_columns = [
    "Health and Safety", "Legality", "Affordability", "Quality",
    "Reliability", "Daily Availability", "Daytime Availability",
    "Evening Availability", "Capacity"
]

# Debug: Print unique values in one column to check format
print("Sample unique values from Capacity column:", per_household_summary["Capacity"].unique()[:10])

# Apply extraction function to all metric columns
per_household_summary[metric_columns] = per_household_summary[metric_columns].applymap(extract_highest_tier)

# Debug: Check if data was correctly processed
print("After extraction, Capacity column unique values:", per_household_summary["Capacity"].unique()[:10])

# Define tier thresholds based on the energy access framework
tier_thresholds = {
    "Health and Safety": [0, 1, 2, 3, 4, 5],
    "Legality": [0, 1, 2, 3, 4, 5],
    "Affordability": [0, 1, 2, 3, 4, 5],
    "Quality": [0, 1, 2, 3, 4, 5],
    "Reliability": [0, 1, 2, 3, 4, 5],
    "Daily Availability": [0, 4, 8, 16, 23],
    "Daytime Availability": [0, 4, 8, 16, 23],
    "Evening Availability": [0, 1, 2, 3, 4],
    "Capacity": [0, 12, 200, 1000, 3400, 8200]
}

# Function to assign tiers based on thresholds
def assign_tier(value, thresholds, reverse=False):
    if pd.isna(value):
        return np.nan
    if reverse:  # Lower values indicate higher tiers
        thresholds = thresholds[::-1]
    for i, threshold in enumerate(thresholds):
        if value < threshold:
            return i
    return len(thresholds) - 1

# Apply the tiering function to each metric
per_household_summary_tiers = per_household_summary.copy()
for metric, thresholds in tier_thresholds.items():
    reverse = metric in ["Reliability", "Affordability"]  # Lower is better for these
    per_household_summary_tiers[f"{metric}_Tier"] = per_household_summary[metric].apply(lambda x: assign_tier(x, thresholds, reverse))

# Compute the national average tier for each metric
national_tiers = per_household_summary_tiers[[col for col in per_household_summary_tiers.columns if "_Tier" in col]].mean()

# Compute the overall national tier
overall_national_tier = national_tiers.mean()

# Print the metric breakdown
print("### Metric Breakdown:")
for metric, value in national_tiers.items():
    tier_range = f"(Tier {int(value)}-{int(value) + 1})" if not pd.isna(value) else "N/A"
    print(f"- {metric.replace('_Tier', '')}: {value:.2f} {tier_range}")

# Ensure overall tier is a valid number before converting to int
if not pd.isna(overall_national_tier):
    print(f"\n### Overall National Tier: {overall_national_tier:.2f} (≈ Tier {int(np.round(overall_national_tier))})")
else:
    print("\n### Overall National Tier: Data issue - Cannot determine tier")

Sample unique values from Capacity column: ['Tier 5', 'Tier 0', 'Tier 4', 'Tier 3']
Categories (6, object): ['Tier 0' < 'Tier 1' < 'Tier 2' < 'Tier 3' < 'Tier 4' < 'Tier 5']
After extraction, Capacity column unique values: [5, 0, 4, 3]
Categories (6, int64): [0 < 1 < 2 < 3 < 4 < 5]
### Metric Breakdown:
- Health and Safety: 4.20 (Tier 4-5)
- Legality: 4.43 (Tier 4-5)
- Affordability: 1.33 (Tier 1-2)
- Quality: 2.33 (Tier 2-3)
- Reliability: 0.12 (Tier 0-1)
- Daily Availability: 1.26 (Tier 1-2)
- Daytime Availability: 1.42 (Tier 1-2)
- Evening Availability: 3.30 (Tier 3-4)
- Capacity: 1.00 (Tier 1-2)

### Overall National Tier: 2.15 (≈ Tier 2)


  per_household_summary[metric_columns] = per_household_summary[metric_columns].applymap(extract_highest_tier)


In [None]:
### state

In [51]:
# Drop unnecessary index column if it exists
per_household_summary = per_household_summary.drop(columns=["Unnamed: 0"], errors='ignore')

# Trim spaces from column names
per_household_summary.columns = per_household_summary.columns.str.strip()

# Function to extract the highest tier from a given string
def extract_highest_tier(value):
    if isinstance(value, str):
        if "No Tier" in value:
            return 0  # Assign 0 for "No Tier"
        tiers = [int(s) for s in value.split() if s.isdigit()]  # Extract numbers
        return max(tiers) if tiers else np.nan  # Get the highest tier or NaN if empty
    return np.nan

# Define metric columns
metric_columns = [
    "Health and Safety", "Legality", "Affordability", "Quality",
    "Reliability", "Daily Availability", "Daytime Availability",
    "Evening Availability", "Capacity"
]

# Apply extraction function to all metric columns
per_household_summary[metric_columns] = per_household_summary[metric_columns].applymap(extract_highest_tier)

# Define tier thresholds based on the energy access framework
tier_thresholds = {
    "Health and Safety": [0, 1, 2, 3, 4, 5],
    "Legality": [0, 1, 2, 3, 4, 5],
    "Affordability": [0, 1, 2, 3, 4, 5],
    "Quality": [0, 1, 2, 3, 4, 5],
    "Reliability": [0, 1, 2, 3, 4, 5],
    "Daily Availability": [0, 4, 8, 16, 23],
    "Daytime Availability": [0, 4, 8, 16, 23],
    "Evening Availability": [0, 1, 2, 3, 4],
    "Capacity": [0, 12, 200, 1000, 3400, 8200]
}

# Function to assign tiers based on thresholds
def assign_tier(value, thresholds, reverse=False):
    if pd.isna(value):
        return np.nan
    if reverse:  # Lower values indicate higher tiers
        thresholds = thresholds[::-1]
    for i, threshold in enumerate(thresholds):
        if value < threshold:
            return i
    return len(thresholds) - 1

# Apply the tiering function to each metric
per_household_summary_tiers = per_household_summary.copy()
for metric, thresholds in tier_thresholds.items():
    reverse = metric in ["Reliability", "Affordability"]  # Lower is better for these
    per_household_summary_tiers[f"{metric}_Tier"] = per_household_summary[metric].apply(lambda x: assign_tier(x, thresholds, reverse))

# Compute the state average tier for each metric
state_tiers = per_household_summary_tiers.groupby("State")[[col for col in per_household_summary_tiers.columns if "_Tier" in col]].mean()

# Compute the overall tier for each state using the weakest-link approach
state_tiers["Overall_Tier"] = state_tiers.min(axis=1)

# Save state-level energy access tiers to CSV
state_tiers.to_csv("state_energy_access_tiers.csv")

print("State-level energy access tiers saved to 'state_energy_access_tiers.csv' in the current directory.")


State-level energy access tiers saved to 'state_energy_access_tiers.csv' in the current directory.


  per_household_summary[metric_columns] = per_household_summary[metric_columns].applymap(extract_highest_tier)


In [52]:
print(per_household_summary.columns.tolist())


['State', 'LGA', 'Health and Safety', 'Legality', 'Affordability', 'Quality', 'Reliability', 'Daily Availability', 'Daytime Availability', 'Evening Availability', 'Capacity']


In [53]:
per_household_summary.columns = per_household_summary.columns.str.strip()


In [56]:
print(per_household_summary.head(5))
print(per_household_summary.head(5))



         State                LGA  Health and Safety  Legality  Affordability  \
0  Cross River  Calabar Municipal                NaN       NaN            NaN   
1  Cross River  Calabar Municipal                NaN       NaN            NaN   
2        Delta      Ughelli North                NaN       NaN            NaN   
3      Bayelsa              Nembe                NaN       NaN            NaN   
4  Cross River  Calabar Municipal                NaN       NaN            NaN   

   Quality  Reliability  Daily Availability  Daytime Availability  \
0      NaN          NaN                 NaN                   NaN   
1      NaN          NaN                 NaN                   NaN   
2      NaN          NaN                 NaN                   NaN   
3      NaN          NaN                 NaN                   NaN   
4      NaN          NaN                 NaN                   NaN   

   Evening Availability Capacity  
0                   NaN   Tier 5  
1                   NaN   Ti