# HIV 2015 ORS – Impact Score Computation

This notebook computes the Impact Score for HIV 2015 data from the ORS (2015_2017_2019) Excel file.

In [16]:
import pandas as pd
import numpy as np
import re
from typing import Optional
from IPython.display import display

## Load HIV2015 Data

In [17]:
# Load the HIV2015 sheet from the ORS Excel file
file_path = "ORS (2015_2017_2019) copied 2025-7-5.xlsx"
sheet_name = "HIV2015"

main_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

## Fix Headers

In [18]:
# Extract header from row 2 (0-indexed)
new_header = main_df.iloc[2]
main_df = main_df.iloc[4:]  # Data starts from row 4
main_df.columns = new_header
main_df = main_df.reset_index(drop=True)

main_df.head()

2,Country,WHO Region,Population,Geographical Region,WHO Group,DALY,Adult DALYs,Children DALYs,Retention Rate,Retention Rate (ADULT),...,All ages,Children (0-14),Adults (15+),Year,NaN,http://apps.who.int/gho/data/node.main.626?lang=en,Estimated antiretroviral therapy coverage among people living with HIV (%),Reported number of people receiving antiretroviral therapy,Cleaned coverage,Cleaned number of people receiving antiretroviral therapy
0,Afghanistan,EMR,33736494,"East, South and South-East Asia",A,10752.548,9224.366,1528.182,72.0,73.0,...,92,100.0,92.0,2015,,Afghanistan,5 [3-12],364,0.05,364.0
1,Albania,EUR,2880703,Europe and Central Asia,A,98.495261,96.59686,1.898401,92.0,92.0,...,92,77.0,92.0,2015,,Albania,No data,423,,423.0
2,Algeria,AFR,39871528,Middle East and North Africa,A,11586.0447,11055.12,530.9247,92.0,92.0,...,100,,,2015,,Algeria,90 [70->95],7 915,0.9,7915.0
3,American Samoa,WPR,55537,,A,28.518669,25.84525,2.673419,97.14,97.14,...,66,,,2015,,Andorra,High-income country,No data,,
4,Andorra,EUR,78014,,A,83.36974,83.20017,0.16957,97.14,97.14,...,85,100.0,85.0,2015,,Angola,29 [20-40],90 204,0.29,90204.0


In [19]:
main_df.columns

Index([                                                                   'Country',
                                                                       'WHO Region',
                                                                       'Population',
                                                              'Geographical Region',
                                                                        'WHO Group',
                                                                             'DALY',
                                                                      'Adult DALYs',
                                                                   'Children DALYs',
                                                                   'Retention Rate',
                                                           'Retention Rate (ADULT)',
                                                           'Retention Rate (CHILD)',
                                                            '# Re

## Helper Functions

In [20]:
def _col_idx(col_letters: str) -> int:
    """Convert Excel column letters to 0-based index."""
    col_letters = col_letters.strip().upper()
    n = 0
    for ch in col_letters:
        n = n * 26 + (ord(ch) - ord("A") + 1)
    return n - 1

def _to_float(x):
    """Robust float conversion."""
    if pd.isna(x):
        return np.nan
    if isinstance(x, str):
        s = x.strip().replace(",", "")
        if s == "" or s.lower() in ["nan", "no data"]:
            return np.nan
        if s.endswith("%"):
            try:
                return float(s[:-1]) / 100.0
            except:
                return np.nan
        try:
            return float(s)
        except:
            return np.nan
    try:
        return float(x)
    except:
        return np.nan

def _contains_drug(regimen_text, drug_token: str) -> bool:
    """Check if regimen contains the drug."""
    if pd.isna(regimen_text):
        return False
    reg = str(regimen_text).upper().replace(" ", "")
    tok = str(drug_token).upper().replace(" ", "")
    return tok != "" and tok in reg

def _regimen_size(regimen_text) -> float:
    """Count number of drugs in regimen."""
    if pd.isna(regimen_text):
        return 3.0
    s = str(regimen_text)
    parts = [p.strip() for p in re.split(r"\+", s) if p.strip() and p.strip().lower() != "nan"]
    if len(parts) >= 2:
        return float(len(parts))
    return 3.0

def _impact_term(base, const, coef1, var, coef2, denom) -> float:
    """Calculate impact term: base * const * coef1 * var * coef2 / (1 - const*coef1*var*coef2) / denom"""
    if denom == 0 or pd.isna(denom):
        return 0.0
    if any(pd.isna(v) for v in [base, const, coef1, var, coef2]):
        return 0.0
    prod = const * coef1 * var * coef2
    d = 1.0 - prod
    if d == 0:
        return 0.0
    return (base * prod / d) / denom

## Load Regimen Tables and Constants

In [21]:
# Reload the raw data to access regimen tables
raw_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

# Constants from column 41 (AO in Excel)
const_col = 41
AQ5 = _to_float(raw_df.iloc[4, const_col])   # Group A first-line adult
AQ6 = _to_float(raw_df.iloc[5, const_col])   # Group A second-line adult
AP5 = _to_float(raw_df.iloc[9, const_col])   # Group B first-line adult
AP6 = _to_float(raw_df.iloc[10, const_col])  # Group B second-line adult

# For children, use the same constants
AQ10 = AQ5
AQ11 = AQ6
AP10 = AP5
AP11 = AP6

print(f"Constants loaded:")
print(f"  Group A: First-line={AQ5}, Second-line={AQ6}")
print(f"  Group B: First-line={AP5}, Second-line={AP6}")

Constants loaded:
  Group A: First-line=0.90835, Second-line=0.09165
  Group B: First-line=0.89325, Second-line=0.10675


## Compute Impact Score Function

In [22]:
def compute_impact_score_2015(
    main_df: pd.DataFrame,
    raw_df: pd.DataFrame,
    drug_name: str,
    output_col: Optional[str] = None,
    debug: bool = False,
    debug_show_first_n_rows: int = 3
):
    """
    Compute impact score using EXACT Excel formula.
    
    Formula: SUM(
        IF(drug in regimen, G*Q*AY/(1-Q*AY)/AX, 0) for adults,
        IF(drug in regimen, H*T*AZ/(1-T*AZ)/AX, 0) for children
    ) / (100/(100-I))
    
    Where:
    - AS (col 44) = Regimen name
    - AX (col 49) = Number of drugs
    - AY (col 50) = Adult x (proportion * efficacy pre-multiplied)
    - AZ (col 51) = Child x (proportion * efficacy pre-multiplied)
    """
    
    if output_col is None:
        output_col = f"Computed Impact Score ({drug_name})"
    
    # Column indices for country data
    idx_adult_daly = 6   # G - Adult DALYs
    idx_child_daly = 7   # H - Children DALYs
    idx_retention = 8    # I - Retention Rate
    idx_adult_cov = 13   # Q - Adult coverage
    idx_child_cov = 19   # T - Child coverage
    
    if debug:
        print("---- DEBUG ----")
        print(f"Drug: {drug_name}")
        print("Using EXACT Excel formula")
        print("---------------")
    
    # Excel rows to check (from the formula)
    regimen_rows = [5,6,7,8,9,12,13,14,15,16,17,18,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37]
    
    scores = []
    
    for i in range(len(main_df)):
        row = main_df.iloc[i]
        
        try:
            G = _to_float(row.iloc[idx_adult_daly])
            H = _to_float(row.iloc[idx_child_daly])
            Q = _to_float(row.iloc[idx_adult_cov])
            T = _to_float(row.iloc[idx_child_cov])
            I = _to_float(row.iloc[idx_retention])
        except:
            scores.append(0.0)
            continue
        
        total = 0.0
        hits = 0
        
        for excel_row in regimen_rows:
            idx = excel_row - 1  # Convert to 0-indexed
            if idx >= len(raw_df):
                continue
            
            regimen = raw_df.iloc[idx, 44]  # AS - regimen name
            if _contains_drug(regimen, drug_name):
                num_drugs = _to_float(raw_df.iloc[idx, 49])  # AX - number of drugs
                adult_x = _to_float(raw_df.iloc[idx, 50])    # AY - adult x (pre-multiplied)
                child_x = _to_float(raw_df.iloc[idx, 51])    # AZ - child x (pre-multiplied)
                
                # Adult: G * Q * adult_x / (1 - Q * adult_x) / num_drugs
                if not pd.isna(adult_x) and not pd.isna(num_drugs) and num_drugs > 0 and not pd.isna(Q):
                    denom_adult = 1.0 - Q * adult_x
                    if denom_adult != 0:
                        total += (G * Q * adult_x / denom_adult) / num_drugs
                
                # Child: H * T * child_x / (1 - T * child_x) / num_drugs
                if not pd.isna(child_x) and not pd.isna(num_drugs) and num_drugs > 0 and not pd.isna(T):
                    denom_child = 1.0 - T * child_x
                    if denom_child != 0:
                        total += (H * T * child_x / denom_child) / num_drugs
                
                hits += 1
        
        # Normalize: / (100/(100-I))
        try:
            norm = 100.0 / (100.0 - I)
            result = total / norm if norm != 0 and not pd.isna(norm) else 0.0
        except:
            result = 0.0
        
        if debug and i < debug_show_first_n_rows:
            country = row.iloc[0]
            print(f"[Row {i}] {country}: hits={hits}, result={result:.6f}")
        
        scores.append(result)
    
    main_df[output_col] = scores
    return main_df

## Compute Impact Score for First Drug (with debug)

In [23]:
# Compute for 3TC first with debug output
main_df = compute_impact_score_2015(
    main_df,
    raw_df,
    "3TC",
    output_col="Computed Impact Score (3TC)",
    debug=True
)

main_df[["Country", "WHO Group", "3TC", "Computed Impact Score (3TC)"]].head(10)

---- DEBUG ----
Drug: 3TC
Using EXACT Excel formula
---------------
[Row 0] Afghanistan: hits=23, result=35.742465
[Row 1] Albania: hits=23, result=0.000000
[Row 2] Algeria: hits=23, result=189.858290


2,Country,Country.1,WHO Group,3TC,Computed Impact Score (3TC)
0,Afghanistan,Albania,A,35.667802,35.742465
1,Albania,Algeria,A,0.0,0.0
2,Algeria,Antigua and Barbuda,A,186.166832,189.85829
3,American Samoa,Argentina,A,0.0,0.05496
4,Andorra,Armenia,A,0.176927,0.176927
5,Angola,Azerbaijan,A,1085.331064,1072.832954
6,Anguilla,Bahamas,B,0.0,0.0
7,Antigua and Barbuda,Bahrain,B,0.826518,0.826518
8,Argentina,Bangladesh,B,3783.17524,3835.597657
9,Armenia,Barbados,A,5.906418,6.06745


## Compute for All Drugs

In [24]:
# List of all drugs (excluding 3TC which is already computed)
drugs = ['ABC', 'AZT', 'ddl', 'd4T', 'EFV', 'FTC', 'LPV/r', 'NVP', 'TDF', 'ATV/r']

for drug in drugs:
    main_df = compute_impact_score_2015(
        main_df,
        raw_df,
        drug,
        debug=False
    )

In [25]:
main_df.columns

Index([                                                                   'Country',
                                                                       'WHO Region',
                                                                       'Population',
                                                              'Geographical Region',
                                                                        'WHO Group',
                                                                             'DALY',
                                                                      'Adult DALYs',
                                                                   'Children DALYs',
                                                                   'Retention Rate',
                                                           'Retention Rate (ADULT)',
                                                           'Retention Rate (CHILD)',
                                                            '# Re

## Compute Overall Treatment Impact

In [26]:
# List of computed drug columns
computed_drug_cols_wanted = [
    'Computed Impact Score (3TC)',
    'Computed Impact Score (ABC)',
    'Computed Impact Score (AZT)',
    'Computed Impact Score (ddl)',
    'Computed Impact Score (d4T)',
    'Computed Impact Score (EFV)',
    'Computed Impact Score (FTC)',
    'Computed Impact Score (LPV/r)',
    'Computed Impact Score (NVP)',
    'Computed Impact Score (TDF)',
    'Computed Impact Score (ATV/r)',
]

# Keep only existing columns
computed_drug_cols = [c for c in computed_drug_cols_wanted if c in main_df.columns]
missing = [c for c in computed_drug_cols_wanted if c not in main_df.columns]

if missing:
    print("⚠️ Missing computed columns:")
    for m in missing:
        print(f"   - {m}")

# Ensure numeric
for c in computed_drug_cols:
    main_df[c] = pd.to_numeric(main_df[c], errors="coerce")

# Compute overall impact
main_df["Computed Overall Treatment Impact"] = main_df[computed_drug_cols].sum(axis=1, skipna=True)

## Global Comparison

In [27]:
# Global totals comparison
global_computed_total = main_df["Computed Overall Treatment Impact"].sum(skipna=True)
global_actual_total = pd.to_numeric(main_df["Overall Treatment Impact"], errors="coerce").sum(skipna=True)

print("Global totals (sum across countries):")
print(f"  Computed: {global_computed_total}")
print(f"  Actual  : {global_actual_total}")
print(f"  Diff    : {global_computed_total - global_actual_total}")

Global totals (sum across countries):
  Computed: 2647615.833043346
  Actual  : 2639736.7135998914
  Diff    : 7879.11944345478


## Detailed Comparison Table

In [28]:
# Build comparison table like HIV2013
pairs = [
    ("3TC",   "3TC",   "Computed Impact Score (3TC)"),
    ("ABC",   "ABC",   "Computed Impact Score (ABC)"),
    ("AZT",   "AZT",   "Computed Impact Score (AZT)"),
    ("ddl",   "ddl",   "Computed Impact Score (ddl)"),
    ("d4T",   "d4T",   "Computed Impact Score (d4T)"),
    ("EFV",   "EFV",   "Computed Impact Score (EFV)"),
    ("FTC",   "FTC",   "Computed Impact Score (FTC)"),
    ("LPV/r", "LPV/r", "Computed Impact Score (LPV/r)"),
    ("NVP",   "NVP",   "Computed Impact Score (NVP)"),
    ("TDF",   "TDF",   "Computed Impact Score (TDF)"),
    ("ATV/r", "ATV/r", "Computed Impact Score (ATV/r)"),
    ("Overall", "Overall Treatment Impact", "Computed Overall Treatment Impact"),
]

# Keep only pairs where both columns exist
pairs_existing = []
for drug, a, c in pairs:
    if a in main_df.columns and c in main_df.columns:
        pairs_existing.append((drug, a, c))
    else:
        print(f"⚠️ Skipping {drug}: missing column(s)")

# Ensure numeric
for _, a, c in pairs_existing:
    main_df[a] = pd.to_numeric(main_df[a], errors="coerce")
    main_df[c] = pd.to_numeric(main_df[c], errors="coerce")

# Build comparison table
out = main_df[["Country", "WHO Group"]].copy()

for drug, a, c in pairs_existing:
    out[f"{drug} | Actual"] = main_df[a]
    out[f"{drug} | Computed"] = main_df[c]
    out[f"{drug} | Diff (Comp-Act)"] = main_df[c] - main_df[a]

display(out.head(50))

2,Country,Country.1,WHO Group,3TC | Actual,3TC | Computed,3TC | Diff (Comp-Act),ABC | Actual,ABC | Computed,ABC | Diff (Comp-Act),AZT | Actual,...,NVP | Diff (Comp-Act),TDF | Actual,TDF | Computed,TDF | Diff (Comp-Act),ATV/r | Actual,ATV/r | Computed,ATV/r | Diff (Comp-Act),Overall | Actual,Overall | Computed,Overall | Diff (Comp-Act)
0,Afghanistan,Albania,A,35.667802,35.742465,0.0746627,3.483114,3.483425,0.0003106033,15.831563,...,0.03071683,23.749159,23.113528,-0.635631,0.412628,0.413849,0.00122102,129.229982,128.814112,-0.41587
1,Albania,Algeria,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Algeria,Antigua and Barbuda,A,186.166832,189.85829,3.691457,3.714239,3.725742,0.01150353,70.801802,...,1.428887,164.992819,164.127475,-0.865344,2.513475,2.558872,0.04539665,710.727413,720.593201,9.865788
3,American Samoa,Argentina,A,0.0,0.05496,0.05496046,0.0,0.000219,0.0002192879,0.0,...,0.02244353,0.0,0.049874,0.049874,0.0,0.000863,0.0008625692,0.0,0.211411,0.211411
4,Andorra,Armenia,A,0.176927,0.176927,-9.965223e-12,0.000706,0.000706,5.924818e-14,0.065802,...,3.094067e-12,0.165287,0.160553,-0.004734,0.002777,0.002777,-9.394785e-14,0.685301,0.680567,-0.004734
5,Angola,Azerbaijan,A,1085.331064,1072.832954,-12.49811,84.823245,84.775053,-0.04819174,466.42703,...,-5.072977,780.409167,746.712865,-33.696302,13.092495,12.902838,-0.1896577,3984.248983,3913.861891,-70.387092
6,Anguilla,Bahamas,B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Antigua and Barbuda,Bahrain,B,0.826518,0.826518,5.36825e-11,0.003298,0.003298,-2.455805e-14,0.307398,...,3.936457e-11,0.772143,0.750028,-0.022116,0.012972,0.012972,4.902245e-12,3.2014,3.179284,-0.022116
8,Argentina,Bangladesh,B,3783.17524,3835.597657,52.42242,122.256802,122.436603,0.1798018,1502.665107,...,20.76645,3192.017671,3153.376046,-38.641625,50.92785,51.636571,0.7087207,14313.30008,14427.796753,114.496673
9,Armenia,Barbados,A,5.906418,6.06745,0.1610319,0.023988,0.024616,0.0006280641,2.206146,...,0.06550314,5.511532,5.497166,-0.014366,0.094334,0.096806,0.002471334,22.882402,23.340979,0.458576


## Show All Data

In [29]:
pd.set_option('display.max_rows', None)
main_df

2,Country,WHO Region,Population,Geographical Region,WHO Group,DALY,Adult DALYs,Children DALYs,Retention Rate,Retention Rate (ADULT),...,Computed Impact Score (AZT),Computed Impact Score (ddl),Computed Impact Score (d4T),Computed Impact Score (EFV),Computed Impact Score (FTC),Computed Impact Score (LPV/r),Computed Impact Score (NVP),Computed Impact Score (TDF),Computed Impact Score (ATV/r),Computed Overall Treatment Impact
0,Afghanistan,EMR,33736494.0,"East, South and South-East Asia",A,10752.548,9224.366,1528.182,72.0,73.0,...,15.859608,0.102642,0.523836,21.420655,7.078738,4.618749,16.456618,23.113528,0.413849,128.814112
1,Albania,EUR,2880703.0,Europe and Central Asia,A,98.495261,96.59686,1.898401,92.0,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Algeria,AFR,39871528.0,Middle East and North Africa,A,11586.0447,11055.12,530.9247,92.0,92.0,...,72.088139,0.088267,0.455925,143.006871,50.157411,16.005256,78.520953,164.127475,2.558872,720.593201
3,American Samoa,WPR,55537.0,,A,28.518669,25.84525,2.673419,97.14,97.14,...,0.020441,0.0,0.0,0.042521,0.015478,0.004611,0.022444,0.049874,0.000863,0.211411
4,Andorra,EUR,78014.0,,A,83.36974,83.20017,0.16957,97.14,97.14,...,0.065802,0.0,0.0,0.136883,0.049826,0.014843,0.072249,0.160553,0.002777,0.680567
5,Angola,AFR,27859305.0,Sub-Saharan Africa,A,648951.4,485015.9,163935.5,97.14,97.14,...,461.815783,2.455045,12.552467,679.736498,228.881974,127.112411,484.084004,746.712865,12.902838,3913.861891
6,Anguilla,AMR,14723.0,,B,0.0,0.0,0.0,97.14,97.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Antigua and Barbuda,AMR,99923.0,,B,413.85098,388.6716,25.17938,97.14,97.14,...,0.307398,0.0,0.0,0.639453,0.232762,0.069341,0.337515,0.750028,0.012972,3.179284
8,Argentina,AMR,43417765.0,Latin America and the Caribbean,B,78326.475,73886.85,4439.625,66.0,97.14,...,1521.432508,3.136901,16.203024,2761.342505,968.653231,354.603342,1639.378365,3153.376046,51.636571,14427.796753
9,Armenia,EUR,2916950.0,Europe and Central Asia,A,776.945553,773.9614,2.984153,85.0,85.0,...,2.265725,0.0,0.0,4.678311,1.709286,0.516312,2.485307,5.497166,0.096806,23.340979


## Export to CSV

In [30]:
main_df.to_csv("impact_score.csv", index=False)
print("File saved as impact_score.csv")

File saved as impact_score.csv
