<a href="https://colab.research.google.com/github/tamayodb/ndrrmc-typhoon-data-preprocess/blob/main/ndrrmc_typhoon_data_fuzzy_match.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher
from datetime import datetime

In [None]:
from google.colab import files
# Opens the Colab upload widget and keeps its return value in `uploaded`.
# NOTE(review): this only works inside Google Colab; later cells read files by
# hard-coded name, so the uploaded workbook's name must match what they expect.
uploaded = files.upload()


Saving ALL-2022.xlsx to ALL-2022.xlsx


# First Data Prep | Merge Sheets

## Normalize Location Name

In [None]:
def normalize_location_name(name, keep_parentheses=False):
    """Normalize a place name to a canonical, title-cased form.

    Steps, in order: strip surrounding whitespace; drop a leading
    "City of" / "Municipality of" / "Province of"; optionally cut the name at
    the first parenthesised group; turn "-" and "_" into spaces and delete
    "."; expand common abbreviations; collapse whitespace; title-case.

    Args:
        name: Raw value from a location column. Missing values (anything
            ``pd.isna`` reports) are returned unchanged.
        keep_parentheses: When False (default), everything from the first
            "(...)" group onward is removed, unless that would leave an
            empty string.

    Returns:
        The normalized name as ``str``, or the original missing value.
    """
    if pd.isna(name):
        return name

    name = str(name).strip()

    # Remove common administrative prefixes
    name = re.sub(r'^(city of|municipality of|province of)\s+', '', name, flags=re.IGNORECASE)

    # Handle parentheses - keep only the base name unless told otherwise
    if not keep_parentheses:
        base_name = re.sub(r'\s*\([^)]*\).*', '', name).strip()
        if base_name:
            name = base_name

    # Standardize separators ("Sta." becomes "Sta", "Pio-Duran" becomes "Pio Duran")
    name = name.replace('-', ' ').replace('_', ' ').replace('.', '')
    name_lower = name.lower()

    # Saint abbreviations are matched as whole words anywhere in the name.
    # The previous ' sta '-style keys needed a space on both sides, so the
    # common case of a name *starting* with the abbreviation ("Sta Elena")
    # was never expanded, and the dotted keys could not match at all because
    # dots are already removed above.
    for abbr, full in (('st', 'saint'), ('sto', 'santo'), ('sta', 'santa')):
        name_lower = re.sub(rf'\b{abbr}\b', full, name_lower)

    # Compass letters are still expanded only when space-delimited inside the
    # name, exactly as before.
    # NOTE(review): this also rewrites middle initials (e.g. "Gen E Aguinaldo");
    # confirm that is acceptable for the data set.
    for abbr, full in ((' n ', ' north '), (' s ', ' south '),
                       (' e ', ' east '), (' w ', ' west ')):
        name_lower = name_lower.replace(abbr, full)

    # Collapse whitespace and standardize case
    return ' '.join(name_lower.split()).title()


## Create Location Mapping

In [None]:
def create_location_mapping(df):
    """Map raw 'City/Municipality' values to their normalized base names.

    Every distinct non-null value is passed through
    ``normalize_location_name(..., keep_parentheses=False)``; values whose
    normalized form differs from the raw value are recorded (and printed).
    Returns an empty dict when the column is absent.
    """
    location_col = 'City/Municipality'
    mapping = {}

    if location_col in df.columns:
        for loc in df[location_col].dropna().unique():
            base_name = normalize_location_name(loc, keep_parentheses=False)
            if base_name == loc:
                continue  # already in canonical form, nothing to map
            mapping[loc] = base_name
            print(f" Mapping: '{loc}' ‚Üí '{base_name}'")

    return mapping

In [None]:
def create_city_mapping(df):
    """Build a mapping that unifies "<Name>" and "<Name> City" spellings.

    Distinct non-null 'City/Municipality' values are bucketed by the name
    with a trailing " city" (any case) removed. Within a bucket holding more
    than one spelling, every other spelling is mapped to the first one that
    ends in " city"; if none does, to the longest spelling. Each mapping is
    printed. Returns an empty dict when the column is absent.
    """
    location_col = 'City/Municipality'
    if location_col not in df.columns:
        return {}

    # Bucket the spellings by their base name (case-sensitive)
    location_groups = {}
    for loc in df[location_col].dropna().unique():
        stem = re.sub(r'\s+city$', '', loc, flags=re.IGNORECASE).strip()
        location_groups.setdefault(stem, []).append(loc)

    mapping = {}
    for variants in location_groups.values():
        if len(variants) <= 1:
            continue

        # First spelling that carries the "City" suffix, if any
        city_version = next((v for v in variants if v.lower().endswith(' city')), None)

        if city_version:
            for variant in variants:
                if variant == city_version:
                    continue
                mapping[variant] = city_version
                print(f"    üèôÔ∏è  Mapping: '{variant}' ‚Üí '{city_version}'")
            continue

        # No "City" spelling: fall back to the longest variant
        preferred = max(variants, key=len)
        for variant in variants:
            if variant == preferred:
                continue
            mapping[variant] = preferred
            print(f"    üìç Mapping: '{variant}' ‚Üí '{preferred}'")

    return mapping

In [None]:
def prefer_city_over_base(df):
    """Drop plain "<Name>" rows when a "<Name> City" row exists in the same group.

    Rows are grouped by typhoon, year, region, province and the municipality
    name with a trailing "City" removed. In any group containing a name that
    ends with "City", rows whose name equals the bare base name are removed.
    Works on a copy; the input frame is not modified.
    """
    work = df.copy()
    work['base_name'] = work['City/Municipality'].str.replace(r'\s*City$', '', regex=True)

    group_keys = ['Typhoon Name', 'Year', 'Region', 'Province', 'base_name']
    drop_idx = []
    for key, group in work.groupby(group_keys):
        names = group['City/Municipality']
        if not any(names.str.endswith('City')):
            continue
        # key[-1] is the group's base name; the bare spelling is redundant here
        drop_idx.extend(names[names == key[-1]].index.tolist())

    return work.drop(drop_idx).drop(columns='base_name')

# NOTE(review): no earlier cell in this notebook assigns `df`, so on a fresh
# kernel (Restart & Run All) this call raises NameError. Confirm which frame
# this was meant to run on, or move the call after the data is loaded.
df = prefer_city_over_base(df)


## Clean Typhoon Name

In [None]:
def clean_typhoon_name(name):
    """Return the bare, upper-cased typhoon name.

    Missing values are returned unchanged. Otherwise the value is stripped
    and upper-cased, then a leading TYPHOON / TY / TROPICAL STORM / TS
    prefix, any parenthesised groups, and everything from the first
    four-digit number onward are removed.
    """
    if pd.isna(name):
        return name

    cleaned = str(name).strip().upper()

    removal_patterns = (
        r'^(TYPHOON|TY|TROPICAL STORM|TS)\s+',  # classification prefix
        r'\s*\([^)]*\)',                        # "(international name)" etc.
        r'\s*\d{4}.*',                          # year and anything after it
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip()

## Standardize Numeric Columns

In [None]:
def standardize_numeric_columns(df, numeric_cols):
    """Coerce the given columns to non-negative numbers, in place.

    Each listed column that exists in ``df`` is converted to text, stripped
    of thousands separators, currency markers and spaces, has common
    "no value" words turned into 0, and is then parsed with
    ``pd.to_numeric``. Anything unparseable becomes 0 and negative results
    are clipped to 0.

    Args:
        df: Frame to clean. The listed columns are overwritten on this object.
        numeric_cols: Column names to clean; names missing from ``df`` are
            skipped.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # Literal tokens to delete. regex=False is passed explicitly so that '$'
    # is never read as a regex end-anchor, regardless of the pandas version's
    # default. The peso sign is listed both as the real character and in the
    # mis-encoded form the original literal contained.
    literal_noise = (',', '\u20b1', '‚Ç±', '$', ' ')
    # Substrings that stand for "no value" (matched case-insensitively)
    zero_words = ('none', 'nil', 'n/a', 'na')

    for col in numeric_cols:
        if col not in df.columns:
            continue

        # Convert to string first to handle mixed types
        values = df[col].astype(str)

        for token in literal_noise:
            values = values.str.replace(token, '', regex=False)
        values = values.str.replace('php', '', case=False, regex=True)

        for word in zero_words:
            values = values.str.replace(word, '0', case=False, regex=True)

        # A cell that is *only* dashes means "nothing reported". Replacing
        # every '-' (as before) turned "-5" into 5 and "1e-05" into 100000,
        # and made the clip below unreachable.
        values = values.str.replace(r'^[-\u2013\u2014]+$', '0', regex=True)

        # Non-convertible text becomes 0; negatives are treated as data entry
        # errors and clipped to 0.
        df[col] = pd.to_numeric(values, errors='coerce').fillna(0).clip(lower=0)

    return df

## Assistance Type Mapping

In [None]:
def clean_assistance_type(assistance_type):
    """Map a free-text assistance type to a standard label.

    Missing values are returned unchanged. The stripped, lower-cased text is
    checked against the keywords below in order; the label of the first
    keyword found anywhere in the text is returned. Text matching no keyword
    is returned title-cased.
    """
    if pd.isna(assistance_type):
        return assistance_type

    raw = str(assistance_type).strip()
    lowered = raw.lower()

    # (keyword, standard label) - order matters: first substring hit wins
    keyword_labels = (
        ('ffp', 'Family Food Pack'),
        ('family food packs', 'Family Food Pack'),
        ('food pack', 'Family Food Pack'),
        ('relief goods', 'Relief Goods'),
        ('relief good', 'Relief Goods'),
        ('financial assistance', 'Financial'),
        ('cash assistance', 'Financial'),
        ('financial aid', 'Financial'),
        ('medicine', 'Medical Assistance'),
        ('medical supplies', 'Medical Assistance'),
        ('hygiene kit', 'Hygiene Kit'),
        ('hygiene kits', 'Hygiene Kit'),
        ('sleeping kit', 'Sleeping Kit'),
        ('sleeping kits', 'Sleeping Kit'),
    )

    matched = next((label for keyword, label in keyword_labels if keyword in lowered), None)
    if matched is not None:
        return matched

    # No keyword applies: keep the text, title-cased
    return raw.title()

## Validate and Clean Year

In [None]:
def validate_and_clean_year(year, min_year=2020, max_year=None):
    """Return ``year`` as an int if it is a plausible data year, else NaN.

    Args:
        year: Raw cell value (int, float, or numeric text such as "2022.0").
            Missing values are returned unchanged.
        min_year: Earliest accepted year (inclusive). Defaults to 2020, the
            lower bound this notebook has always used.
        max_year: Latest accepted year (inclusive). Defaults to the current
            calendar year at call time.

    Returns:
        The year as ``int`` when it parses and lies within
        ``[min_year, max_year]``; ``np.nan`` when it does not parse or is out
        of range; the input itself when it is already missing.
    """
    if pd.isna(year):
        return year

    if max_year is None:
        max_year = datetime.now().year

    try:
        year = int(float(str(year)))
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are swallowed (non-numeric text, "nan",
        # "inf"); a bare except would also hide KeyboardInterrupt and bugs.
        return np.nan

    return year if min_year <= year <= max_year else np.nan

## Preprocess Dataframe

In [None]:
def preprocess_dataframe(df, sheet_name):
    """Clean one worksheet and collapse rows that share the same key columns.

    Steps: strip column names, drop all-empty rows/columns, unify
    'City/Municipality' spellings, normalize location names, clean typhoon
    names and years, coerce the sheet's numeric columns, aggregate duplicate
    keys (all sheets except "Assistance Provided"), then print a short
    data-quality report.

    Args:
        df: Raw frame as read from the workbook. Its column labels are
            stripped on the passed-in object before any copy is made.
        sheet_name: One of "Affected Population", "Casualties",
            "Damaged Houses" or "Assistance Provided"; selects which columns
            are treated as numeric. Any other name leaves the numeric list
            empty.

    Returns:
        The cleaned (and, where applicable, aggregated) frame.
    """
    print(f"  Preprocessing {sheet_name}...")

    # Store original shape for the before/after report printed at the end
    original_shape = df.shape

    # 1. Clean column names
    # NOTE: this assignment happens on the caller's object; the copies made
    # by dropna/loc only start on the following lines.
    df.columns = df.columns.str.strip()

    # 2. Remove completely empty rows and columns
    df = df.dropna(how='all')  # Remove empty rows
    df = df.loc[:, df.notna().any()]  # Remove empty columns

    # 3. Create location mappings for city names and parentheses cases
    if 'City/Municipality' in df.columns:
        # First handle city mappings (e.g. Calapan -> Calapan City)
        city_mapping = create_city_mapping(df)
        if city_mapping:
            df['City/Municipality'] = df['City/Municipality'].replace(city_mapping)

        # Then handle parentheses mappings (e.g. Bulalacao (San Pedro) -> Bulalacao)
        location_mapping = create_location_mapping(df)
        if location_mapping:
            df['City/Municipality'] = df['City/Municipality'].replace(location_mapping)

    # 4. Standardize location names
    location_cols = ['Province', 'City/Municipality', 'Region']
    for col in location_cols:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: normalize_location_name(x, keep_parentheses=False))

    # 5. Clean typhoon names
    if 'Typhoon Name' in df.columns:
        df['Typhoon Name'] = df['Typhoon Name'].apply(clean_typhoon_name)

    # 6. Validate and clean years
    if 'Year' in df.columns:
        df['Year'] = df['Year'].apply(validate_and_clean_year)

    # 7. Handle numeric columns based on sheet type
    numeric_cols = []
    if sheet_name == "Affected Population":
        numeric_cols = ['Families', 'Person', 'Brgy']
    elif sheet_name == "Casualties":
        numeric_cols = ['Dead', 'Injured/Ill', 'Missing']
    elif sheet_name == "Damaged Houses":
        numeric_cols = ['Totally', 'Partially', 'Total']
    elif sheet_name == "Assistance Provided":
        numeric_cols = ['Quantity', 'Cost']
        # Special handling for assistance type
        if 'Type' in df.columns:
            df['Type'] = df['Type'].apply(clean_assistance_type)

    df = standardize_numeric_columns(df, numeric_cols)

    # 8. Remove duplicate records (but aggregate them properly)
    key_cols = ['Typhoon Name', 'Year', 'Region', 'Province', 'City/Municipality']
    available_keys = [col for col in key_cols if col in df.columns]

    if sheet_name != "Assistance Provided":  # Don't remove duplicates from assistance data
        # Instead of just dropping duplicates, aggregate them:
        # numeric columns are summed, every other non-key column is joined
        # into a comma-separated list of its distinct non-null values.
        if available_keys:
            numeric_cols_in_df = [col for col in numeric_cols if col in df.columns]
            text_cols_in_df = [col for col in df.columns if col not in available_keys + numeric_cols_in_df]

            agg_funcs = {}
            for col in numeric_cols_in_df:
                agg_funcs[col] = 'sum'
            for col in text_cols_in_df:
                agg_funcs[col] = lambda x: ', '.join(x.dropna().astype(str).unique()) if len(x.dropna()) > 0 else np.nan

            if agg_funcs:
                before_agg = len(df)
                # NOTE(review): groupby excludes rows whose key is NaN by
                # default, so rows with a missing key column (e.g. a Year
                # rejected in step 6) do not survive this aggregation and
                # cannot be reported by step 9 - confirm this is intended.
                df = df.groupby(available_keys, as_index=False).agg(agg_funcs)
                after_agg = len(df)
                if before_agg != after_agg:
                    print(f" Aggregated {before_agg} records into {after_agg} records")

    # 9. Data validation (report only; nothing is modified below)
    validation_issues = []

    # Check for missing key information
    for col in available_keys:
        missing_count = df[col].isna().sum()
        if missing_count > 0:
            validation_issues.append(f"{col}: {missing_count} missing values")

    # Check for outliers in numeric columns
    for col in numeric_cols:
        if col in df.columns:
            q99 = df[col].quantile(0.99)
            if q99 > 0:  # Skip columns whose 99th percentile is not positive
                outliers = (df[col] > q99 * 10).sum()  # Values 10x larger than 99th percentile
                if outliers > 0:
                    validation_issues.append(f"{col}: {outliers} potential outliers")

    if validation_issues:
        print(f"    Data quality issues: {'; '.join(validation_issues)}")

    print(f"   Shape: {original_shape} ‚Üí {df.shape}")

    return df

## Main Data Pre Processing

In [None]:
# Main processing: read each worksheet, clean it with preprocess_dataframe,
# aggregate it to one row per key, and outer-merge everything into
# `final_merged`.
# NOTE(review): the path is hard-coded to a Colab location; the workbook must
# already exist at exactly this path before the cell runs.
file_path = "/content/ALL-2024.xlsx"
# NOTE(review): `xls` is not used again in this cell (sheets are read through
# pd.read_excel(file_path, ...) below) and is never closed here.
xls = pd.ExcelFile(file_path)

print("Starting comprehensive data preprocessing...")

# --- Define sheets we want ---
# sheet name -> value columns to keep from that sheet
sheets = {
    "Affected Population": ["Families", "Person", "Brgy"],
    "Casualties": ["Dead", "Injured/Ill", "Missing"],
    "Damaged Houses": ["Totally", "Partially", "Total"],
}

# --- Key columns for merging ---
key_cols = ["Typhoon Name", "Year", "Region", "Province", "City/Municipality"]

merged = None

# Process sheets that can be aggregated
for sheet, cols in sheets.items():
    print(f"\n Processing {sheet}...")
    df = pd.read_excel(file_path, sheet_name=sheet)

    # Apply preprocessing
    df = preprocess_dataframe(df, sheet)

    # Keep only relevant columns (keys + this sheet's value columns)
    keep_cols = [c for c in key_cols + cols if c in df.columns]
    df = df[keep_cols]

    # Group by keys for final aggregation.
    # Only columns whose dtype is exactly int64/float64 are summed; object
    # columns are joined as text; any other dtype is left out of agg_funcs.
    numeric_cols = [c for c in cols if c in df.columns and df[c].dtype in ['int64', 'float64']]
    text_cols = [c for c in cols if c in df.columns and df[c].dtype == 'object']

    agg_funcs = {}
    for col in numeric_cols:
        agg_funcs[col] = 'sum'
    for col in text_cols:
        agg_funcs[col] = lambda x: ', '.join(x.dropna().astype(str).unique())

    if agg_funcs:
        df = df.groupby([col for col in key_cols if col in df.columns], as_index=False).agg(agg_funcs)
    else:
        df = df.drop_duplicates(subset=[col for col in key_cols if col in df.columns])

    # Merge with main table (outer join keeps keys present in only one sheet)
    if merged is None:
        merged = df
    else:
        merge_keys = [col for col in key_cols if col in merged.columns and col in df.columns]
        merged = pd.merge(merged, df, on=merge_keys, how="outer")

# Process assistance data (kept at row level - preprocess_dataframe does not
# aggregate the "Assistance Provided" sheet)
print(f"\n Processing Assistance Provided...")
assistance_df = pd.read_excel(file_path, sheet_name="Assistance Provided")
assistance_data = preprocess_dataframe(assistance_df, "Assistance Provided")

# Keep relevant assistance columns
assistance_cols = ["Quantity", "Type", "Cost"]
assistance_keep_cols = [c for c in key_cols + assistance_cols if c in assistance_data.columns]
assistance_data = assistance_data[assistance_keep_cols]

# Final merge
# NOTE(review): assistance rows are not unique per key, so a key with several
# assistance rows repeats its population/casualty/house figures on each of
# them - keep that in mind before summing those columns downstream.
print(f"\n Performing final merge...")
merge_keys = [col for col in key_cols if col in merged.columns and col in assistance_data.columns]
final_merged = pd.merge(merged, assistance_data, on=merge_keys, how="outer")

# Final data quality report
print(f"\n Final Data Quality Report:")
print(f"  ‚Ä¢ Total records: {len(final_merged):,}")
print(f"  ‚Ä¢ Unique typhoons: {final_merged['Typhoon Name'].nunique()}")
print(f"  ‚Ä¢ Year range: {final_merged['Year'].min():.0f} - {final_merged['Year'].max():.0f}")
print(f"  ‚Ä¢ Provinces covered: {final_merged['Province'].nunique()}")
print(f"  ‚Ä¢ Cities/Municipalities: {final_merged['City/Municipality'].nunique()}")

if 'Type' in final_merged.columns:
    print(f"  ‚Ä¢ Assistance types: {final_merged['Type'].nunique()}")
    print(f"  ‚Ä¢ Top assistance types: {final_merged['Type'].value_counts().head(3).to_dict()}")

Starting comprehensive data preprocessing...

 Processing Affected Population...
  Preprocessing Affected Population...
    üèôÔ∏è  Mapping: 'Cabuyao' ‚Üí 'Cabuyao City'
    üèôÔ∏è  Mapping: 'Lucena' ‚Üí 'Lucena City'
    üèôÔ∏è  Mapping: 'Tayabas' ‚Üí 'Tayabas City'
    üèôÔ∏è  Mapping: 'Talisay' ‚Üí 'Talisay City'
    üèôÔ∏è  Mapping: 'Naga' ‚Üí 'Naga City'
    üèôÔ∏è  Mapping: 'Tabaco' ‚Üí 'Tabaco City'
    üèôÔ∏è  Mapping: 'Santiago' ‚Üí 'Santiago City'
    üèôÔ∏è  Mapping: 'BALANGA' ‚Üí 'BALANGA CITY'
    üèôÔ∏è  Mapping: 'GAPAN' ‚Üí 'GAPAN CITY'
    üèôÔ∏è  Mapping: 'TARLAC' ‚Üí 'TARLAC CITY'
    üèôÔ∏è  Mapping: 'OLONGAPO' ‚Üí 'OLONGAPO CITY'
    üèôÔ∏è  Mapping: 'Bacoor' ‚Üí 'Bacoor City'
    üèôÔ∏è  Mapping: 'Imus' ‚Üí 'Imus City'
    üèôÔ∏è  Mapping: 'San Pedro' ‚Üí 'San Pedro City'
    üèôÔ∏è  Mapping: 'Antipolo' ‚Üí 'Antipolo City'
    üèôÔ∏è  Mapping: 'Victorias' ‚Üí 'Victorias City'
    üèôÔ∏è  Mapping: 'Kabankalan' ‚Üí 'Kabankalan City'
    üèôÔ∏è  Mapp

## Save Results

In [None]:
# Save results | Naming convention - merged_typhoon_<year>_data_cleaned.xlsx
# The fuzzy-match stage loads "merged_typhoon_2024_data_cleaned.xlsx" from the
# working directory, so the file is written under exactly that name (the
# previous "/content/2024.xlsx" did not follow the convention and was not the
# file the next stage reads).
final_merged.to_excel("merged_typhoon_2024_data_cleaned.xlsx", index=False)
print("Cleaned and merged file saved")

# Show sample for verification
print(f"\n Sample of cleaned data:")
sample_cols = ['Typhoon Name', 'Year', 'Province', 'City/Municipality', 'Type', 'Quantity', 'Cost']
display_cols = [col for col in sample_cols if col in final_merged.columns]
print(final_merged[display_cols].head(5))

Cleaned and merged file saved

 Sample of cleaned data:
  Typhoon Name  Year          Province City/Municipality    Type  Quantity  \
0        AGHON  2024  Misamis Oriental        Balingasag     NaN       NaN   
1        AGHON  2024            Aurora         Casiguran     NaN       NaN   
2        AGHON  2024            Aurora           Dilasag     NaN       NaN   
3        AGHON  2024             Albay      Legazpi City     NaN       NaN   
4        AGHON  2024             Albay          Pioduran  Malong     727.0   

       Cost  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4  181750.0  


# Second Data Prep | Fuzzy Match Municipalities



In [None]:
# %pip installs into the running kernel's environment; the version is pinned
# to the one this notebook was last run with so results stay reproducible.
%pip install -q rapidfuzz==3.14.0

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.3/3.3 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.0


In [None]:
import pandas as pd
from rapidfuzz import fuzz, process

# --- Step 1. Load dataset ---
# NOTE(review): read from the working directory by hard-coded name; the file
# must already exist there when this cell runs.
df = pd.read_excel("merged_typhoon_2024_data_cleaned.xlsx")

# --- Step 2. Basic cleaning ---
# Make everything uppercase (fuzzy_prefer_city upper-cases the column again,
# so this line only matters if that function is not called)
df['City/Municipality'] = df['City/Municipality'].str.upper()

# --- Step 3. Fuzzy standardization with preference for City ---
def fuzzy_prefer_city(df, col="City/Municipality", threshold=90, verbose=True):
    """Collapse near-identical spellings in ``col`` onto one canonical name.

    The column is first normalized in place (missing -> "", upper-cased,
    stripped). Distinct names are then visited in first-seen order:

    * a name containing "CITY" is always its own canonical form;
    * any other name is compared (``fuzz.ratio`` on the names with " CITY"
      removed) against the names seen so far, and takes the canonical form
      of the first one scoring at least ``threshold``;
    * a name with no match becomes its own canonical form.

    Args:
        df: Frame to clean; ``df[col]`` is overwritten on this object.
        col: Name of the column to standardize.
        threshold: Minimum ``fuzz.ratio`` score (0-100) to treat two names
            as the same place.
        verbose: When True, print the resulting name -> canonical mapping.

    Returns:
        The same ``df`` object with ``col`` replaced by canonical names.

    Note:
        Matching is global, not per province, and a "CITY" spelling seen
        *after* its bare spelling does not retroactively rename the bare one;
        ``merge_city_with_base`` handles that case within each group.
    """
    # Normalize text (missing values become empty strings from here on)
    df[col] = df[col].fillna("").astype(str).str.upper().str.strip()

    def _strip_city(value):
        return value.replace(" CITY", "")

    mapping = {}
    for name in df[col].unique():
        if "CITY" in name:
            # Prefer the CITY form: it always stands for itself
            canonical = name
        else:
            base = _strip_city(name)
            match = next(
                (seen for seen in mapping
                 if fuzz.ratio(base, _strip_city(seen)) >= threshold),
                None,
            )
            # Follow the match to *its* canonical name. Mapping to the matched
            # key itself (as before) let chains form (C -> B while B -> A),
            # leaving the same place under two spellings.
            canonical = name if match is None else mapping[match]

        mapping[name] = canonical

    if verbose:
        print("City/Municipality Mapping:")
        print(mapping)

    df[col] = df[col].map(mapping)
    return df

# Apply the fuzzy standardization to the loaded frame; the arguments repeat
# the function's defaults and the mapping is printed because verbose=True.
df = fuzzy_prefer_city(df, col="City/Municipality", threshold=90, verbose=True)

# --- Step 4. Merge rows (prefer City; fill its empty cells from the base row) ---
def merge_city_with_base(df):
    """Fold "<NAME>" rows into the matching "<NAME> CITY" row of the same group.

    Rows are grouped by typhoon, year, region, province and the municipality
    name with a trailing "CITY" removed. In a group that contains at least
    one "... CITY" row, the non-CITY rows are dropped after their non-null
    values have been copied into the *first* CITY row wherever that row's
    cell is NaN, "" or 0 (values are filled, never summed). Groups without a
    CITY row are kept unchanged.

    Args:
        df: Frame with 'Typhoon Name', 'Year', 'Region', 'Province' and
            'City/Municipality' columns. Not modified.

    Returns:
        A new frame with the same columns. Rows come out in group-key order
        and the original index is not preserved.
    """
    key_cols = ["Typhoon Name", "Year", "Region", "Province", "City/Municipality", "base_name"]
    group_keys = ['Typhoon Name', 'Year', 'Region', 'Province', 'base_name']

    df = df.copy()
    df['base_name'] = df['City/Municipality'].str.replace(r'\s*CITY$', '', regex=True)
    value_cols = [col for col in df.columns if col not in key_cols]

    rows_out = []

    # dropna=False: the output is rebuilt from the groups, so with the default
    # (NaN keys excluded) every row with a missing Region/Province/Year/
    # Typhoon Name silently vanished from the result.
    for _, group in df.groupby(group_keys, dropna=False):
        # na=False keeps the mask boolean even if a name is missing
        is_city = group['City/Municipality'].str.endswith("CITY", na=False)

        if not is_city.any():
            # no CITY row in this group: keep all rows as they are
            rows_out.extend(group.to_dict("records"))
            continue

        city_rows = group[is_city].copy()

        # copy base-row values into the first CITY row, only into empty cells
        for _, base_row in group[~is_city].iterrows():
            for col in value_cols:
                if pd.isna(base_row[col]):
                    continue
                current = city_rows.iloc[0][col]
                if pd.isna(current) or current in ["", 0]:
                    city_rows.iat[0, city_rows.columns.get_loc(col)] = base_row[col]

        rows_out.extend(city_rows.to_dict("records"))

    # Passing columns= keeps the schema even when there are no rows at all
    df_out = pd.DataFrame(rows_out, columns=df.columns)
    df_out = df_out.drop(columns='base_name', errors='ignore')

    return df_out


# Fold bare-name rows into their CITY counterparts (see merge_city_with_base)
df = merge_city_with_base(df)

# --- Step 5. Save cleaned dataframe ---
# Written to the working directory; index=False keeps the row index out of the sheet.
df.to_excel("fuzzy_typhoon_2024_data_cleaned.xlsx", index=False)

# Last expression of the cell: shows the first rows as the cell output
df.head()


City/Municipality Mapping:
{'BALINGASAG': 'BALINGASAG', 'CASIGURAN': 'CASIGURAN', 'DILASAG': 'DILASAG', 'LEGAZPI CITY': 'LEGAZPI CITY', 'PIODURAN': 'PIODURAN', 'TABACO CITY': 'TABACO CITY', 'BASUD': 'BASUD', 'LABO': 'LABO', 'MERCEDES': 'MERCEDES', 'STA ELENA': 'STA ELENA', 'BAAO': 'BAAO', 'CANAMAN': 'CANAMAN', 'GOA': 'GOA', 'NAGA CITY': 'NAGA CITY', 'PASACAO': 'PASACAO', 'SAN ANDRES': 'SAN ANDRES', 'VIRAC': 'VIRAC', 'BALENO': 'BALENO', 'MASBATE CITY': 'MASBATE CITY', 'MOBO': 'MOBO', 'PALANAS': 'PALANAS', 'BULAN': 'BULAN', 'MATNOG': 'MATNOG', 'PILAR': 'PILAR', 'BOGO': 'BOGO', 'DANAO': 'DANAO', 'SAN REMIGIO': 'SAN REMIGIO', 'BALANGIGA': 'BALANGIGA', 'DOLORES': 'DOLORES', 'MAYDOLONG': 'MAYDOLONG', 'LAVEZARES': 'LAVEZARES', 'SANTA MARGARITA': 'SANTA MARGARITA', 'BATANGAS CITY': 'BATANGAS CITY', 'CALACA': 'CALACA', 'LIPA': 'LIPA', 'MABINI': 'MABINI', 'MATAAS NA KAHOY': 'MATAAS NA KAHOY', 'NO BREAKDOWN': 'NO BREAKDOWN', 'PADRE GARCIA': 'PADRE GARCIA', 'ROSARIO': 'ROSARIO', 'SAN JOSE': 'SAN J

Unnamed: 0,Typhoon Name,Year,Region,Province,City/Municipality,Families,Person,Brgy,Dead,Injured/Ill,Missing,Totally,Partially,Total,Quantity,Type,Cost
0,AGHON,2024,10,Misamis Oriental,BALINGASAG,,,,1.0,1.0,0.0,,,,,,
1,AGHON,2024,3,Aurora,CASIGURAN,1.0,5.0,1.0,,,,,,,,,
2,AGHON,2024,3,Aurora,DILASAG,846.0,3384.0,7.0,,,,,,,,,
3,AGHON,2024,5,Albay,LEGAZPI CITY,1.0,5.0,1.0,,,,,,,,,
4,AGHON,2024,5,Albay,PIODURAN,657.0,727.0,1.0,,,,,,,727.0,Malong,181750.0


# Third Data Pre-Process


In [None]:
import pandas as pd

# Load your cleaned Excel
# NOTE(review): no cell in this notebook writes "fuzzy_typhoon_2020_2025.xlsx"
# (the previous stage saves a 2024-only file) - presumably the per-year files
# are combined outside the notebook; confirm and document that step.
df = pd.read_excel("fuzzy_typhoon_2020_2025.xlsx")

# --- Standardize text fields ---
# .str methods leave missing values as NaN (no astype(str) here)
df["Typhoon Name"] = df["Typhoon Name"].str.strip().str.upper()
df["Province"] = df["Province"].str.strip().str.upper()
df["City/Municipality"] = df["City/Municipality"].str.strip().str.upper()

# --- Keep only the needed columns ---
# One row per distinct (typhoon, year, province, municipality) combination
unique_places = (
    df[["Typhoon Name", "Year", "Province", "City/Municipality"]]
    .drop_duplicates()   # remove exact duplicates
    .reset_index(drop=True)
)

# --- Save to Excel ---
unique_places.to_excel("unique_provinces_municipalities_per_typhoon_year.xlsx", index=False)

print("Done! Unique typhoon-province-municipality list saved.")
print(unique_places.head(20))


Done! Unique typhoon-province-municipality list saved.
   Typhoon Name  Year       Province   City/Municipality
0          AMBO  2020   ILOCOS NORTE              BANGUI
1          AMBO  2020  NUEVA VIZCAYA              QUEZON
2          AMBO  2020         AURORA               BALER
3          AMBO  2020         AURORA           CASIGURAN
4          AMBO  2020         AURORA          DINALUNGAN
5          AMBO  2020         AURORA            DINGALAN
6          AMBO  2020         AURORA           DIPACULAO
7          AMBO  2020         AURORA        MARIA AURORA
8          AMBO  2020         AURORA            SAN LUIS
9          AMBO  2020        BULACAN             BULAKAN
10         AMBO  2020        BULACAN              OBANDO
11         AMBO  2020        BULACAN  SAN JOSE DEL MONTE
12         AMBO  2020         QUEZON          BUENAVISTA
13         AMBO  2020         QUEZON           CATANAUAN
14         AMBO  2020         QUEZON              MAUBAN
15         AMBO  2020         QUE

In [None]:
import pandas as pd
from collections import OrderedDict

# Load cleaned file
# NOTE(review): same externally provided workbook as the previous cell; it is
# not produced by any cell in this notebook.
df = pd.read_excel("fuzzy_typhoon_2020_2025.xlsx")

# Standardize
# NOTE(review): astype(str) turns missing values into the text "nan", which
# upper() makes "NAN" - such entries will be listed below as if they were
# real place names. The previous cell does not do this; confirm which is wanted.
df["Typhoon Name"] = df["Typhoon Name"].astype(str).str.strip().str.upper()
df["Province"] = df["Province"].astype(str).str.strip().str.upper()
df["City/Municipality"] = df["City/Municipality"].astype(str).str.strip().str.upper()

# Remove duplicates but keep first-seen order
df = df.drop_duplicates(subset=["Typhoon Name", "Year", "Province", "City/Municipality"])

# Build hierarchical mapping:
# (typhoon, year) -> province -> list of municipalities, all in first-seen order
typhoon_dict = OrderedDict()

for _, row in df.iterrows():
    key = (row["Typhoon Name"], row["Year"])
    if key not in typhoon_dict:
        typhoon_dict[key] = OrderedDict()
    if row["Province"] not in typhoon_dict[key]:
        typhoon_dict[key][row["Province"]] = []
    # Guard against repeats (rows were already de-duplicated on these columns above)
    if row["City/Municipality"] not in typhoon_dict[key][row["Province"]]:
        typhoon_dict[key][row["Province"]].append(row["City/Municipality"])

# Print nicely: one heading per typhoon/year, provinces indented, municipalities as bullets
for (typhoon, year), provinces in typhoon_dict.items():
    print(f"\nüå™Ô∏è {typhoon} ({year})")
    for province, municipalities in provinces.items():
        print(f"  üìç {province}")
        for muni in municipalities:
            print(f"     - {muni}")



üå™Ô∏è AMBO (2020)
  üìç ILOCOS NORTE
     - BANGUI
  üìç NUEVA VIZCAYA
     - QUEZON
  üìç AURORA
     - BALER
     - CASIGURAN
     - DINALUNGAN
     - DINGALAN
     - DIPACULAO
     - MARIA AURORA
     - SAN LUIS
  üìç BULACAN
     - BULAKAN
     - OBANDO
     - SAN JOSE DEL MONTE
  üìç QUEZON
     - BUENAVISTA
     - CATANAUAN
     - MAUBAN
     - TAYABAS
  üìç MARINDUQUE
     - BOAC
     - BUENAVISTA
     - GASAN
     - MOGPOG
     - SANTA CRUZ
     - TORRIJOS
  üìç ALBAY
     - BACACAY
     - CAMALIG
     - DARAGA
     - GUINOBATAN
     - JOVELLAR
     - LEGAZPI
     - LIBON
     - LIGAO
     - MALILIPOT
     - MALINAO
     - MANITO
     - OAS
     - PIO DURAN
     - POLANGUI
     - RAPU RAPU
     - SANTO DOMINGO
     - TABACO
     - TIWI
  üìç CAMARINES NORTE
     - BASUD
     - CAPALONGA
     - DAET
     - JOSE PANGANIBAN
     - LABO
     - MERCEDES
     - PARACALE
     - SAN LORENZO RUIZ
     - SAN VICENTE
     - SANTA ELENA
     - TALISAY
     - VINZONS
  üìç CAMAR

# Fourth Data Pre-Process | Unique Assistance Types

### Get Unique Assistance Type

In [None]:
import pandas as pd

# Load the combined workbook (must already exist in the working directory)
df = pd.read_excel("fuzzy_typhoon_2020_2025.xlsx")

# Get all unique assistance types (missing values excluded, first-seen order)
unique_types = df["Type"].dropna().unique()

# One-column frame so the list can be saved as a worksheet
unique_types_df = pd.DataFrame(unique_types, columns=["Assistance_Type"])

unique_types_df.to_excel("unique_assistance_types.xlsx", index=False)

print("Unique assistance types saved to unique_assistance_types.xlsx")


Unique assistance types saved to unique_assistance_types.xlsx


### Standardize Types

In [None]:
import re
import pandas as pd
from google.colab import files

# Load file: the list of distinct assistance types written by the previous cell
filename = "unique_assistance_types.xlsx"
df = pd.read_excel(filename)

def normalize_text(text):
    """Reduce a free-text assistance type to a compact upper-case label.

    Missing values yield ''. Any text mentioning AICS (or its spelled-out
    name) yields "AICS". Otherwise known typos are fixed, parenthesised
    notes, colour/box descriptors, quantities, units and container words are
    removed, "AND" becomes a comma and "OF" is dropped, words longer than
    three letters lose a trailing "S" (a few words are exempt), punctuation
    and spacing are tidied, and finally a small alias table is applied to
    labels that match one of its keys exactly.
    """
    if pd.isna(text):
        return ''

    cleaned = str(text).strip()

    # --- Step 1: Fix common spelling errors (applied in this order) ---
    typo_fixes = (
        ("Generatir", "Generator"),
        ("Repait", "Repair"),
        ("Vatious", "Various"),
        ("Hotmeals", "Hot Meal"),
        ("Hot Meals", "Hot Meal"),
    )
    for wrong, right in typo_fixes:
        cleaned = re.sub(wrong, right, cleaned, flags=re.IGNORECASE)

    # --- Step 2: Standardize AICS early ---
    mentions_acronym = re.search(r"\bAICS\b", cleaned, flags=re.IGNORECASE)
    if mentions_acronym or "ASSISTANCE TO INDIVIDUALS IN CRISIS SITUATION" in cleaned.upper():
        return "AICS"

    # --- Steps 3-7: removals and substitutions, order matters ---
    rewrite_rules = (
        (r"\s*\([^)]*\)", ""),                                                  # parenthesised notes
        (r"\s*-\s*(Orange|White|Black).*?$", ""),                               # "- <colour> ..." tail
        (r"\s*(Orange Box|White|Black)\s*$", ""),                               # trailing colour/box
        (r"\b\d+(\.\d+)?\s*(ML|L|LTRS?|LITERS?|PCS?|PIECES?)\b", ""),           # quantity + unit
        (r"\b\d+\b", ""),                                                       # bare numbers
        (r"\b(BOXES?|BOTTLES?|ROLLS?|SHEETS?|PCS?|PIECES?)\b", ""),             # container words
        (r"\bAND\b", ","),                                                      # AND -> comma
        (r"\bOF\b", ""),                                                        # drop OF
    )
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    # --- Step 8: Plural to singular conversion ---
    keep_as_is = ("AICS", "CLOTHES", "SARDINES", "VARIOUS", "SUPPLIES")

    def _singular(word):
        upper_word = word.upper()
        if upper_word in keep_as_is:
            return upper_word
        # Simple plural rule: drop a trailing S, but not from short words
        if upper_word.endswith("S") and len(upper_word) > 3:
            return upper_word[:-1]
        return upper_word

    cleaned = " ".join(_singular(word) for word in cleaned.split())

    # --- Step 9: Clean commas and spaces ---
    cleaned = re.sub(r"\b(AND|OF)\b", ",", cleaned)
    cleaned = cleaned.replace("-", " ")
    cleaned = re.sub(r"\s*,\s*", ", ", cleaned)   # normalize comma spacing
    cleaned = re.sub(r",\s*,+", ",", cleaned)     # collapse double commas
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # --- Step 10: Convert to UPPER CASE ---
    cleaned = cleaned.upper()

    # --- Step 11: Apply alias table (exact matches only) ---
    aliases = {
        "FAMILY PACK": "FAMILY FOOD PACK",
        "FOOD PACK": "FAMILY FOOD PACK",
        "HOT MEAL": "HOT MEAL",
        "HOT FOOD": "HOT MEAL",
        "HYGIENE": "HYGIENE KIT",
        "TRAUMA": "TRAUMA KIT",
        "CLOTHING": "CLOTHES",
        "RTEF": "READY TO EAT FOOD",
        "GENSET" : "GENERATOR SET",
        "NFI": "NON FOOD ITEM",
        "FNIS" : "FOOD, NON FOOD ITEM",
        "FNI" : "FOOD, NON FOOD ITEM"
    }
    return aliases.get(cleaned, cleaned)

# Apply normalization: adds a column next to the raw type so both can be compared
df["Assistance_Type_Normalized"] = df["Assistance_Type"].apply(normalize_text)

# Save to Excel
output_filename = "unique_assistance_types_normalized.xlsx"
df.to_excel(output_filename, index=False)

# Download cleaned file (Colab-only: triggers a browser download)
files.download(output_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Apply normalization to the full typhoon workbook.
# Relies on `pd`, `files` and `normalize_text` from the previous cell.
# NOTE(review): "fuzzy_typhoon_normalized.xlsx" is not written by any cell in
# this notebook - confirm where it comes from.
filename = "fuzzy_typhoon_normalized.xlsx"
df = pd.read_excel(filename)

# Missing Type values become '' here (normalize_text returns '' for NaN)
df["normalized_type"] = df["Type"].apply(normalize_text)

# Save to Excel
output_filename = "fuzzy_typhoon_normalized_ver2.xlsx"
df.to_excel(output_filename, index=False)

# Download cleaned file (Colab-only: triggers a browser download)
files.download(output_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Categorize Assistance Types and Merge Rows

In [None]:
import pandas as pd
import numpy as np

# Read the workbook whose rows are categorized and merged in this cell
df = pd.read_excel(io="fuzzy_typhoon_merge_rows.xlsx")

# Normalized assistance types grouped under the category they belong to.
# Group order and the order inside each group are kept so that the flattened
# dictionary below has the same insertion order as a hand-written one.
_types_by_category = {
    "Clothing Kit": (
        "ASSORTED USED CLOTHES",
        "FAMILY CLOTHING KIT",
        "MALONG",
        "RAINCOAT",
        "TOWEL",
    ),
    "Family Food Pack": (
        "ASSORTED CANNED GOOD",
        "ASSORTED FOOD ITEM",
        "BIHON",
        "BISCUIT",
        "BOTTLED WATER",
        "CANNED GOOD",
        "COOKING OIL",
        "DISTILLED WATER",
        "ENERGEN",
        "F/NFI FAMILY FOOD PACK FOR TOTALLY DAMAGED HOUSE",
        "FAMILY FOOD PACK",
        "FOOD",
        "FOOD ASSISTANCE",
        "FOOD ITEM",
        "FROZEN GOOD",
        "GROCERY PACK",
        "MEATLOAF",
        "MINERAL WATER",
        "NOODLE",
        "NUTRIBUN",
        "OTHER FOOD ITEM",
        "POTABLE WATER",
        "PURIFIED WATER",
        "RICE",
        "RICE ASSISTANCE",
        "RICE, CANNED GOOD, COFFEE",
        "SACK RICE, SARDINES",
        "SARDINES",
        "WATER",
    ),
    "Financial / Social Assistance": (
        "AICS",
        "BURIAL ASSISTANCE",
        "EMERGENCY SHELTER ASSISTANCE",
        "FINANCIAL",
        "HOUSING ASSISTANCE",
        "RELIEF ASSISTANCE",
    ),
    "Hygiene Kit": (
        "BATH SOAP",
        "DISPOSABLE FACE MASK",
        "FACE MASK",
        "FACE SHIELD",
        "HAND SANITIZER STERILIUM",
        "HYGIENE KIT",
        "HYGIENE PRODUCT",
        "SOAP",
    ),
    "Kitchen Kit": (
        "BUTANE CANISTER",
        "BUTANE GAS STOVE",
        "CANISTER",
        "COLLAPSIBLE WATER CONTAINER",
        "COLLAPSIBLE WATER CUP",
        "COMMUNITY KITCHEN",
        "JERRY CAN",
        "KITCHEN KIT",
        "KITCHEN SET",
        "PORTABLE STOVE",
        "WATER CONTAINER",
        "WATER FILTRATION KIT",
        "WATER TANK",
    ),
    "Livelihood": ("ABACA SEEDLING",),
    "Logistical Support": (
        "ACCOMMODATION, TRANSPORTATION",
        "DIESEL",
        "FUEL",
        "GASOLINE",
        "GENERATOR",
        "GENERATOR SET",
        "MEGAPHONE",
    ),
    "Medical": (
        "ALCOHOL",
        "ANTIGEN TEST KIT",
        "ASSORTED MEDICAL ITEM",
        "COTTON",
        "DOXYCYCLINE",
        "FIRST AID KIT",
        "MEDICAL ASSISTANCE",
        "PPE SET",
        "TRAUMA KIT",
    ),
    "Ready to Eat Food": (
        "HOT MEAL",
        "HOT MEAL, RICE, CANNED GOOD",
        "MEAL",
        "PACK LUNCH",
        "PACKED MEAL",
        "READY TO EAT FOOD",
    ),
    # Combined categories for entries that list items from two groups
    "Ready to Eat Food, Clothing Kit": (
        "HOT MEAL, MALONG",
        "HOT MEAL, WATER, CLOTHES",
    ),
    "Ready to Eat Food, Logistical Support": ("MEAL, ACCOMMODATION",),
    "Shelter Kit": (
        "CHAINSAW",
        "F/NFI EMERGENCY SHELTER KIT WITH TARP",
        "F/NFIS SHELTER REPAIR KIT",
        "FAMILY TENT",
        "GI",
        "MODULAR TENT",
        "SANDBAG",
        "SHELTER KIT",
        "SHELTER REPAIR KIT",
        "SOLAR LAMP",
        "TARPAULIN",
        "TENT",
    ),
    "Sleeping Kit": (
        "BLANKET",
        "F/NFI BLANKET",
        "HEAVY DUTY COT BED",
        "MAT",
        "MOSQUITO NET",
        "SLEEPING GEAR",
        "SLEEPING KIT",
        "SLEEPING SUPPLIES",
    ),
    # Special categories
    "No Breakdown": ("NO BREAKDOWN", "WITHOUT BREAKDOWN"),
    "Not Specified": ("NOT SPECIFIED",),
    "For Verification": ("FOR VERIFICATION",),
    # Catch-all
    "Others": (
        "BODY BAG",
        "BREASTFEEDING KIT",
        "CADAVER BAG",
        "CHILD FRIENDLY SPACE KIT",
        "ECO BAG",
        "F/NFI",
        "FAMILY KIT",
        "FNI KIT",
        "FOOD, NFI",
        "FOOD, NON FOOD ITEM",
        "KIT",
        "LAMINATED PACK",
        "LAMINATED SACK",
        "NON FOOD ITEM",
        "OTHER",
        "OTHER NFI",
        "OTHER NON FOOD ITEM",
        "PACK",
        "RELIEF GOOD",
        "RELIEF PACK",
        "SAKO LINE",
        "UTILITY",
        "VARIOUS",
    ),
}

# Full mapping dictionary: normalized assistance type -> category label
type_to_category = {
    assistance_type: category
    for category, assistance_types in _types_by_category.items()
    for assistance_type in assistance_types
}

def _sum_or_nan(values):
    """Sum one group's values; return NaN when every value in the group is missing."""
    return values.sum() if values.notna().any() else np.nan


def _sorted_unique_or_nan(values):
    """Return one group's distinct non-missing values as a sorted list, or NaN if there are none."""
    return sorted(set(values.dropna())) if values.notna().any() else np.nan


# Map types to categories; the category labels are stored upper-cased.
df["Category"] = df["Normalized_Type"].map(type_to_category).str.upper()

# `Series.map` turns every type that is missing from `type_to_category` into
# NaN, and the aggregation below drops NaN categories. Report those types so a
# gap in the dictionary does not silently lose a row's category.
unmapped_types = (
    df.loc[df["Category"].isna() & df["Normalized_Type"].notna(), "Normalized_Type"]
    .drop_duplicates()
    .tolist()
)
if unmapped_types:
    print(f"Warning: {len(unmapped_types)} type(s) without a category: {unmapped_types}")

# Grouping keys (kept same across rows)
group_cols = [
    "Typhoon Name", "Year", "Region", "Province", "City/Municipality",
    "Families", "Person", "Brgy", "Dead", "Injured/Ill",
    "Missing", "Totally", "Partially", "Total"
]

# Collapse rows that share all grouping keys. dropna=False keeps groups whose
# key columns contain missing values instead of discarding them.
# NOTE(review): "Normalized_Type" and "Category" hold Python lists after this
# step; presumably to_excel writes them as their string repr -- confirm that
# this is the intended format of the output file.
agg_df = (
    df.groupby(group_cols, dropna=False)
    .agg({
        "Quantity": _sum_or_nan,
        "Cost": _sum_or_nan,
        "Normalized_Type": _sorted_unique_or_nan,
        "Category": _sorted_unique_or_nan,
    })
    .reset_index()
)

# Save result
agg_df.to_excel("aggregated_output.xlsx", index=False)


### Check for duplicates


In [None]:
import pandas as pd

# Read the dataset to be checked
df = pd.read_excel(io="DATAFRAME-V1.xlsx")

# Key columns used to detect duplicates
key_cols = ["Typhoon Name", "Year", "Region", "Province", "City/Municipality"]

# keep=False flags every row of a repeated key combination, not only the
# second and later occurrences
is_duplicate = df.duplicated(subset=key_cols, keep=False)
duplicates = df.loc[is_duplicate].sort_values(by=key_cols)

# Report how many rows are affected, then show them
print(f"Found {len(duplicates)} duplicate rows based on keys.\n")
print(duplicates)


Found 0 duplicate rows based on keys.

Empty DataFrame
Columns: [Typhoon Name, Year, Region, Province, City/Municipality, Families, Person, Brgy, Dead, Injured/Ill, Missing, Totally, Partially, Total, Quantity, Cost, Type, Category]
Index: []


In [None]:
# Export the duplicate rows (index omitted) so they can be reviewed
duplicates.to_excel(excel_writer="duplicates_in_dataframe_v1.xlsx", index=False)