#lab3: CRISP-DM - Data Understanding

# Task 4: Data Quality Verification

In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display

file_path ='/content/drive/MyDrive/ public_emdat_project.csv'

df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display number of rows and columns to show the size of the Dataset
print("Dataset Dimensions:", df.shape)

Dataset Dimensions: (15784, 46)


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
## Create and display summary of dataset features (easy to read table)
df_info = pd.DataFrame({
    "Column Name": df.dtypes.index,
    "Data Type": df.dtypes.values,
    "Sample Values": [df[col].dropna().unique()[:3] for col in df.columns]  # Show up to 3 unique sample values
})

print("\nDataset Features:")
display(df_info)


Dataset Features:


Unnamed: 0,Column Name,Data Type,Sample Values
0,DisNo.,object,"[1999-9388-DJI, 1999-9388-SDN, 1999-9388-SOM]"
1,Historic,object,[No]
2,Classification Key,object,"[nat-cli-dro-dro, tec-tra-roa-roa, nat-hyd-flo..."
3,Disaster Group,object,"[Natural, Technological]"
4,Disaster Subgroup,object,"[Climatological, Transport, Hydrological]"
5,Disaster Type,object,"[Drought, Road, Flood]"
6,Disaster Subtype,object,"[Drought, Road, Riverine flood]"
7,External IDs,object,"[USGS:usp0009khm, USGS:usp0009m4u, USGS:usp000..."
8,Event Name,object,"[Hotel, Pacaya, Airbus A310]"
9,ISO,object,"[DJI, SDN, SOM]"


### **1. Accuracy Dimension**:


In [15]:
# --- Outlier Detection: Identify Unusual Numeric Entries ---
def detect_iqr_outliers(df, column):
    """
    Detects and displays outliers in a numeric column using the IQR method.
    """
    # Check if column exists and is numeric
    if column not in df.columns:
        print(f"[Error] '{column}' does not exist in the DataFrame.")
        return
    if not np.issubdtype(df[column].dtype, np.number):
        print(f"[Error] '{column}' is not a numeric column.")
        return

     # Calculate IQR
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    # Identify outliers
    outlier_mask = (df[column] < (q1 - 1.5 * iqr)) | (df[column] > (q3 + 1.5 * iqr))
    outliers = df[outlier_mask]
    outlier_percentage = outlier_mask.mean() * 100  # mean() gives proportion of True values

    # Print results
    print(f"== IQR Outlier Report for '{column}' ==")
    print(f"Number of outliers detected: {outliers.shape[0]}")
    print(f"Outlier percentage in 'column': {outlier_percentage:.2f}%")
    if not outliers.empty:
        print("Sample outliers:")
        print(outliers[[column]].head(5))

# usage:
detect_iqr_outliers(df, "Magnitude")
detect_iqr_outliers(df, "Total Deaths")
detect_iqr_outliers(df, "No. Injured")
detect_iqr_outliers(df, "Total Affected")

== IQR Outlier Report for 'Magnitude' ==
Number of outliers detected: 554
Outlier percentage in 'column': 3.51%
Sample outliers:
     Magnitude
20    350000.0
74     55000.0
114   100000.0
243   200000.0
279   114000.0
== IQR Outlier Report for 'Total Deaths' ==
Number of outliers detected: 1484
Outlier percentage in 'column': 9.40%
Sample outliers:
    Total Deaths
14         800.0
21         275.0
25         169.0
35          88.0
43          83.0
== IQR Outlier Report for 'No. Injured' ==
Number of outliers detected: 815
Outlier percentage in 'column': 5.16%
Sample outliers:
     No. Injured
33        2528.0
86         200.0
138        194.0
151        252.0
195        300.0
== IQR Outlier Report for 'Total Affected' ==
Number of outliers detected: 2149
Outlier percentage in 'column': 13.62%
Sample outliers:
    Total Affected
0         100000.0
1        2000000.0
2        1200000.0
4          70000.0
14       4500000.0


**Considerations:**

*   A significant number of outliers were detected in all examined features.
* Some outlier values (e.g., 350,000 for Magnitude or 2,528 for No. Injured) may be outside the plausible range for the context of your data.  
* **High counts of outliers could indicate:**
-- Data entry errors (e.g., extra zeros)
-- Inconsistent data collection methods.

*  Outliers can strongly influence statistical analysis and model performance













### **2. Completeness Dimension**:
Completeness measures if all the required data is present (i.e., no missing values in mandatory fields).

In [16]:
# Count missing values (NaN)
missing_values = df.isna().sum()

# Count empty string values ("" or just spaces)
empty_values = df.astype(str).map(lambda x: x.strip() == "" if pd.notna(x) else False).sum()

# Calculate total missing (NaN + Empty Strings)
total_missing = missing_values + empty_values

# Create a summary DataFrame
completeness_report = pd.DataFrame({
    "Total Values": len(df),
    "Missing (NaN)": missing_values,
    "Empty Strings": empty_values,
    "Total Missing": total_missing,
    "Missing %": (total_missing / len(df)) * 100
})

if completeness_report.empty:
        print("‚úÖ No missing values found in the dataset!")
else:
    print("\nüîç Completeness Check Report:")
    display(completeness_report)

    # Display sample rows with missing values
    for col in completeness_report.index:
        print(f"\nüí•Examples of Missing Data in {col}:")
        print(df[df[col].isna() | (df[col].astype(str).str.strip() == "")][["DisNo.", col]].head(3))


üîç Completeness Check Report:


Unnamed: 0,Total Values,Missing (NaN),Empty Strings,Total Missing,Missing %
DisNo.,15784,0,0,0,0.0
Historic,15784,0,0,0,0.0
Classification Key,15784,0,0,0,0.0
Disaster Group,15784,0,0,0,0.0
Disaster Subgroup,15784,0,0,0,0.0
Disaster Type,15784,0,0,0,0.0
Disaster Subtype,15784,0,0,0,0.0
External IDs,15784,13379,0,13379,84.763051
Event Name,15784,10829,0,10829,68.607451
ISO,15784,0,0,0,0.0



üí•Examples of Missing Data in DisNo.:
Empty DataFrame
Columns: [DisNo., DisNo.]
Index: []

üí•Examples of Missing Data in Historic:
Empty DataFrame
Columns: [DisNo., Historic]
Index: []

üí•Examples of Missing Data in Classification Key:
Empty DataFrame
Columns: [DisNo., Classification Key]
Index: []

üí•Examples of Missing Data in Disaster Group:
Empty DataFrame
Columns: [DisNo., Disaster Group]
Index: []

üí•Examples of Missing Data in Disaster Subgroup:
Empty DataFrame
Columns: [DisNo., Disaster Subgroup]
Index: []

üí•Examples of Missing Data in Disaster Type:
Empty DataFrame
Columns: [DisNo., Disaster Type]
Index: []

üí•Examples of Missing Data in Disaster Subtype:
Empty DataFrame
Columns: [DisNo., Disaster Subtype]
Index: []

üí•Examples of Missing Data in External IDs:
          DisNo. External IDs
0  1999-9388-DJI          NaN
1  1999-9388-SDN          NaN
2  1999-9388-SOM          NaN

üí•Examples of Missing Data in Event Name:
          DisNo. Event Name
0  1999-9

**Considerations :**

Based on the size of the available dataset and the difficulty in obtaining all the data, we have many missing values. Therefore, a number of columns will be removed; some of them do not assist us in the analysis, while others are being removed due to a high percentage of incompleteness exceeding 95%.

**The columns that contain a high percentage of missing data:**

 External IDs, Event Name, Origin	, Associated Types, AID Contribution ,  No. Homeless, Reconstruction Costs , Reconstruction Costs, Adjusted, Insured Damage,Insured Damage, Adjusted, Total Damage, Total Damage, Adjusted.


 **The columns that are irrelevant to our data mining goals:**

 Magnitude Scale,Latitude, Longitude, River Basin,Admin Units

 **The columns that contain an acceptable amount of missingness that we can handle are:**

  Location, Magnitude, Total Deaths, Start Month, Start Day, End Month, End Day, No. Injured, No. Affected, and Total Affected,CPI.

### **3.Consistency Dimension**:
Refresher: Consistency ensures that data across different columns or records does not conflict.

**Considerations:**

- Check for contradictory values (e.g., 0 quantity but non-zero total_price).
- Ensure formats are uniform (e.g., date formats, units of measurement).
- Document any irregularities.

Handling different data types:

- Numerical: Ensure values are in the correct unit.
- Categorical: Ensure consistent spelling or format.
- Date/Time: Standardize all formats



In [17]:
# Define a list of valid country names
valid_countries = [
    "Sudan", "Somalia", "Angola", "Brazil", "Guatemala",
    "Bangladesh", "Indonesia", "Bulgaria", "Egypt", "China", "India",
    "United States of America", "Philippines", "Nigeria", "Sint Maarten (Dutch part)",
    "Pakistan", "Russian Federation", "Saint Martin (French Part)", "Anguilla",
    "Cura√ßao", "Isle of Man", "Democratic Republic of the Congo", "Mexico",
    "Peru", "Viet Nam", "Niue", "Tokelau", "Montserrat", "Wallis and Futuna Islands",
    "Iran (Islamic Republic of)", "South Africa", "Afghanistan", "T√ºrkiye",
    "Colombia", "Saint Kitts and Nevis", "British Virgin Islands",
    "Netherlands Antilles", "Saint Helena", "Saint Barth√©lemy",
    "Japan", "Thailand", "Kenya", "Italy", "Uganda",
    "New Caledonia", "Antigua and Barbuda", "Bermuda",
    "Turkmenistan", "French Guiana", "United Republic of Tanzania",
    "France", "Australia", "Nepal", "Haiti",
    "Grenada", "Finland", "United States Virgin Islands",
    "French Polynesia", "Mayotte", "Argentina",
    "Bolivia (Plurinational State of)", "Mozambique", "Malaysia",
    "Algeria", "Cook Islands", "American Samoa",
    "China, Macao Special Administrative Region", "Kuwait", "Kiribati",
    "Ethiopia", "Morocco", "Greece", "Republic of Korea",
    "Spain", "Dominica", "Palau", "Sao Tome and Principe", "Iceland", "Singapore",
    "Ecuador", "Canada", "Sri Lanka", "Yemen",
    "Taiwan (Province of China)", "Guam", "Northern Mariana Islands",
    "Eritrea", "Martinique", "Qatar", "Madagascar",
    "Zimbabwe", "Niger", "Germany", "Cameroon",
    "Myanmar", "Romania", "Chile", "Ukraine", "Malawi",
    "Guinea", "Honduras", "United Kingdom of Great Britain and Northern Ireland",
    "Libya", "Burundi", "Mali", "Zambia",
    "Dominican Republic", "Senegal", "Ghana",
    "Venezuela (Bolivarian Republic of)", "Central African Republic",
    "Tajikistan", "Papua New Guinea", "Burkina Faso",
    "Saudi Arabia", "Poland", "El Salvador",
    "Cuba", "Nicaragua", "Panama", "Tunisia",
    "Rwanda", "Iraq", "Benin", "Chad",
    "Congo", "Paraguay", "Belgium", "Cambodia",
    "C√¥te d'Ivoire", "New Zealand", "Costa Rica",
    "Democratic People's Republic of Korea", "Portugal",
    "Switzerland", "Mauritania", "Syrian Arab Republic", "Sierra Leone",
    "Kazakhstan", "Mongolia", "Czechia", "Croatia",
    "Uruguay", "Hungary", "Lao People's Democratic Republic", "Bosnia and Herzegovina",
    "South Sudan", "Serbia", "Kyrgyzstan", "Austria",
    "Namibia", "Fiji", "Albania", "Slovakia",
    "Netherlands (Kingdom of the)", "Togo", "North Macedonia", "Puerto Rico",
    "Comoros", "Vanuatu", "Georgia", "Gambia",
    "China, Hong Kong Special Administrative Region", "Guinea-Bissau", "Liberia", "Djibouti",
    "Solomon Islands", "Jamaica", "Gabon", "Lebanon", "Oman",
    "Bahamas", "Jordan", "Azerbaijan", "Serbia Montenegro", "Eswatini",
    "Lesotho", "Equatorial Guinea", "Israel", "Ireland", "Tonga",
    "Timor-Leste", "Canary Islands", "Slovenia", "Belize", "Botswana",
    "Republic of Moldova", "Saint Vincent and the Grenadines", "Norway",
    "Cyprus", "United Arab Emirates", "Lithuania", "Latvia",
    "Armenia", "Cabo Verde", "State of Palestine", "Saint Lucia",
    "Uzbekistan", "Sweden", "Maldives", "Belarus", "Denmark",
    "Micronesia (Federated States of)", "R√©union", "Montenegro",
    "Marshall Islands", "Mauritius", "Malta", "Barbados",
    "Samoa", "Guyana", "Turks and Caicos Islands", "Bhutan",
    "Cayman Islands", "Seychelles", "Suriname", "Estonia",
    "Guadeloupe", "Trinidad and Tobago", "Luxembourg", "Tuvalu", "Bahrain","C√¥te d¬íIvoire"
]

# Check for inconsistent countries
inconsistent_countries = df[~df['Country'].isin(valid_countries)]

if inconsistent_countries.empty:
    print("All country names are consistent.")
else:
    # Print total number of inconsistencies
    print("Number of inconsistent country names:", len(inconsistent_countries))
    # Print unique inconsistencies count
    unique_inconsistencies = inconsistent_countries['Country'].value_counts()
    print("\nUnique inconsistent country names:")
    print(unique_inconsistencies)


All country names are consistent.


In [18]:
# Define valid values for each column
valid_disaster_groups = ["Natural", "Technological"]
valid_disaster_types = [
    "Flood", "Earthquake", "Hurricane", "Drought", "Fire", "Tsunami", "Storm",
    "Road", "Water", "Epidemic", "Extreme temperature", "Mass movement (wet)",
    "Mass movement (dry)", "Oil spill", "Glacial lake outburst flood",
    "Radiation", "Impact", "Animal incident", "Explosion (Industrial)", "Air",
    "Fire (Miscellaneous)", "Wildfire", "Rail", "Miscellaneous accident (General)",
    "Explosion (Miscellaneous)", "Collapse (Miscellaneous)", "Collapse (Industrial)",
    "Volcanic activity", "Fire (Industrial)", "Industrial accident (General)",
    "Gas leak", "Infestation", "Chemical spill", "Poisoning"
]
valid_disaster_subgroups = [
    "Severe", "Moderate", "Minor", "Hydrological", "Transport", "Meteorological",
    "Biological", "Miscellaneous accident", "Industrial accident", "Geophysical",
    "Climatological", "Extra-terrestrial"
]

# Check for inconsistencies in Disaster Group
inconsistent_groups = df[~df['Disaster Group'].isin(valid_disaster_groups)]

# Check for inconsistencies in Disaster Type
inconsistent_types = df[~df['Disaster Type'].isin(valid_disaster_types)]

# Check for inconsistencies in Disaster Subgroup
inconsistent_subgroups = df[~df['Disaster Subgroup'].isin(valid_disaster_subgroups)]

# Print results
if inconsistent_groups.empty and inconsistent_types.empty and inconsistent_subgroups.empty:
    print("All Disaster Group, Disaster Type, and Disaster Subgroup values are consistent.")
else:
    if not inconsistent_groups.empty:
        print("Number of inconsistent Disaster Group names:", len(inconsistent_groups))
        print("\nUnique inconsistent Disaster Group names:")
        print(inconsistent_groups['Disaster Group'].value_counts())

    if not inconsistent_types.empty:
        print("Number of inconsistent Disaster Type names:", len(inconsistent_types))
        print("\nUnique inconsistent Disaster Type names:")
        print(inconsistent_types['Disaster Type'].value_counts())

    if not inconsistent_subgroups.empty:
        print("Number of inconsistent Disaster Subgroup names:", len(inconsistent_subgroups))
        print("\nUnique inconsistent Disaster Subgroup names:")
        print(inconsistent_subgroups['Disaster Subgroup'].value_counts())

All Disaster Group, Disaster Type, and Disaster Subgroup values are consistent.


### **4. Timeliness Dimension**
Refresher: Timeliness checks if the data is up-to-date or relevant to the current time frame.

**Considerations:**

-Check the recency of data.

-Note any records that are suspiciously in the future or that are too old.

-Next steps might involve removing or archiving old data, or verifying future-dated entries

In [19]:
df['Last Update'] = pd.to_datetime(df['Last Update'], errors='coerce')

# Define the valid date range
min_date = pd.Timestamp("2002-01-01")
max_date = pd.Timestamp("2024-12-31")

# Filter rows where the 'timestamp' is out of valid range
outdated_LastUpdate = df[(df["Last Update"] < min_date) | (df["Last Update"] > max_date)]

if outdated_LastUpdate .empty:
    print("‚úÖ All updates are within the valid range.")
else:
    # Print total number of inconsistencies
    print("üîç Number of outdated timestamps:", len(outdated_LastUpdate ))
    # Print unique outdated years and their counts
    unique_outdated_years = outdated_LastUpdate ["Last Update"].dt.year.value_counts()
    print("\n‚ùå Unique outdated years:")
    print(unique_outdated_years)

‚úÖ All updates are within the valid range.


### **5. Uniqueness Dimension**



**Refresher:** Uniqueness checks if certain fields (e.g., IDs, email addresses) are duplicated when they should be unique.

**Considerations:**
- Check for duplicate rows or duplicate primary key values.
- If there are duplicates, note how many and consider if they should be removed or combined.

**Handling Different Data Types:**
- **Numerical:** Remove or merge duplicates based on business logic.
- **Categorical:** Ensure unique identifiers are not repeated.


In [20]:
# ================================
# 5. Uniqueness Dimension: Duplicate Check
# ================================

# 1) Exact duplicates across all columns (includes every copy)
exact_duplicates = df[df.duplicated(keep=False)]
print("üîç Number of exact duplicate rows:", len(exact_duplicates))

if not exact_duplicates.empty:
    print("\n‚ùå Examples of exact duplicate records:")
    display(exact_duplicates.head(2))


# 2) Partial duplicates based on the unique event identifier DisNo.
partial_duplicates = df[df.duplicated(
    subset=["DisNo."],
    keep=False
)]
print("\nüîé Number of potential partial duplicates (same DisNo.):", len(partial_duplicates))

if not partial_duplicates.empty:
    print("\nüîé Examples of potential partial duplicates:")

    display(
        partial_duplicates[["DisNo.","Event Name","ISO","Start Year"]]
        .drop_duplicates()
        .head(10)
    )


üîç Number of exact duplicate rows: 0

üîé Number of potential partial duplicates (same DisNo.): 0


**There are no duplicate records the Uniqueness dimension is satisfied, and every disaster event is represented by a single, non-repeating key**

---------------------------------------------------

### **6. Validity Dimension**

**Refresher:** Validity checks if the data follows the correct format or conforms to defined constraints

**Considerations:**
- Ensure values fall within acceptable ranges.



**Handling Different Data Types:**
- **Numerical:** Ensure numbers fall within valid ranges.


In [21]:
# =============================================================================
# Numerical Validity: Identify out-of-range numeric values in our disaster dataset
# with data‚Äêdriven thresholds
# =============================================================================

# List of numeric columns to validate
numeric_cols = [
    "No. Injured",
    "No. Affected",
    "No. Homeless",
    "Total Deaths",
    "Total Injured",
    "Total Affected",
    "Total Damage ('000 US$)",
    "Magnitude"
]

# 1) Check for negative values (all counts and damages should be ‚â• 0)
negatives = {
    col: df[df[col] < 0]
    for col in numeric_cols
    if col in df.columns
}

# 2) Compute data‚Äêdriven upper thresholds (99.5th percentile) except for Magnitude
thresholds = {}
for col in numeric_cols:
    if col not in df.columns:
        continue
    if col == "Magnitude":
        # fixed scientific upper bound
        thresholds[col] = 10.0
    else:
        # 99.5th percentile of the actual distribution
        thresholds[col] = df[col].quantile(0.995)

# 3) Identify outliers above those thresholds
outliers = {
    col: df[df[col] > thresh]
    for col, thresh in thresholds.items()
}

# 4) Print counts of invalid records
print("üîç Negative value counts:")
for col, bad in negatives.items():
    print(f"  ‚Ä¢ {col}: {len(bad)}")

print("\nüîç Outlier counts (above 99.5th percentile or fixed bound):")
for col, bad in outliers.items():
    print(f"  ‚Ä¢ {col} > {thresholds[col]:,}: {len(bad)}")

# 5) Display sample invalid records for inspection
for col, bad in negatives.items():
    if not bad.empty:
        print(f"\n‚ùå Sample negative values in '{col}':")
        display(bad[["DisNo.", col]].head(5))

for col, bad in outliers.items():
    if not bad.empty:
        print(f"\nüö© Sample outliers in '{col}' (> {thresholds[col]:,}):")
        display(bad[["DisNo.", col]].sort_values(col, ascending=False).head(5))

üîç Negative value counts:
  ‚Ä¢ No. Injured: 0
  ‚Ä¢ No. Affected: 0
  ‚Ä¢ No. Homeless: 0
  ‚Ä¢ Total Deaths: 0
  ‚Ä¢ Total Affected: 0
  ‚Ä¢ Total Damage ('000 US$): 0
  ‚Ä¢ Magnitude: 115

üîç Outlier counts (above 99.5th percentile or fixed bound):
  ‚Ä¢ No. Injured > 37,320.7450000036: 29
  ‚Ä¢ No. Affected > 20,000,000.0: 34
  ‚Ä¢ No. Homeless > 900,205.1049999977: 7
  ‚Ä¢ Total Deaths > 1,916.3299999999908: 64
  ‚Ä¢ Total Affected > 13,185,044.999999927: 59
  ‚Ä¢ Total Damage ('000 US$) > 28,750,000.0: 16
  ‚Ä¢ Magnitude > 10.0: 2454

‚ùå Sample negative values in 'Magnitude':


Unnamed: 0,DisNo.,Magnitude
254,2000-0277-RUS,-9.0
380,2000-0416-BOL,-9.0
386,2000-0422-URY,-7.5
387,2000-0423-CHL,-20.0
420,2000-0456-BRA,-5.0



üö© Sample outliers in 'No. Injured' (> 37,320.7450000036):


Unnamed: 0,DisNo.,No. Injured
3593,2004-0331-PER,1800000.0
12868,2019-0122-YEM,461542.0
6558,2008-0192-CHN,366596.0
7663,2010-0017-HTI,300000.0
8156,2010-0557-HTI,277451.0



üö© Sample outliers in 'No. Affected' (> 20,000,000.0):


Unnamed: 0,DisNo.,No. Affected
11184,2015-9618-IND,330000000.0
2544,2002-9349-IND,300000000.0
2846,2003-0315-CHN,150000000.0
7880,2010-0239-CHN,134000000.0
5926,2007-0268-CHN,105000000.0



üö© Sample outliers in 'No. Homeless' (> 900,205.1049999977):


Unnamed: 0,DisNo.,No. Homeless
4620,2005-0575-PAK,5000000.0
5325,2006-0417-IND,4000000.0
6790,2008-0423-IND,2400000.0
11426,2016-0267-IND,2000000.0
927,2001-0033-IND,1790000.0



üö© Sample outliers in 'Total Deaths' (> 1,916.3299999999908):


Unnamed: 0,DisNo.,Total Deaths
7663,2010-0017-HTI,222570.0
3939,2004-0659-IDN,165708.0
6549,2008-0184-MMR,138366.0
6558,2008-0192-CHN,87476.0
4620,2005-0575-PAK,73338.0



üö© Sample outliers in 'Total Affected' (> 13,185,044.999999927):


Unnamed: 0,DisNo.,Total Affected
11184,2015-9618-IND,330000000.0
2544,2002-9349-IND,300000000.0
2846,2003-0315-CHN,150146000.0
7880,2010-0239-CHN,134000000.0
5926,2007-0268-CHN,105004000.0



üö© Sample outliers in 'Total Damage ('000 US$)' (> 28,750,000.0):


Unnamed: 0,DisNo.,Total Damage ('000 US$)
8403,2011-0082-JPN,210000000.0
4509,2005-0467-USA,125000000.0
14876,2022-0614-USA,100000000.0
12019,2017-0362-USA,95000000.0
6558,2008-0192-CHN,85000000.0



üö© Sample outliers in 'Magnitude' (> 10.0):


Unnamed: 0,DisNo.,Magnitude
11033,2015-0486-BRA,40000000.0
11122,2015-0579-AGO,13025874.0
1107,2001-0227-RUS,2857000.0
9099,2012-0173-CHN,2643786.0
1284,2001-0426-CIV,2610994.0


our data-driven numerical validity checks

 we confirmed that none of the count or damage columns contain negative values, so there are no impossible negative entries in the dataset.

  By using 99.5th-percentile thresholds that adapt to the actual distributions, we flagged only a very small number of extreme values per column‚Äîjust a few dozen events even for the largest counts or damages.
  
   We intentionally capped Magnitude at 10.0 (the scientific maximum for earthquake scales), so any readings outside 0‚Äì10 should be set to NaN and cross-checked against source records.






