In [1]:
%load_ext autoreload
%autoreload 2

In [46]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load data

In [47]:
df_d = pd.read_csv("../data/details/details_2022.csv.gz")
df_f = pd.read_csv("../data/fatalities/fatalities_2022.csv.gz")
df_l = pd.read_csv("../data/locations/locations_2022.csv.gz")

In [48]:
# Merge details + fatalities (many fatalities per event possible).
# Left join on EVENT_ID: every details row is kept, and an event with several
# fatality records is repeated once per record (fatality columns are NaN for
# events that have none).
df_year = df_d.merge(df_f, on="EVENT_ID", how="left")

# Merge with locations (many locations per event).
# Left join again, so an event is repeated once per matching location record.
# Columns that exist on both sides other than the join key (e.g. EPISODE_ID)
# come out with pandas' default '_x' / '_y' suffixes.
df_year = df_year.merge(df_l, on="EVENT_ID", how="left")

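From the documentation, each episode can have multiple events related to it. This means 'EPISODE_ID' is a broader classification, while 'EVENT_ID' is the true index of the details table, with all unique values. Both df_d and df_l carry an 'EPISODE_ID' column, so when we merge on 'EVENT_ID' pandas keeps both and names them 'EPISODE_ID_x' (from df_d) and 'EPISODE_ID_y' (from df_l). The two columns are equivalent wherever a location record exists ('EPISODE_ID_y' is missing for events without one, because the merge is a left join), so we can safely drop 'EPISODE_ID_y' and rename 'EPISODE_ID_x' back to 'EPISODE_ID'.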

In [102]:
df_year = df_year.rename(columns={'EPISODE_ID_x': 'EPISODE_ID'}).drop(columns=['EPISODE_ID_y'])

# Remove irrelevant columns

In [103]:
cols_to_remove = ['SOURCE', 'DATA_SOURCE', 'CATEGORY','EPISODE_NARRATIVE', 'EVENT_NARRATIVE','STATE_FIPS','CZ_FIPS','TOR_OTHER_CZ_NAME', 'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_STATE','WFO','TOR_OTHER_WFO']

In [104]:
df_year.drop(columns=cols_to_remove, errors='ignore', inplace=True)

# Create column categories

## ID columns

In [105]:
import re

In [106]:
ID_cols = [col for col in df_year.columns if re.search(r'_ID$', col.upper())]
ID_cols

['EPISODE_ID', 'EVENT_ID', 'FATALITY_ID']

In [107]:
df_year[ID_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53934 entries, 0 to 53933
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   EPISODE_ID   53934 non-null  int64  
 1   EVENT_ID     53934 non-null  int64  
 2   FATALITY_ID  520 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 1.2 MB


In [108]:
# Identifier columns are labels, not quantities: cast each of them to the
# categorical dtype in a single astype call (one dtype entry per ID column).
df_year = df_year.astype({id_col: 'category' for id_col in ID_cols})


In [109]:
df_year[ID_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53934 entries, 0 to 53933
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   EPISODE_ID   53934 non-null  category
 1   EVENT_ID     53934 non-null  category
 2   FATALITY_ID  520 non-null    category
dtypes: category(3)
memory usage: 3.5 MB


In [110]:
df_year[ID_cols]

Unnamed: 0,EPISODE_ID,EVENT_ID,FATALITY_ID
0,1104812,5165377,
1,1104812,5165378,
2,1104812,5165379,
3,1105342,5165449,
4,1101140,5172568,
...,...,...,...
53929,2414768,5126692,
53930,2414731,5127563,
53931,2414804,5127165,
53932,2414804,5127830,


# Timing columns

In [40]:
timing_cols = [col for col in df_year.columns 
               if any(key in col.upper() for key in ['YEAR', 'DATE', 'TIME', 'MONTH', 'DAY'])]
timing_cols

['BEGIN_YEARMONTH',
 'BEGIN_DAY',
 'BEGIN_TIME',
 'END_YEARMONTH',
 'END_DAY',
 'END_TIME',
 'YEAR',
 'MONTH_NAME',
 'BEGIN_DATE_TIME',
 'CZ_TIMEZONE',
 'END_DATE_TIME',
 'FAT_YEARMONTH',
 'FAT_DAY',
 'FAT_TIME',
 'FATALITY_DATE',
 'EVENT_YEARMONTH',
 'YEARMONTH']

In [41]:
df_year[timing_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53934 entries, 0 to 53933
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   BEGIN_YEARMONTH  53934 non-null  int64  
 1   BEGIN_DAY        53934 non-null  int64  
 2   BEGIN_TIME       53934 non-null  int64  
 3   END_YEARMONTH    53934 non-null  int64  
 4   END_DAY          53934 non-null  int64  
 5   END_TIME         53934 non-null  int64  
 6   YEAR             53934 non-null  int64  
 7   MONTH_NAME       53934 non-null  object 
 8   BEGIN_DATE_TIME  53934 non-null  object 
 9   CZ_TIMEZONE      53934 non-null  object 
 10  END_DATE_TIME    53934 non-null  object 
 11  FAT_YEARMONTH    520 non-null    float64
 12  FAT_DAY          520 non-null    float64
 13  FAT_TIME         520 non-null    float64
 14  FATALITY_DATE    520 non-null    object 
 15  EVENT_YEARMONTH  520 non-null    float64
 16  YEARMONTH        53934 non-null  int64  
dtypes: float64(4

In [42]:
df_year[timing_cols].head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,YEAR,MONTH_NAME,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,FAT_YEARMONTH,FAT_DAY,FAT_TIME,FATALITY_DATE,EVENT_YEARMONTH,YEARMONTH
0,200012,31,600,200012,31,900,2000,December,31-DEC-00 06:00:00,EST,31-DEC-00 09:00:00,,,,,,200012
1,200012,31,600,200012,31,900,2000,December,31-DEC-00 06:00:00,EST,31-DEC-00 09:00:00,,,,,,200012
2,200012,31,700,200012,31,800,2000,December,31-DEC-00 07:00:00,EST,31-DEC-00 08:00:00,,,,,,200012
3,200012,13,2200,200012,14,400,2000,December,13-DEC-00 22:00:00,EST,14-DEC-00 04:00:00,,,,,,200012
4,200008,3,1410,200008,3,1410,2000,August,03-AUG-00 14:10:00,CST,03-AUG-00 14:10:00,,,,,,200008


In [43]:
'YEAR' in df_d.columns

True

In [44]:
'YEAR' in df_f.columns

False

In [45]:
'YEAR' in df_l.columns

False

In [46]:
[c for c in list(df_d.columns) if 'MONTH' in c]

['BEGIN_YEARMONTH', 'END_YEARMONTH', 'MONTH_NAME']

In [47]:
df_year[['BEGIN_YEARMONTH', 'END_YEARMONTH', 'YEAR','MONTH_NAME']]

Unnamed: 0,BEGIN_YEARMONTH,END_YEARMONTH,YEAR,MONTH_NAME
0,200012,200012,2000,December
1,200012,200012,2000,December
2,200012,200012,2000,December
3,200012,200012,2000,December
4,200008,200008,2000,August
...,...,...,...,...
53929,200001,200001,2000,January
53930,200001,200001,2000,January
53931,200002,200002,2000,February
53932,200002,200002,2000,February


## Remove redundancies from year, month

In [48]:
df_year['MONTH'] = df_year['BEGIN_YEARMONTH'].astype(str).str[-2:].astype(int)

In [49]:
cols_to_drop = ['FAT_YEARMONTH', 'EVENT_YEARMONTH', 'YEARMONTH', 'BEGIN_YEARMONTH', 'END_YEARMONTH']
df_year.drop(columns=cols_to_drop, inplace=True)

# Remove them from timing_cols
timing_cols = [col for col in timing_cols if col not in cols_to_drop]

In [50]:
timing_cols.append('MONTH')

In [51]:
df_year[df_year['FAT_DAY'].notna()][timing_cols].head()

Unnamed: 0,BEGIN_DAY,BEGIN_TIME,END_DAY,END_TIME,YEAR,MONTH_NAME,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,FAT_DAY,FAT_TIME,FATALITY_DATE,MONTH
128,28,1200,28,1200,2000,January,28-JAN-00 12:00:00,CST,28-JAN-00 12:00:00,28.0,0.0,01/28/2000 12:00:00,1
132,25,0,28,0,2000,January,25-JAN-00 00:00:00,CST,28-JAN-00 00:00:00,28.0,0.0,01/28/2000 12:00:00,1
333,25,0,28,0,2000,January,25-JAN-00 00:00:00,CST,28-JAN-00 00:00:00,27.0,0.0,01/27/2000 12:00:00,1
537,14,100,14,900,2000,January,14-JAN-00 01:00:00,EST,14-JAN-00 09:00:00,14.0,0.0,01/14/2000 12:00:00,1
538,14,100,14,900,2000,January,14-JAN-00 01:00:00,EST,14-JAN-00 09:00:00,14.0,0.0,01/14/2000 12:00:00,1


In [52]:
df_year[timing_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53934 entries, 0 to 53933
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   BEGIN_DAY        53934 non-null  int64  
 1   BEGIN_TIME       53934 non-null  int64  
 2   END_DAY          53934 non-null  int64  
 3   END_TIME         53934 non-null  int64  
 4   YEAR             53934 non-null  int64  
 5   MONTH_NAME       53934 non-null  object 
 6   BEGIN_DATE_TIME  53934 non-null  object 
 7   CZ_TIMEZONE      53934 non-null  object 
 8   END_DATE_TIME    53934 non-null  object 
 9   FAT_DAY          520 non-null    float64
 10  FAT_TIME         520 non-null    float64
 11  FATALITY_DATE    520 non-null    object 
 12  MONTH            53934 non-null  int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 5.3+ MB


In [295]:
# For day-level analysis drop the time columns.
# NOTE: the '_TIME' substring also matches CZ_TIMEZONE, so it is dropped too.
time_cols = [col for col in timing_cols if '_TIME' in col]

# Compute the duration from the full timestamps *before* they are dropped.
# END_DAY - BEGIN_DAY is only valid when an event starts and ends in the same
# month (e.g. 31-JAN -> 02-FEB would give 2 - 31 + 1 = -28 instead of 3).
# The guard keeps the cell re-runnable once the timestamp columns are gone.
if {'BEGIN_DATE_TIME', 'END_DATE_TIME'}.issubset(df_year.columns):
    begin_date = pd.to_datetime(df_year['BEGIN_DATE_TIME'], errors='coerce').dt.floor('D')
    end_date = pd.to_datetime(df_year['END_DATE_TIME'], errors='coerce').dt.floor('D')
    # Inclusive day count: an event that begins and ends on the same day lasts 1 day.
    df_year['DURATION_DAYS'] = (end_date - begin_date).dt.days + 1

# errors='ignore' so that re-running the cell after the drop is a no-op.
df_year.drop(columns=time_cols, inplace=True, errors='ignore')
timing_cols = [col for col in timing_cols if col not in time_cols]

# Register the new column once, even if the cell is executed repeatedly.
if 'DURATION_DAYS' in df_year.columns and 'DURATION_DAYS' not in timing_cols:
    timing_cols.append('DURATION_DAYS')

In [54]:
df_year[timing_cols]

Unnamed: 0,BEGIN_DAY,END_DAY,YEAR,MONTH_NAME,FAT_DAY,FATALITY_DATE,MONTH,DURATION_DAYS
0,31,31,2000,December,,,12,1
1,31,31,2000,December,,,12,1
2,31,31,2000,December,,,12,1
3,13,14,2000,December,,,12,2
4,3,3,2000,August,,,8,1
...,...,...,...,...,...,...,...,...
53929,11,11,2000,January,,,1,1
53930,25,25,2000,January,,,1,1
53931,1,29,2000,February,,,2,29
53932,1,29,2000,February,,,2,29


In [55]:
df_d[['BEGIN_YEARMONTH','BEGIN_DATE_TIME','END_DATE_TIME']]

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DATE_TIME,END_DATE_TIME
0,200012,31-DEC-00 06:00:00,31-DEC-00 09:00:00
1,200012,31-DEC-00 06:00:00,31-DEC-00 09:00:00
2,200012,31-DEC-00 07:00:00,31-DEC-00 08:00:00
3,200012,13-DEC-00 22:00:00,14-DEC-00 04:00:00
4,200008,03-AUG-00 14:10:00,03-AUG-00 14:10:00
...,...,...,...
52002,200001,11-JAN-00 05:00:00,11-JAN-00 09:00:00
52003,200001,25-JAN-00 10:30:00,25-JAN-00 10:30:00
52004,200002,01-FEB-00 00:00:00,29-FEB-00 23:59:00
52005,200002,01-FEB-00 00:00:00,29-FEB-00 23:59:00


In [4]:
import calendar
def clean_timing_columns(df):
    """
    Clean the timing columns of a NOAA storm events DataFrame.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain YEAR, BEGIN_YEARMONTH, END_YEARMONTH, BEGIN_DATE_TIME,
        END_DATE_TIME and FAT_DAY.

    Returns
    -------
    pandas.DataFrame
        A new frame in which:
        - YEAR is cast to int;
        - BEGIN_MONTH / END_MONTH hold the last two digits of the
          corresponding *_YEARMONTH column as integers;
        - BEGIN_MONTH_NAME is an ordered categorical of month abbreviations
          (Jan < ... < Dec) derived from BEGIN_MONTH;
        - DURATION_DAYS is the inclusive number of calendar days between
          BEGIN_DATE_TIME and END_DATE_TIME;
        - FAT_DAY uses the nullable Int64 dtype;
        - every column whose name contains 'YEARMONTH', '_TIME' or '_DATE',
          as well as MONTH_NAME, has been dropped.
    """
    df = df.copy()

    # YEAR
    df['YEAR'] = df['YEAR'].astype(int)

    # Month = last two digits of YYYYMM. These are read from the `df` argument
    # (not from a notebook-level global) so the result always matches the input.
    df['BEGIN_MONTH'] = df['BEGIN_YEARMONTH'].astype(str).str[-2:].astype(int)
    df['END_MONTH'] = df['END_YEARMONTH'].astype(str).str[-2:].astype(int)

    # Ordered categorical so that sorting/grouping follows the calendar order
    # rather than the alphabetical order of the abbreviations.
    month_abbrs = list(calendar.month_abbr)[1:]  # Jan→Dec (index 0 is '')
    df['BEGIN_MONTH_NAME'] = pd.Categorical(
        df['BEGIN_MONTH'].map(lambda month: calendar.month_abbr[month]),
        categories=month_abbrs,
        ordered=True
    )

    # Parse the timestamps; unparseable values become NaT instead of raising.
    df['BEGIN_DATE_TIME'] = pd.to_datetime(df['BEGIN_DATE_TIME'], errors='coerce')
    df['END_DATE_TIME'] = pd.to_datetime(df['END_DATE_TIME'], errors='coerce')

    # Duration in whole calendar days, inclusive of both end points. Working on
    # full dates (not day-of-month numbers) handles events that cross a month.
    begin_date = df['BEGIN_DATE_TIME'].dt.floor('D')
    end_date = df['END_DATE_TIME'].dt.floor('D')
    df['DURATION_DAYS'] = (end_date - begin_date).dt.days + 1

    # FAT_DAY to Int64 (nullable integer) so missing values stay missing
    # without forcing the column to float.
    df['FAT_DAY'] = df['FAT_DAY'].astype('Int64')

    # Drop the timing columns that are now redundant.
    # NOTE(review): the '_TIME' key also matches CZ_TIMEZONE, so the time zone
    # column is dropped as well — confirm that this is intended.
    drop_cols = [c for c in df.columns
                 if any(key in c.upper() for key in ['YEARMONTH', '_TIME', '_DATE'])]
    df.drop(columns=drop_cols, inplace=True, errors='ignore')
    df.drop(columns=['MONTH_NAME'], inplace=True, errors='ignore')  # redundant with BEGIN_MONTH_NAME

    return df

In [5]:
df_clean_time = clean_timing_columns(df_year)

In [6]:
timing_cols = [col for col in df_clean_time.columns 
               if any(key in col.upper() for key in ['YEAR', 'DATE', 'TIME', 'MONTH', 'DAY'])]
timing_cols

['BEGIN_DAY',
 'END_DAY',
 'YEAR',
 'FAT_DAY',
 'BEGIN_MONTH',
 'END_MONTH',
 'BEGIN_MONTH_NAME',
 'DURATION_DAYS']

In [7]:
df_clean_time[timing_cols]

Unnamed: 0,BEGIN_DAY,END_DAY,YEAR,FAT_DAY,BEGIN_MONTH,END_MONTH,BEGIN_MONTH_NAME,DURATION_DAYS
0,31,31,2000,,12,12,Dec,1
1,31,31,2000,,12,12,Dec,1
2,31,31,2000,,12,12,Dec,1
3,13,14,2000,,12,12,Dec,2
4,3,3,2000,,8,8,Aug,1
...,...,...,...,...,...,...,...,...
53929,11,11,2000,,1,1,Jan,1
53930,25,25,2000,,1,1,Jan,1
53931,1,29,2000,,2,2,Feb,29
53932,1,29,2000,,2,2,Feb,29


In [89]:
df_clean_time[df_clean_time['FAT_DAY'].notna()][timing_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 520 entries, 128 to 53880
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   BEGIN_DAY         520 non-null    int64   
 1   END_DAY           520 non-null    int64   
 2   YEAR              520 non-null    int64   
 3   FAT_DAY           520 non-null    Int64   
 4   BEGIN_MONTH       520 non-null    int64   
 5   END_MONTH         520 non-null    int64   
 6   BEGIN_MONTH_NAME  520 non-null    category
 7   DURATION_DAYS     520 non-null    int64   
dtypes: Int64(1), category(1), int64(6)
memory usage: 33.9 KB


In [49]:
import sys
import os

# Add the src folder to Python path
sys.path.append(os.path.abspath("../src"))
from data_cleaner import drop_unwanted_cols, clean_id_cols, clean_timing_cols, clean_location_cols

In [243]:
df_year.shape

(84058, 71)

In [353]:
drop_unwanted_cols(df_year)

In [245]:
df_year.shape

(84058, 59)

In [354]:
clean_id_cols(df_year)

In [247]:
df_year[['EPISODE_ID','EVENT_ID', 'FATALITY_ID']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   EPISODE_ID   84058 non-null  category
 1   EVENT_ID     84058 non-null  category
 2   FATALITY_ID  1397 non-null   category
dtypes: category(3)
memory usage: 3.6 MB


In [248]:
df_year.shape

(84058, 58)

In [355]:
clean_timing_cols(df_year)

In [250]:
df_year.shape

(84058, 49)

In [251]:
df_year.head()

Unnamed: 0,BEGIN_DAY,END_DAY,EPISODE_ID,EVENT_ID,STATE,YEAR,EVENT_TYPE,CZ_TYPE,CZ_NAME,INJURIES_DIRECT,...,AZIMUTH,LOCATION,LATITUDE,LONGITUDE,LAT2,LON2,BEGIN_MONTH,END_MONTH,BEGIN_MONTH_NAME,DURATION_DAYS
0,20,20,165464,999902,NEVADA,2022,High Wind,Z,SOUTHEASTERN ELKO,0,...,,,,,,,2,2,Feb,1
1,21,22,165465,999903,NEVADA,2022,Heavy Snow,Z,S LANDER & S EUREKA,0,...,,,,,,,2,2,Feb,2
2,22,22,165465,999904,NEVADA,2022,Heavy Snow,Z,N ELKO CNTY,0,...,,,,,,,2,2,Feb,1
3,18,18,165611,1001181,ATLANTIC SOUTH,2022,Waterspout,Z,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,0,...,SE,PONTE VEDRA,30.05,-81.17,303000.0,8110200.0,2,2,Feb,1
4,2,3,165668,1001527,AMERICAN SAMOA,2022,Heavy Rain,C,TUTUILA,0,...,NNW,VAITOGI,-14.333,-170.7157,-1419980.0,17042942.0,2,2,Feb,2


## Location Columns

In [6]:
location_cols = [col for col in df_year.columns 
                 if any(key in col.upper() for key in ['STATE', 'LAT', 'LON', 'LOCATION', 'RANGE', 'AZIMUTH', 'CZ_'])]

In [7]:
len(location_cols)

28

In [305]:
df_year[location_cols].head()

Unnamed: 0,STATE,CZ_TYPE,CZ_NAME,BEGIN_RANGE,BEGIN_AZIMUTH,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,...,END_LON,FATALITY_LOCATION,LOCATION_INDEX,RANGE,AZIMUTH,LOCATION,LATITUDE,LONGITUDE,LAT2,LON2
0,NEVADA,Z,SOUTHEASTERN ELKO,,,,,,,,...,,,,,,,,,,
1,NEVADA,Z,S LANDER & S EUREKA,,,,,,,,...,,,,,,,,,,
2,NEVADA,Z,N ELKO CNTY,,,,,,,,...,,,,,,,,,,
3,ATLANTIC SOUTH,Z,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,7.0,SE,PONTE VEDRA,7.0,SE,PONTE VEDRA,30.05,...,-81.17,,1.0,6.8,SE,PONTE VEDRA,30.05,-81.17,303000.0,8110200.0
4,AMERICAN SAMOA,C,TUTUILA,5.0,NNW,VAITOGI,5.0,NNW,VAITOGI,-14.333,...,-170.7268,,1.0,4.75,NNW,VAITOGI,-14.333,-170.7157,-1419980.0,17042942.0


In [306]:
df_year[location_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   STATE              84058 non-null  object 
 1   CZ_TYPE            84058 non-null  object 
 2   CZ_NAME            84058 non-null  object 
 3   BEGIN_RANGE        49441 non-null  float64
 4   BEGIN_AZIMUTH      49441 non-null  object 
 5   BEGIN_LOCATION     49441 non-null  object 
 6   END_RANGE          49441 non-null  float64
 7   END_AZIMUTH        49441 non-null  object 
 8   END_LOCATION       49441 non-null  object 
 9   BEGIN_LAT          49441 non-null  float64
 10  BEGIN_LON          49441 non-null  float64
 11  END_LAT            49441 non-null  float64
 12  END_LON            49441 non-null  float64
 13  FATALITY_LOCATION  1397 non-null   object 
 14  LOCATION_INDEX     39652 non-null  float64
 15  RANGE              39652 non-null  float64
 16  AZIMUTH            396

In [307]:
cols_to_drop = [ 'FATALITY_LOCATION']

In [308]:
df_year[df_year['FATALITY_LOCATION'].notna()][['STATE','CZ_NAME','FATALITY_LOCATION','LOCATION','BEGIN_LOCATION']].head(10)

Unnamed: 0,STATE,CZ_NAME,FATALITY_LOCATION,LOCATION,BEGIN_LOCATION
24,ARIZONA,SCOTTSDALE/PARADISE VALLEY,Unknown,,
25,ARIZONA,BUCKEYE/AVONDALE,Unknown,,
26,ARIZONA,NORTH PHOENIX/GLENDALE,Unknown,,
27,ARIZONA,CENTRAL PHOENIX,Unknown,,
28,ARIZONA,CENTRAL PHOENIX,Unknown,,
29,ARIZONA,SCOTTSDALE/PARADISE VALLEY,Unknown,,
30,ARIZONA,EAST VALLEY,Unknown,,
473,FLORIDA,COASTAL VOLUSIA COUNTY,In Water,,
1056,MISSISSIPPI,PANOLA,Outside/Open Areas,,
1115,NEVADA,LAS VEGAS VALLEY,Outside/Open Areas,,


In [309]:
df_year[['EVENT_TYPE','STATE', 'CZ_NAME', 'LOCATION', 'BEGIN_LOCATION', 'END_LOCATION']].head()

Unnamed: 0,EVENT_TYPE,STATE,CZ_NAME,LOCATION,BEGIN_LOCATION,END_LOCATION
0,High Wind,NEVADA,SOUTHEASTERN ELKO,,,
1,Heavy Snow,NEVADA,S LANDER & S EUREKA,,,
2,Heavy Snow,NEVADA,N ELKO CNTY,,,
3,Waterspout,ATLANTIC SOUTH,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,PONTE VEDRA,PONTE VEDRA,PONTE VEDRA
4,Heavy Rain,AMERICAN SAMOA,TUTUILA,VAITOGI,VAITOGI,VAITOGI


In [310]:
df_year[['EVENT_TYPE','STATE', 'CZ_NAME', 'LOCATION', 'BEGIN_LOCATION', 'END_LOCATION']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   EVENT_TYPE      84058 non-null  object
 1   STATE           84058 non-null  object
 2   CZ_NAME         84058 non-null  object
 3   LOCATION        39652 non-null  object
 4   BEGIN_LOCATION  49441 non-null  object
 5   END_LOCATION    49441 non-null  object
dtypes: object(6)
memory usage: 3.8+ MB


In [311]:
df_year[df_year['LOCATION'].notna()][['LOCATION','BEGIN_LOCATION','END_LOCATION']].head(10)

Unnamed: 0,LOCATION,BEGIN_LOCATION,END_LOCATION
3,PONTE VEDRA,PONTE VEDRA,PONTE VEDRA
4,VAITOGI,VAITOGI,VAITOGI
5,VAITOGI,VAITOGI,VAITOGI
6,VAITOGI,VAITOGI,VAITOGI
7,VAITOGI,VAITOGI,VAITOGI
694,CHANHASSEN,CHANHASSEN,CHANHASSEN
748,BELLVILLE,BELLVILLE,BELLVILLE
749,CLAXTON EVANS CO ARP,CLAXTON EVANS CO ARP,CLAXTON EVANS CO ARP
753,BAKER ARPT,BAKER ARPT,BAKER ARPT
754,SHANTY TOWN,SHANTY TOWN,SHANTY TOWN


In [312]:
df_year[(df_year['BEGIN_LOCATION']!=df_year['END_LOCATION'])]['BEGIN_LOCATION'].isna().sum()

np.int64(34617)

In [313]:
df_year[df_year['BEGIN_LOCATION']!=df_year['END_LOCATION']]['BEGIN_LOCATION'].shape

(41180,)

In [314]:
df_year[(df_year['BEGIN_LOCATION']!=df_year['END_LOCATION'])][['LOCATION','BEGIN_LOCATION','END_LOCATION']].head(10)

Unnamed: 0,LOCATION,BEGIN_LOCATION,END_LOCATION
0,,,
1,,,
2,,,
8,,,
9,,,
10,,,
11,,,
12,,,
13,,,
14,,,


In [315]:
df_year[df_year['FATALITY_LOCATION'].notna()][['STATE','CZ_NAME','FATALITY_LOCATION','LOCATION','BEGIN_LOCATION']].head(20)

Unnamed: 0,STATE,CZ_NAME,FATALITY_LOCATION,LOCATION,BEGIN_LOCATION
24,ARIZONA,SCOTTSDALE/PARADISE VALLEY,Unknown,,
25,ARIZONA,BUCKEYE/AVONDALE,Unknown,,
26,ARIZONA,NORTH PHOENIX/GLENDALE,Unknown,,
27,ARIZONA,CENTRAL PHOENIX,Unknown,,
28,ARIZONA,CENTRAL PHOENIX,Unknown,,
29,ARIZONA,SCOTTSDALE/PARADISE VALLEY,Unknown,,
30,ARIZONA,EAST VALLEY,Unknown,,
473,FLORIDA,COASTAL VOLUSIA COUNTY,In Water,,
1056,MISSISSIPPI,PANOLA,Outside/Open Areas,,
1115,NEVADA,LAS VEGAS VALLEY,Outside/Open Areas,,


In [316]:
[c for c in df_year.columns if 'AZIMUTH' in c]

['BEGIN_AZIMUTH', 'END_AZIMUTH', 'AZIMUTH']

In [317]:
df_year[['BEGIN_AZIMUTH', 'END_AZIMUTH', 'AZIMUTH']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   BEGIN_AZIMUTH  49441 non-null  object
 1   END_AZIMUTH    49441 non-null  object
 2   AZIMUTH        39652 non-null  object
dtypes: object(3)
memory usage: 1.9+ MB


In [318]:
[c for c in df_year.columns if 'RANGE' in c]

['BEGIN_RANGE', 'END_RANGE', 'RANGE']

In [319]:
cols_to_drop = [ 'FATALITY_LOCATION', 'BEGIN_RANGE', 'END_RANGE', 'RANGE','LATITUDE','LONGITUDE','LAT2','LON2','LOCATION_INDEX','BEGIN_AZIMUTH', 'END_AZIMUTH', 'AZIMUTH','END_LOCATION','LOCATION','END_LAT','END_LON']

In [320]:
df_year.drop(columns=cols_to_drop, errors='ignore', inplace=True)

In [321]:
# Keep the column-category list in sync with the frame: filter the dropped
# names out in place (slice assignment mutates the existing list object).
# A plain statement is used instead of a comprehension run for its side
# effects, which would echo a list of None values as the cell output.
location_cols[:] = [c for c in location_cols if c not in cols_to_drop]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [322]:
len(location_cols)

6

In [323]:
df_year[location_cols]

Unnamed: 0,STATE,CZ_TYPE,CZ_NAME,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON
0,NEVADA,Z,SOUTHEASTERN ELKO,,,
1,NEVADA,Z,S LANDER & S EUREKA,,,
2,NEVADA,Z,N ELKO CNTY,,,
3,ATLANTIC SOUTH,Z,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,PONTE VEDRA,30.0500,-81.1700
4,AMERICAN SAMOA,C,TUTUILA,VAITOGI,-14.3330,-170.7157
...,...,...,...,...,...,...
84053,LOUISIANA,C,ST. CHARLES,PARADIS,29.8575,-90.4491
84054,MISSISSIPPI,C,AMITE,HURON,31.0943,-90.6537
84055,GULF OF MEXICO,Z,COASTAL WATERS FROM PASCAGOULA MISSISSIPPI TO ...,MAIN PASS 289 AWOS (KVKY),29.5461,-88.8183
84056,GULF OF MEXICO,Z,COASTAL WATERS FROM STAKE ISLAND LOUISIANA TO ...,THUNDERHORSE PLATFORM (42887),28.6892,-89.2328


In [324]:
[c for c in location_cols if any(key in c for key in ['LAT', 'LON'])]

['BEGIN_LAT', 'BEGIN_LON']

In [325]:
df_year[location_cols].head()

Unnamed: 0,STATE,CZ_TYPE,CZ_NAME,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON
0,NEVADA,Z,SOUTHEASTERN ELKO,,,
1,NEVADA,Z,S LANDER & S EUREKA,,,
2,NEVADA,Z,N ELKO CNTY,,,
3,ATLANTIC SOUTH,Z,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,PONTE VEDRA,30.05,-81.17
4,AMERICAN SAMOA,C,TUTUILA,VAITOGI,-14.333,-170.7157


In [326]:
df_year[location_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   STATE           84058 non-null  object 
 1   CZ_TYPE         84058 non-null  object 
 2   CZ_NAME         84058 non-null  object 
 3   BEGIN_LOCATION  49441 non-null  object 
 4   BEGIN_LAT       49441 non-null  float64
 5   BEGIN_LON       49441 non-null  float64
dtypes: float64(2), object(4)
memory usage: 3.8+ MB


In [327]:
timing_cols = [col for col in df_year.columns 
               if any(key in col.upper() for key in ['YEAR', 'DATE', 'TIME', 'MONTH', 'DAY'])]
timing_cols

['BEGIN_DAY',
 'END_DAY',
 'YEAR',
 'FAT_DAY',
 'BEGIN_MONTH',
 'END_MONTH',
 'BEGIN_MONTH_NAME',
 'DURATION_DAYS']

In [328]:
df_year[timing_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   BEGIN_DAY         84058 non-null  int64   
 1   END_DAY           84058 non-null  int64   
 2   YEAR              84058 non-null  int64   
 3   FAT_DAY           1397 non-null   Int64   
 4   BEGIN_MONTH       84058 non-null  int64   
 5   END_MONTH         84058 non-null  int64   
 6   BEGIN_MONTH_NAME  84058 non-null  category
 7   DURATION_DAYS     84058 non-null  int64   
dtypes: Int64(1), category(1), int64(6)
memory usage: 4.7 MB


In [329]:
import re
ID_cols = [col for col in df_year.columns if re.search(r'_ID$', col.upper())]
df_year[ID_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   EPISODE_ID   84058 non-null  category
 1   EVENT_ID     84058 non-null  category
 2   FATALITY_ID  1397 non-null   category
dtypes: category(3)
memory usage: 3.6 MB


In [330]:
df_year['CZ_TYPE'].value_counts()

CZ_TYPE
C    46476
Z    37582
Name: count, dtype: int64

In [331]:
df_year[location_cols].head()

Unnamed: 0,STATE,CZ_TYPE,CZ_NAME,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON
0,NEVADA,Z,SOUTHEASTERN ELKO,,,
1,NEVADA,Z,S LANDER & S EUREKA,,,
2,NEVADA,Z,N ELKO CNTY,,,
3,ATLANTIC SOUTH,Z,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,PONTE VEDRA,30.05,-81.17
4,AMERICAN SAMOA,C,TUTUILA,VAITOGI,-14.333,-170.7157


In [332]:
df_year[location_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   STATE           84058 non-null  object 
 1   CZ_TYPE         84058 non-null  object 
 2   CZ_NAME         84058 non-null  object 
 3   BEGIN_LOCATION  49441 non-null  object 
 4   BEGIN_LAT       49441 non-null  float64
 5   BEGIN_LON       49441 non-null  float64
dtypes: float64(2), object(4)
memory usage: 3.8+ MB


In [359]:
clean_location_cols(df_year)

In [334]:
df_year[location_cols].head()

Unnamed: 0,STATE,CZ_TYPE,CZ_NAME,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON
0,NEVADA,Z,SOUTHEASTERN ELKO,,,
1,NEVADA,Z,S LANDER & S EUREKA,,,
2,NEVADA,Z,N ELKO CNTY,,,
3,ATLANTIC SOUTH,Z,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,PONTE VEDRA,30.05,-81.17
4,AMERICAN SAMOA,C,TUTUILA,VAITOGI,-14.333,-170.7157


In [335]:
location_cols

['STATE', 'CZ_TYPE', 'CZ_NAME', 'BEGIN_LOCATION', 'BEGIN_LAT', 'BEGIN_LON']

In [360]:
location_cols = [col for col in df_year.columns 
                 if any(key in col.upper() for key in ['STATE', 'LAT', 'LON', 'LOCATION', 'RANGE', 'AZIMUTH', 'CZ_'])]

In [361]:
location_cols

['STATE',
 'CZ_TYPE',
 'CZ_NAME',
 'BEGIN_LOCATION',
 'BEGIN_LAT',
 'BEGIN_LON',
 'LOCATION_LABEL']

In [343]:
clean_location_cols(df_year)

In [362]:
df_year[location_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   STATE           84058 non-null  category
 1   CZ_TYPE         84058 non-null  category
 2   CZ_NAME         84058 non-null  category
 3   BEGIN_LOCATION  49441 non-null  object  
 4   BEGIN_LAT       49441 non-null  float64 
 5   BEGIN_LON       49441 non-null  float64 
 6   LOCATION_LABEL  84058 non-null  object  
dtypes: category(3), float64(2), object(2)
memory usage: 3.0+ MB


In [366]:
df_year[location_cols].head()

Unnamed: 0,STATE,CZ_TYPE,CZ_NAME,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON,LOCATION_LABEL
0,NEVADA,Zone,SOUTHEASTERN ELKO,,,,"SOUTHEASTERN ELKO, NEVADA"
1,NEVADA,Zone,S LANDER & S EUREKA,,,,"S LANDER & S EUREKA, NEVADA"
2,NEVADA,Zone,N ELKO CNTY,,,,"N ELKO CNTY, NEVADA"
3,ATLANTIC SOUTH,Zone,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,PONTE VEDRA,30.05,-81.17,"PONTE VEDRA, FERNANDINA BEACH TO ST AUGUSTINE ..."
4,AMERICAN SAMOA,County,TUTUILA,VAITOGI,-14.333,-170.7157,"VAITOGI, TUTUILA, AMERICAN SAMOA"


## Damage Columns

In [50]:
drop_unwanted_cols(df_year)
clean_id_cols(df_year)
clean_timing_cols(df_year)
clean_location_cols(df_year)

In [51]:
df_year.head()

Unnamed: 0,BEGIN_DAY,END_DAY,EPISODE_ID,EVENT_ID,STATE,YEAR,EVENT_TYPE,CZ_TYPE,CZ_NAME,INJURIES_DIRECT,...,FAT_DAY,FATALITY_ID,FATALITY_TYPE,FATALITY_AGE,FATALITY_SEX,BEGIN_MONTH,END_MONTH,BEGIN_MONTH_NAME,DURATION_DAYS,LOCATION_LABEL
0,20,20,165464,999902,NEVADA,2022,High Wind,Zone,SOUTHEASTERN ELKO,0,...,,,,,,2,2,Feb,1,"SOUTHEASTERN ELKO, NEVADA"
1,21,22,165465,999903,NEVADA,2022,Heavy Snow,Zone,S LANDER & S EUREKA,0,...,,,,,,2,2,Feb,2,"S LANDER & S EUREKA, NEVADA"
2,22,22,165465,999904,NEVADA,2022,Heavy Snow,Zone,N ELKO CNTY,0,...,,,,,,2,2,Feb,1,"N ELKO CNTY, NEVADA"
3,18,18,165611,1001181,ATLANTIC SOUTH,2022,Waterspout,Zone,FERNANDINA BEACH TO ST AUGUSTINE FL OUT 20NM,0,...,,,,,,2,2,Feb,1,"PONTE VEDRA, FERNANDINA BEACH TO ST AUGUSTINE ..."
4,2,3,165668,1001527,AMERICAN SAMOA,2022,Heavy Rain,County,TUTUILA,0,...,,,,,,2,2,Feb,2,"VAITOGI, TUTUILA, AMERICAN SAMOA"


In [75]:
damage_cols = [col for col in df_year.columns 
               if any(key in col.upper() for key in ['DAMAGE', 'FATAL', 'DEATH', 'INJUR', 'LOSS', 'CROP'])
               and col.upper() != "FATALITY_ID"] # Fatality_ID is an identifier, not a damage column
damage_cols

['INJURIES_DIRECT',
 'INJURIES_INDIRECT',
 'DEATHS_DIRECT',
 'DEATHS_INDIRECT',
 'DAMAGE_PROPERTY',
 'DAMAGE_CROPS']

In [58]:
df_year[damage_cols].head()

Unnamed: 0,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,FATALITY_TYPE,FATALITY_AGE,FATALITY_SEX
0,0,0,0,0,0.00K,0.00K,,,
1,0,0,0,0,0.00K,0.00K,,,
2,0,0,0,0,0.00K,0.00K,,,
3,0,0,0,0,0.00K,0.00K,,,
4,0,0,0,0,50.00K,0.00K,,,


In [59]:
df_year[damage_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   INJURIES_DIRECT    84058 non-null  int64  
 1   INJURIES_INDIRECT  84058 non-null  int64  
 2   DEATHS_DIRECT      84058 non-null  int64  
 3   DEATHS_INDIRECT    84058 non-null  int64  
 4   DAMAGE_PROPERTY    67426 non-null  object 
 5   DAMAGE_CROPS       68028 non-null  object 
 6   FATALITY_TYPE      1397 non-null   object 
 7   FATALITY_AGE       725 non-null    float64
 8   FATALITY_SEX       837 non-null    object 
dtypes: float64(1), int64(4), object(4)
memory usage: 5.8+ MB


In [None]:
def clean_damage_cols(df):
    """
    Clean the damage-related columns of a NOAA storm events DataFrame in place.

    - Drops FATALITY_TYPE, FATALITY_AGE and FATALITY_SEX when present.
    - Converts DAMAGE_CROPS and DAMAGE_PROPERTY (when present) from strings
      such as '25K', '2.5M', '100B' to float dollar amounts. Missing, blank
      or unparseable entries become NaN.

    The injury/death count columns are not modified by this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is mutated in place.

    Returns
    -------
    None
    """
    # Drop unwanted columns
    # 'FATALITY_TYPE' is redundant with 'DEATHS_DIRECT' and 'DEATHS_INDIRECT'
    # We don't want to do demographic analysis so we can drop 'FATALITY_AGE' and 'FATALITY_SEX'
    cols_to_drop = ['FATALITY_TYPE', 'FATALITY_AGE', 'FATALITY_SEX']
    df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    missing = float("nan")

    def parse_damage(val):
        """
        Convert one damage string ('25K', '2.5M', '100B', '10') to dollars.
        Returns NaN for missing, blank or unparseable input.
        """
        if pd.isna(val):
            return missing
        text = str(val).upper().strip()  # normalize case and whitespace
        if not text:
            # Blank / whitespace-only cell: there is no last character to
            # inspect, so bail out before indexing text[-1].
            return missing
        suffix = text[-1]
        if suffix in multipliers:
            number, scale = text[:-1], multipliers[suffix]
        else:
            number, scale = text, 1
        try:
            return float(number) * scale
        except ValueError:
            return missing

    # Parse damage amount columns
    damage_amount_cols = ['DAMAGE_CROPS', 'DAMAGE_PROPERTY']
    for col in damage_amount_cols:
        if col in df.columns:
            parsed = df[col].map(parse_damage)
            # to_numeric guarantees a float dtype with NaN for missing values,
            # whatever dtype map() produced.
            df[col] = pd.to_numeric(parsed, errors='coerce')
           


    



In [87]:
def parse_damage(val):
    """
    Convert damage strings like '25K', '2.5M', '100B' into floats (dollars).

    The input is upper-cased and stripped first, so ' 2.5m ' is accepted.
    A value without a K/M/B suffix is parsed as a plain number.

    Returns pd.NA if the value is missing, blank, or not a valid number.
    """
    if pd.isna(val):
        return pd.NA
    text = str(val).upper().strip()  # normalize
    if not text:
        # Blank / whitespace-only input: guard before indexing text[-1],
        # which would otherwise raise IndexError.
        return pd.NA
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    suffix = text[-1]
    if suffix in multipliers:
        number, scale = text[:-1], multipliers[suffix]
    else:
        number, scale = text, 1
    try:
        return float(number) * scale
    except ValueError:
        return pd.NA


In [70]:
for i in range(20):
   print(df_year['DAMAGE_PROPERTY'].iloc[i], parse_damage(df_year['DAMAGE_PROPERTY'].iloc[i]))

0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
50.00K 50000.0
50.00K 50000.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0
0.00K 0.0


In [None]:
df_year[damage_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   INJURIES_DIRECT    84058 non-null  int64 
 1   INJURIES_INDIRECT  84058 non-null  int64 
 2   DEATHS_DIRECT      84058 non-null  int64 
 3   DEATHS_INDIRECT    84058 non-null  int64 
 4   DAMAGE_PROPERTY    67426 non-null  object
 5   DAMAGE_CROPS       68028 non-null  object
dtypes: int64(4), object(2)
memory usage: 3.8+ MB


In [106]:
df_year[df_year['DAMAGE_PROPERTY'].isna()]['DAMAGE_PROPERTY'].head(10)

37    <NA>
38    <NA>
39    <NA>
40    <NA>
41    <NA>
42    <NA>
43    <NA>
44    <NA>
45    <NA>
74    <NA>
Name: DAMAGE_PROPERTY, dtype: object

In [112]:
clean_damage_cols(df_year)

In [104]:
df_year[damage_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   INJURIES_DIRECT    84058 non-null  int64 
 1   INJURIES_INDIRECT  84058 non-null  int64 
 2   DEATHS_DIRECT      84058 non-null  int64 
 3   DEATHS_INDIRECT    84058 non-null  int64 
 4   DAMAGE_PROPERTY    67426 non-null  object
 5   DAMAGE_CROPS       68028 non-null  object
dtypes: int64(4), object(2)
memory usage: 3.8+ MB


In [86]:
df_year[['DAMAGE_CROPS','DAMAGE_PROPERTY']]

Unnamed: 0,DAMAGE_CROPS,DAMAGE_PROPERTY
0,0.00K,0.00K
1,0.00K,0.00K
2,0.00K,0.00K
3,0.00K,0.00K
4,0.00K,50.00K
...,...,...
84053,0.00K,35.00K
84054,0.00K,150.00K
84055,0.00K,0.00K
84056,0.00K,0.00K


In [100]:
df_year['DAMAGE_CROPS'].map(parse_damage)

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
84053    0.0
84054    0.0
84055    0.0
84056    0.0
84057    0.0
Name: DAMAGE_CROPS, Length: 84058, dtype: object

In [113]:
df_year['DAMAGE_CROPS']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
84053    0.0
84054    0.0
84055    0.0
84056    0.0
84057    0.0
Name: DAMAGE_CROPS, Length: 84058, dtype: float64

In [114]:
df_year[damage_cols]

Unnamed: 0,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS
0,0,0,0,0,0.0,0.0
1,0,0,0,0,0.0,0.0
2,0,0,0,0,0.0,0.0
3,0,0,0,0,0.0,0.0
4,0,0,0,0,50000.0,0.0
...,...,...,...,...,...,...
84053,0,0,0,0,35000.0,0.0
84054,0,0,0,0,150000.0,0.0
84055,0,0,0,0,0.0,0.0
84056,0,0,0,0,0.0,0.0


In [115]:
df_year[damage_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84058 entries, 0 to 84057
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   INJURIES_DIRECT    84058 non-null  int64  
 1   INJURIES_INDIRECT  84058 non-null  int64  
 2   DEATHS_DIRECT      84058 non-null  int64  
 3   DEATHS_INDIRECT    84058 non-null  int64  
 4   DAMAGE_PROPERTY    67426 non-null  float64
 5   DAMAGE_CROPS       68028 non-null  float64
dtypes: float64(2), int64(4)
memory usage: 3.8 MB


In [117]:
df_year['DAMAGE_PROPERTY'].sum()

np.float64(485005135750.0)