# LA Wildfire Prediction: Data Preprocessing


In [19]:

import pandas as pd
import numpy as np
import os
from datetime import datetime

# For displaying plots in the notebook
%matplotlib inline


## Data Loading
The following function loads the raw wildfire dataset from a CSV file and prints its shape and first rows.


In [21]:
def load_data(file_path):
    """Read the raw wildfire CSV at ``file_path`` into a DataFrame.

    Progress is reported on stdout: the path being read, the shape of the
    loaded frame, and a preview of its first rows.

    Args:
        file_path: Location of the CSV file; passed unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    print(f"Loading data from {file_path}...")
    raw_frame = pd.read_csv(file_path)
    print(f"Loaded data with shape: {raw_frame.shape}")

    # Preview the first rows of the data exactly as it was read.
    print("\nRaw Data Head:", raw_frame.head(), sep="\n")

    return raw_frame


## Date Conversion
Convert the `date` column to datetime format for time-based analysis.


In [23]:
def convert_date_columns(df):
    """Return a copy of ``df`` whose ``date`` column is parsed to datetime.

    The input frame is left untouched: the conversion is done with
    ``DataFrame.assign``, which builds a new frame instead of writing into
    the caller's object. This keeps the raw data available after the call
    and avoids chained-assignment warnings when ``df`` is a slice.

    Args:
        df: DataFrame containing a ``date`` column that ``pd.to_datetime``
            can parse.

    Returns:
        A new DataFrame with the same columns in the same order, where
        ``date`` holds the parsed datetime values.

    Raises:
        KeyError: If ``df`` has no ``date`` column.
    """
    print("Converting date columns...")
    return df.assign(date=pd.to_datetime(df['date']))


## Missing Value Handling
Handle missing values in the dataset by replacing them with 0.


In [25]:
def handle_missing_values(df):
    """Replace every missing value in ``df`` with 0.

    The fill is applied to all columns through ``DataFrame.fillna(0)``.
    Per-column counts of missing values are printed before and after the
    replacement; only columns with at least one missing value are listed.

    Args:
        df: DataFrame to clean.

    Returns:
        The DataFrame returned by ``df.fillna(0)``.
    """
    print("Handling missing values...")

    # Per-column null counts before filling; show only affected columns.
    nulls_before = df.isnull().sum()
    affected_before = nulls_before[nulls_before > 0]
    print(f"Missing values before replacement:\n{affected_before}")

    # Fill every gap with 0.
    filled = df.fillna(0)

    # Repeat the count on the filled frame to confirm nothing is left.
    nulls_after = filled.isnull().sum()
    affected_after = nulls_after[nulls_after > 0]
    print(f"Missing values after replacement:\n{affected_after}")

    return filled


## Data Integrity Checks
Perform data integrity checks to ensure data quality:
- Remove duplicates
- Fix negative precipitation values
- Remove records with unreasonable temperature values


In [27]:
def check_data_integrity(df, tmax_range=(-10, 120), tmin_range=(-10, 100)):
    """Run basic quality checks on ``df`` and return the cleaned frame.

    Steps, in order:

    1. Drop fully duplicated rows.
    2. If a ``PRCP`` column exists, set negative values to 0.
    3. If both ``TMAX`` and ``TMIN`` columns exist, remove rows whose value
       lies outside ``tmax_range`` / ``tmin_range`` (bounds inclusive).

    The caller's frame is never modified in place. Rows whose temperature is
    missing (NaN) are not treated as out of range and are kept, regardless of
    whether other rows are removed; missing values are expected to be handled
    by a separate step.

    Args:
        df: DataFrame to validate.
        tmax_range: ``(low, high)`` inclusive bounds accepted for ``TMAX``.
        tmin_range: ``(low, high)`` inclusive bounds accepted for ``TMIN``.

    Returns:
        A DataFrame with duplicates removed, negative precipitation zeroed
        and out-of-range temperature rows dropped. When nothing needs fixing
        the input object itself is returned.
    """
    print("Checking data integrity...")

    # Check for duplicates
    duplicates = df.duplicated().sum()
    print(f"Found {duplicates} duplicate rows")
    if duplicates > 0:
        df = df.drop_duplicates()
        print(f"Removed {duplicates} duplicate rows")

    # Check for invalid values in key columns
    print("Checking for invalid values in key columns...")

    # Ensure precipitation is non-negative
    if 'PRCP' in df.columns:
        negative_prcp = df['PRCP'] < 0
        invalid_prcp = negative_prcp.sum()
        if invalid_prcp > 0:
            print(f"Found {invalid_prcp} rows with negative precipitation")
            # Work on a copy so the frame passed in by the caller is not
            # silently altered when no duplicates were dropped above.
            df = df.copy()
            df.loc[negative_prcp, 'PRCP'] = 0
            print("Fixed negative precipitation values")

    # Ensure temperature values are within the accepted range
    if 'TMAX' in df.columns and 'TMIN' in df.columns:
        for column, (low, high) in (('TMAX', tmax_range), ('TMIN', tmin_range)):
            # Build the mask on the current frame so the reported count
            # matches the rows actually removed for this column.
            out_of_range = (df[column] < low) | (df[column] > high)
            invalid_count = out_of_range.sum()
            if invalid_count > 0:
                print(f"Found {invalid_count} rows with unreasonable {column} values")
                # Negating the mask (rather than re-testing with >= / <=)
                # keeps NaN rows, which are missing rather than unreasonable.
                df = df[~out_of_range]

    return df


## Column Dropping
Drop specified columns from the dataframe. In this case, we only drop 'fire_count' and keep 'Fire_Occurred'.


In [29]:
def drop_columns(df):
    """Remove the ``fire_count`` column from ``df`` when it is present.

    ``Fire_Occurred`` is deliberately not in the drop list and is therefore
    kept. A message is printed saying which columns were dropped, or that
    none of the requested columns were found.

    Args:
        df: DataFrame to trim.

    Returns:
        A new DataFrame without the dropped columns, or ``df`` itself when
        none of the columns to drop exist in it.
    """
    print("Dropping specified columns...")

    # Only fire_count is removed; Fire_Occurred stays in the frame.
    targets = ['fire_count']

    # Keep only the targets that are actually present in the frame.
    present = []
    for name in targets:
        if name in df.columns:
            present.append(name)

    if not present:
        print("Specified columns not found in the dataframe")
        return df

    trimmed = df.drop(columns=present)
    print(f"Dropped columns: {present}")
    return trimmed


## Save Processed Data
Save the processed dataframe to a CSV file.


In [31]:
def save_processed_data(df, output_path):
    """Write ``df`` to ``output_path`` as CSV, without the index.

    The parent directory of ``output_path`` is created when it does not
    exist. A path with no directory component (for example ``"out.csv"``)
    is written to the current working directory.

    Args:
        df: DataFrame to persist.
        output_path: Destination file path for the CSV.

    Returns:
        None. The file is written as a side effect, and the saved shape and
        the first rows are printed.
    """
    print(f"Saving processed data to {output_path}...")

    # Create directory if it doesn't exist. os.path.dirname returns "" for a
    # bare file name, and os.makedirs("") raises FileNotFoundError, so the
    # call is skipped in that case.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"Saved processed data with shape: {df.shape}")

    # Print the head of processed data
    print("\nProcessed Data Head:")
    print(df.head())


## Main Execution
Run the complete preprocessing pipeline.


In [33]:
def main(input_path="../data/raw/FINAL_LA_FIRE_ML_DATA.csv",
         output_path="../data/processed/processed_la_fire_data.csv"):
    """Run the complete preprocessing pipeline and return the result.

    The stages run in this order: load the raw CSV, convert the date column,
    replace missing values, run the integrity checks, drop the unwanted
    columns, and save the processed frame.

    Args:
        input_path: Path of the raw CSV to read. Defaults to the project's
            raw data file, so ``main()`` behaves as before.
        output_path: Path the processed CSV is written to. Defaults to the
            project's processed data file.

    Returns:
        The processed DataFrame that was written to ``output_path``.
    """
    # Load data
    df = load_data(input_path)

    # Preprocess data
    df = convert_date_columns(df)
    df = handle_missing_values(df)
    df = check_data_integrity(df)

    # Drop specified columns
    df = drop_columns(df)

    # Save processed data
    save_processed_data(df, output_path)

    print("Preprocessing completed successfully!")

    return df


In [35]:
# Execute the preprocessing pipeline and keep the returned, processed
# DataFrame in `preprocessed_df` for use in later cells.
preprocessed_df = main()


Loading data from ../data/raw/FINAL_LA_FIRE_ML_DATA.csv...
Loaded data with shape: (225016, 36)

Raw Data Head:
         date  fire_count  Fire_Occurred STATION NAME  AWND  DAPR  MDPR  PGTM  \
0  2014-12-27           0              0     NaN  NaN   NaN   NaN   NaN   NaN   
1  2014-12-28           0              0     NaN  NaN   NaN   NaN   NaN   NaN   
2  2014-12-29           0              0     NaN  NaN   NaN   NaN   NaN   NaN   
3  2014-12-30           0              0     NaN  NaN   NaN   NaN   NaN   NaN   
4  2014-12-31           0              0     NaN  NaN   NaN   NaN   NaN   NaN   

   PRCP  ...  WT11  year  month  PRCP_7D  AWND_7D  PRCP_prev  AWND_prev  \
0   NaN  ...   NaN   NaN    NaN      NaN      NaN        NaN        NaN   
1   NaN  ...   NaN   NaN    NaN      NaN      NaN        NaN        NaN   
2   NaN  ...   NaN   NaN    NaN      NaN      NaN        NaN        NaN   
3   NaN  ...   NaN   NaN    NaN      NaN      NaN        NaN        NaN   
4   NaN  ...   NaN   NaN  