# 67818 Applied Competitive Lab in Data Science

## Final Project

### Participants:

- **Name:** Dan Badur
- **Student ID:** 209019256

- **Name:** Tzur Breen
- **Student ID:** 209354919

- **Name:** Shir Elbilia
- **Student ID:** 208621102

- **Name:** Eliya Hasson
- **Student ID:** 208845032

- **Name:** Tal Barda
- **Student ID:** 208729210

#### 1. Data Preparation and Feature Engineering:

We will start by defining a function that calculates the distance between each point in our dataset and a set of polygons from a given shapefile.
The function runs on multiple cores for better performance.

On our original data (571,000 rows) with an 8-core CPU (the code uses 75% of the cores, rounded up):
Calculating the distances to the nature polygons takes about 3 minutes.
Each of the other layers takes about 30 seconds to 1 minute.

In [1]:
from tqdm import tqdm
import geopandas as gpd
import pandas as pd
import numpy as np
import concurrent.futures
import math
from distance_calculations import calculate_distances_chunk
import os
from datetime import datetime, timedelta


# Load the raw records. Later cells read the LONGITUDE, LATITUDE, FIRE_YEAR,
# DISCOVERY_DOY and DISCOVERY_DATE columns of this frame.
data_df = pd.read_csv("data.csv")

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single core instead of failing on `None * 0.75`.
total_cores = os.cpu_count() or 1
# Use 75% of the available cores (rounded up), but never fewer than one worker.
cores_to_use = math.ceil(total_cores * 0.75)
num_cores = max(1, cores_to_use)
# CRS identifier used for the fire points and for every reprojected layer.
WGS_84_GEO = "EPSG:4326"

def calculate_distances_gdf_to_polygon_parallel(points_gdf, polygons_gdf, target_name, attribute_name=None):
    """Run ``calculate_distances_chunk`` over ``points_gdf`` in parallel worker processes.

    The points are split into ``num_cores * 4`` chunks. Each chunk's geometry is
    submitted to a process pool of ``num_cores`` workers together with
    ``polygons_gdf``, and a progress bar advances once per finished chunk.

    Args:
        points_gdf: GeoDataFrame of points; only its ``geometry`` is sent to the workers.
        polygons_gdf: GeoDataFrame passed unchanged to every worker call.
        target_name: Label appended to the progress-bar description.
        attribute_name: Forwarded to the worker as ``return_attribute``.

    Returns:
        A flat list holding the per-chunk results concatenated in the row order
        of ``points_gdf``. ``calculate_distances_chunk`` is expected to return
        one entry per point, since callers assign the list to a DataFrame column.
    """
    num_chunks = num_cores * 4
    points_chunks = np.array_split(points_gdf, num_chunks)

    # Chunks finish in arbitrary order, so each result is stored at the position
    # of the chunk it came from. Appending in completion order would misalign
    # the results with the rows of points_gdf.
    chunk_results = [None] * len(points_chunks)

    with tqdm(total=len(points_chunks), desc="Processing chunks for "+target_name) as pbar:
        with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:
            future_to_position = {
                executor.submit(
                    calculate_distances_chunk,
                    chunk.geometry,
                    polygons_gdf,
                    return_attribute=attribute_name,
                ): position
                for position, chunk in enumerate(points_chunks)
            }

            for future in concurrent.futures.as_completed(future_to_position):
                chunk_results[future_to_position[future]] = future.result()
                pbar.update(1)

    # Flatten the per-chunk results back into a single list in submission order.
    return [item for chunk_result in chunk_results for item in chunk_result]

# Build a point geometry for every record (x = LONGITUDE, y = LATITUDE) in WGS 84.
fire_points = gpd.points_from_xy(data_df.LONGITUDE, data_df.LATITUDE)
data_gdf = gpd.GeoDataFrame(data_df, geometry=fire_points, crs=WGS_84_GEO)

# Load every reference layer and reproject it to the CRS of the fire points.
# NOTE: the City layer ('City/City.shp') and its distance column are currently disabled.
nature_gdf = gpd.read_file('Nature/Nature.shp').to_crs(WGS_84_GEO)
powerline_gdf = gpd.read_file('Powerline/Powerline.shp').to_crs(WGS_84_GEO)
camp_gdf = gpd.read_file('Campgrounds/Campgrounds.shp').to_crs(WGS_84_GEO)
population_gdf = gpd.read_file('Population/Population.shp').to_crs(WGS_84_GEO)
railroads_gdf = gpd.read_file('Railroads/Railroads.shp').to_crs(WGS_84_GEO)
schools_gdf = gpd.read_file('PublicSchools/PublicSchools.shp').to_crs(WGS_84_GEO)

# (output column, layer, progress-bar label) for the distance-only layers,
# listed in the order in which the columns are added to data_df.
distance_layers = [
    ('distance_to_nearest_nature_km', nature_gdf, "Nature"),
    ('distance_to_nearest_powerline_km', powerline_gdf, "Powerline"),
    ('distance_to_nearest_campground_km', camp_gdf, "Campground"),
    ('distance_to_nearest_railroad_km', railroads_gdf, "Railroad"),
    ('distance_to_nearest_school_km', schools_gdf, "School"),
]
for column_name, layer_gdf, layer_label in distance_layers:
    data_df[column_name] = calculate_distances_gdf_to_polygon_parallel(data_gdf, layer_gdf, layer_label)

# The population layer is queried with the POP_CLASS attribute, so every result
# is unpacked into two columns (distance, class).
distance_from_population_and_class = calculate_distances_gdf_to_polygon_parallel(
    data_gdf, population_gdf, "Population", "POP_CLASS"
)
population_distances, population_classes = zip(*distance_from_population_and_class)
data_df['distance_to_nearest_population'] = population_distances
data_df['nearest_population_class'] = population_classes

# Persist the enriched table for the following cells.
data_df.to_csv('data_with_distances.csv', index=False)

ModuleNotFoundError: No module named 'geopandas'

Now we will calculate the averages for each cause of fire for every feature we calculated above: 

In [None]:
import pandas as pd

# Load the dataset once; both summaries below read from this same frame.
df = pd.read_csv("data_with_distances.csv")

# Get unique values in the STAT_CAUSE_DESCR column
unique_causes = df['STAT_CAUSE_DESCR'].unique()

# (printed label, source column, unit suffix) in the order the lines are printed.
summary_columns = [
    ("Nature distance", 'distance_to_nearest_nature_km', " km"),
    ("Campground distance", 'distance_to_nearest_campground_km', " km"),
    ("Nearest population distance", 'distance_to_nearest_population', " km"),
    ("Nearest population class", 'nearest_population_class', ""),
    ("Nearest railroad distance", 'distance_to_nearest_railroad_km', " km"),
    ("Powerline distance", 'distance_to_nearest_powerline_km', " km"),
    ("School distance", 'distance_to_nearest_school_km', " km"),
]

for cause in unique_causes:
    # Restrict the frame to the rows of the current cause
    cause_df = df[df['STAT_CAUSE_DESCR'] == cause]

    # Print the per-cause mean of every feature, two decimals each
    print(f"Averages {cause}:")
    for label, column, unit in summary_columns:
        print(f"  {label}: {cause_df[column].mean():.2f}{unit}")
    print()

print()
# Mean distance to the nearest population, grouped by fire size class.
avg_city_distance_by_fire_size = df.groupby('FIRE_SIZE_CLASS')['distance_to_nearest_population'].mean()

print("Average distance from the nearest population for each FIRE_SIZE_CLASS:")
print(avg_city_distance_by_fire_size)

Next we will explore time related features. 

We worked on adding and exploring features related to time.
We decided to check whether there is a connection between holidays and the type of the cause of the fire.
The rationale was that during holidays and vacations, people return home and may have more time to travel (campfire), or they simply get bored and do some stupid things (children, arson...).
Therefore, we downloaded a dataset that contains the dates of some holidays in the USA and added to our dataset the distance of every fire from the closest holiday in days.
The added features are "nearest_holiday", "days_from_closest_holiday".

In [None]:
# Load the US holiday calendar. Later cells use its 'Date', 'Year' and 'Holiday' columns.
holiday_dates_df = pd.read_csv("US Holiday Dates (2004-2021).csv")

In [None]:
# Parse the 'Date' column into datetimes so it can be subtracted from fire dates.
holiday_dates_df['Date'] = pd.to_datetime(holiday_dates_df['Date'])

# Index the holiday rows by year: {year -> DataFrame with that year's holidays}.
holiday_dict = {
    holiday_year: holiday_dates_df[holiday_dates_df['Year'] == holiday_year]
    for holiday_year in holiday_dates_df['Year'].unique()
}

def find_nearest_holiday(fire_year, discovery_doy):
    """Return the holiday closest to a fire's discovery date and the gap in days.

    The discovery date is rebuilt from the year and the 1-based day of year.
    Holidays of the fire's own year and, when present in ``holiday_dict``, of
    the neighbouring years are compared. This way a fire at the very end of
    December can match the following New Year's Day, and one in early January
    can match the previous year's late holidays.

    Args:
        fire_year: Year of the fire; must be a key of ``holiday_dict`` to get a match.
        discovery_doy: 1-based day of the year on which the fire was discovered.

    Returns:
        ``(holiday_name, days_difference)``, or ``("No Data", "No Data")`` when
        ``holiday_dict`` has no entry for ``fire_year``.
    """
    if fire_year not in holiday_dict:
        return ("No Data", "No Data")

    # Coerce to plain ints: datetime()/timedelta() may reject numpy scalar types.
    fire_year = int(fire_year)
    fire_date = datetime(fire_year, 1, 1) + timedelta(days=int(discovery_doy) - 1)

    nearest_holiday = None
    min_days_diff = float('inf')
    # The fire's own year is scanned first, so on a tie (strict '<' below) a
    # same-year holiday wins over one from a neighbouring year.
    for year in (fire_year, fire_year - 1, fire_year + 1):
        year_holidays = holiday_dict.get(year)
        if year_holidays is None:
            continue
        # Iterate the two needed columns directly instead of building a Series per row.
        for holiday_date, holiday_name in zip(year_holidays['Date'], year_holidays['Holiday']):
            diff = abs((fire_date - holiday_date).days)
            if diff < min_days_diff:
                min_days_diff = diff
                nearest_holiday = holiday_name
    return (nearest_holiday, min_days_diff)

# Look up the nearest holiday for every wildfire record, row by row.
def _holiday_for_row(row):
    return find_nearest_holiday(row['FIRE_YEAR'], row['DISCOVERY_DOY'])

holiday_pairs = data_df.apply(_holiday_for_row, axis=1)

# Split the (holiday name, day gap) pairs into the two feature columns.
nearest_names, day_gaps = zip(*holiday_pairs)
data_df['nearest_holiday'] = nearest_names
data_df['days_from_closest_holiday'] = day_gaps

In [None]:
# Show the first 10 rows to inspect the newly added holiday columns.
data_df.head(10)

We also noticed that certain causes have more incidents at particular hours of the day.
We considered adding a "part_of_day" feature describing when the fire took place, but we realized that we already have the distribution by hour, so this feature would not be very useful.

We will also add here a season feature that will help predict the cause type:

In [None]:
# Reinterpret DISCOVERY_DATE as a day count from the Julian origin
# (unit='D', origin='julian') and store it back as pandas datetimes.
data_df['DISCOVERY_DATE'] = pd.to_datetime(data_df['DISCOVERY_DATE'], unit='D', origin='julian')
# Months that do not appear here (12, 1, 2 and anything unexpected) map to 'Winter'.
_MONTH_TO_SEASON = {
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}

def get_season(date):
    """Return the season name for ``date`` based on its ``month`` attribute.

    March-May is 'Spring', June-August 'Summer', September-November 'Fall';
    every other month value yields 'Winter'.
    """
    return _MONTH_TO_SEASON.get(date.month, 'Winter')
# Derive the season of every record from its converted discovery date.
data_df['Season'] = data_df['DISCOVERY_DATE'].apply(get_season)

In [None]:
# Show the first 10 rows to inspect the converted dates and the Season column.
data_df.head(10)