# 67818 Applied Competitive Lab in Data Science

## Final Project

### Participants:

- **Name:** Dan Badur
- **Student ID:** 209019256
- 
- **Name:** Tzur Breen
- **Student ID:** 209354919
- 
- **Name:** Shir Elbilia
- **Student ID:** 208621102
- 
- **Name:** Eliya Hasson
- **Student ID:** 208845032
- 
- **Name:** Tal Barda
- **Student ID:** 208729210

#### 1. Data Preparation and Feature Engineering:

We will start by defining a function for calculating the distance between each point in our database and a set of polygons in a wanted shapefile.
The function is set to run on multiple cors for better performance. 

For our original data containing 571000 rows for an 8 core CPU (the code utilizes 0.75 of the cores):
Reading the shapefiles takes about 5 minutes.
Calculating for the nature polygons takes about 3 minutes.
Calculating for the structure polygons takes about ??? minutes.
For the others it's about 30 seconds to 1 minute each.

In [2]:
from tqdm import tqdm
import geopandas as gpd
import pandas as pd
import numpy as np
import concurrent.futures
import math
from distance_calculations import calculate_distances_chunk
import os


data_df = pd.read_csv("data.csv")

total_cores = os.cpu_count()
cores_to_use = math.ceil(total_cores * 0.75)
num_cores = max(1, cores_to_use) 
WGS_84_GEO = "EPSG:4326"

def calculate_distances_gdf_to_polygon_parallel(points_gdf, polygons_gdf, target_name, attribute_name=None):

    num_chunks = num_cores * 4  
    points_chunks = np.array_split(points_gdf, num_chunks)  
    polygons_sindex = polygons_gdf.sindex
    
    with tqdm(total=len(points_chunks), desc="Processing chunks for "+target_name) as pbar:
        with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:
            futures = [executor.submit(
                calculate_distances_chunk,
                chunk.geometry,
                polygons_gdf,
                polygons_sindex,
                return_attribute=attribute_name,
            ) for chunk in points_chunks]
            
            results = []
            for future in concurrent.futures.as_completed(futures):
                results.extend(future.result())
                pbar.update(1)
    
    return results


data_gdf = gpd.GeoDataFrame(data_df, geometry=gpd.points_from_xy(data_df.LONGITUDE, data_df.LATITUDE), crs=WGS_84_GEO)

nature_gdf = gpd.read_file('Nature/Nature.shp')
powerline_gdf = gpd.read_file('Powerline/Powerline.shp')
camp_gdf = gpd.read_file('Campgrounds/Campgrounds.shp')
population_gdf = gpd.read_file('Population/Population.shp')
railroads_gdf = gpd.read_file('Railroads/Railroads.shp')
schools_gdf = gpd.read_file('PublicSchools/PublicSchools.shp')
mobile_home_parks_gdf = gpd.read_file('MobileHomeParks/MobileHomeParks.shp')

nature_gdf = nature_gdf.to_crs(WGS_84_GEO)
powerline_gdf = powerline_gdf.to_crs(WGS_84_GEO)
camp_gdf = camp_gdf.to_crs(WGS_84_GEO)
population_gdf = population_gdf.to_crs(WGS_84_GEO)
railroads_gdf = railroads_gdf.to_crs(WGS_84_GEO)
schools_gdf = schools_gdf.to_crs(WGS_84_GEO)
mobile_home_parks_gdf = mobile_home_parks_gdf.to_crs(WGS_84_GEO)

data_df['distance_to_nearest_nature_km'] = calculate_distances_gdf_to_polygon_parallel(data_gdf, nature_gdf, "Nature")
data_df['distance_to_nearest_powerline_km'] = calculate_distances_gdf_to_polygon_parallel(data_gdf, powerline_gdf, "Powerline")
data_df['distance_to_nearest_campground_km'] = calculate_distances_gdf_to_polygon_parallel(data_gdf, camp_gdf, "Campground")
data_df['distance_to_nearest_railroad_km'] = calculate_distances_gdf_to_polygon_parallel(data_gdf, railroads_gdf, "Railroad")
data_df['distance_to_nearest_school_km'] = calculate_distances_gdf_to_polygon_parallel(data_gdf, schools_gdf, "School")
data_df['distance_to_nearest_mobile_home_park_km'] = calculate_distances_gdf_to_polygon_parallel(data_gdf, mobile_home_parks_gdf, "MobileHomePark")

distance_from_population_and_class = calculate_distances_gdf_to_polygon_parallel(data_gdf, population_gdf, "Population", "POP_CLASS")
data_df['distance_to_nearest_population'], data_df['nearest_population_class'] = zip(*distance_from_population_and_class)

data_df.to_csv('data_with_distances.csv', index=False)

  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
Processing chunks for MobileHomePark: 100%|██████████| 24/24 [00:16<00:00,  1.42it/s]


Now we will calculate the averages for each cause of fire for every feature we calculated above: 

In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv("data_with_distances.csv")

# Get unique values in the STAT_CAUSE_DESCR column
unique_causes = df['STAT_CAUSE_DESCR'].unique()

for cause in unique_causes:
    # Filter the dataframe for the current cause
    cause_df = df[df['STAT_CAUSE_DESCR'] == cause]
    print(f"Calculating averages for {cause} ({len(cause_df)} rows):" )

    avg_nature_distance = cause_df['distance_to_nearest_nature_km'].mean()
    avg_powerline_distance = cause_df['distance_to_nearest_powerline_km'].mean()
    avg_camp_distance = cause_df['distance_to_nearest_campground_km'].mean()
    avg_population_distance = cause_df['distance_to_nearest_population'].mean()
    avg_city_population_class = cause_df['nearest_population_class'].mean()
    avg_distance_railroad= cause_df['distance_to_nearest_railroad_km'].mean()
    avg_school_distance = cause_df['distance_to_nearest_school_km'].mean()
    
    avg_mobile_home_park_distance = cause_df['distance_to_nearest_mobile_home_park_km'].mean()


    # Print the averages for the current cause
    print(f"Averages {cause}:")
    print(f"  Nature distance: {avg_nature_distance:.2f} km")
    print(f"  Campground distance: {avg_camp_distance:.2f} km")
    print(f"  Nearest population distance: {avg_population_distance:.2f} km")
    print(f"  Nearest population class: {avg_city_population_class:.2f}")
    print(f"  Powerline distance: {avg_powerline_distance:.2f} km")
    print(f"  School distance: {avg_school_distance:.2f} km")
    print(f"  Mobile home park distance: {avg_mobile_home_park_distance:.2f} km")

    print()

# print()
# df = pd.read_csv("data_with_distances.csv")
# avg_city_distance_by_fire_size = df.groupby('FIRE_SIZE_CLASS')['distance_to_nearest_population'].mean()
# 
# print("Average distance from the nearest population for each FIRE_SIZE_CLASS:")
# print(avg_city_distance_by_fire_size)

Calculating averages for Miscellaneous (108372 rows):
  Mobile home park distance: 12.97 km

Calculating averages for Arson (93304 rows):
  Mobile home park distance: 11.76 km

Calculating averages for Debris Burning (143074 rows):
  Mobile home park distance: 10.33 km

Calculating averages for Smoking (17571 rows):
  Mobile home park distance: 11.97 km

Calculating averages for Campfire (25367 rows):
  Mobile home park distance: 21.21 km

Calculating averages for Equipment Use (49423 rows):
  Mobile home park distance: 11.94 km

Calculating averages for Powerline (4733 rows):
  Mobile home park distance: 15.29 km

Calculating averages for Lightning (93057 rows):
  Mobile home park distance: 35.10 km

Calculating averages for Railroad (11053 rows):
  Mobile home park distance: 8.73 km

Calculating averages for Children (20354 rows):
  Mobile home park distance: 10.53 km

Calculating averages for Fireworks (3865 rows):
  Mobile home park distance: 22.03 km

Calculating averages for Stru

Next we will explore time related features. 