In [1]:
import numpy as np
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly import express as px
import plotly.graph_objs as go
# ML libraries - idk which one to use yet
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, mean_absolute_error, r2_score
from matplotlib.colors import ListedColormap, BoundaryNorm
from matplotlib.patches import Patch
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import preprocessing, tree
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import cKDTree


In [2]:
# NASA API
def get_nasa_power_data(lat, lon, start_date, end_date):
    """
    Fetches daily NASA POWER API data for one point and time range.

    The raw parameters are renamed to readable column names, tagged with the
    requested coordinates, and converted to the units used by the US Accidents
    data (inches, Fahrenheit, mph). The resulting frame is printed/displayed
    and returned.

    Args:
    - lat (float): Latitude of the location.
    - lon (float): Longitude of the location.
    - start_date (str): Start date in YYYYMMDD format.
    - end_date (str): End date in YYYYMMDD format.

    Returns:
    - Pandas DataFrame with one row per day and the columns 'date',
      'Snow_Precipitation', 'Dew_Point_2m', 'Total_Precipitation_mm',
      'Temperature_2m_C', 'Wind_Speed_2m', 'Rounded_Lat', 'Rounded_Lng',
      'Precipitation(in)', 'Temperature(F)' and 'Wind_Speed(mph)'.
      Values the API reports as missing are returned as NaN.

    Raises:
    - requests.HTTPError: if the API answers with an error status.
    - KeyError: if the response JSON lacks the expected
      ["properties"]["parameter"] section.
    """
    url = "https://power.larc.nasa.gov/api/temporal/daily/point"
    params = {
        # Several parameters can be requested at once as a comma-separated list
        "parameters": "PRECSNO,T2MDEW,PRECTOTCORR,T2M,WS2M",
        "community": "RE",
        "longitude": lon,
        "latitude": lat,
        "start": start_date,
        "end": end_date,
        "format": "JSON"
    }

    # A timeout keeps a stalled request from hanging the kernel forever, and
    # raise_for_status() reports API errors directly instead of letting them
    # surface later as a confusing KeyError on the JSON payload.
    response = requests.get(url, params=params, timeout=60)
    response.raise_for_status()
    data = response.json()

    # POWER marks missing observations with a sentinel (reported in the response
    # header, -999 by default). Left in place it would be unit-converted like a
    # real measurement, so it is turned into NaN before anything else happens.
    fill_value = data.get("header", {}).get("fill_value", -999.0)

    # The payload is {parameter: {date: value}}; transpose so each date is a row
    nasa_weather = pd.DataFrame.from_dict(data["properties"]["parameter"], orient="index").T
    nasa_weather = nasa_weather.replace(fill_value, np.nan)
    nasa_weather = nasa_weather.reset_index().rename(columns={"index": "date"})

    # Parse the YYYYMMDD keys and discard rows whose key is not a valid date
    nasa_weather["date"] = pd.to_datetime(nasa_weather["date"], format="%Y%m%d", errors="coerce")
    nasa_weather = nasa_weather.dropna(subset=["date"])

    nasa_weather = nasa_weather.rename(columns={
        "PRECSNO": "Snow_Precipitation",
        "T2MDEW": "Dew_Point_2m",
        "PRECTOTCORR": "Total_Precipitation_mm",
        "T2M": "Temperature_2m_C",
        "WS2M": "Wind_Speed_2m"
    })

    # Add Rounded_Lat and Rounded_Lng for merging
    nasa_weather['Rounded_Lat'] = lat
    nasa_weather['Rounded_Lng'] = lon

    # Unit conversions to match the US Accidents columns of the same name
    nasa_weather['Precipitation(in)'] = nasa_weather['Total_Precipitation_mm'] / 25.4 # mm to in
    nasa_weather['Temperature(F)'] = (nasa_weather['Temperature_2m_C'] * (9./5.)) + 32. # C to F
    nasa_weather['Wind_Speed(mph)'] = nasa_weather['Wind_Speed_2m'] * 2.237 # m/s to mph

    # Display DataFrame (header is printed once)
    print(f"\n Weather Data for Latitude {lat}, Longitude {lon}\n")
    display(nasa_weather)  # Works in Jupyter Notebook

    return nasa_weather

In [3]:
# Example: fetch one week (2024-01-01 to 2024-01-07) of daily weather for a
# single point; other locations can be fetched by changing the coordinates.
df_la = get_nasa_power_data(34.05, -118.25, "20240101", "20240107")  # Los Angeles


 Weather Data for Latitude 34.05, Longitude -118.25


 Weather Data for Latitude 34.05, Longitude -118.25



Unnamed: 0,date,Snow_Precipitation,Dew_Point_2m,Total_Precipitation_mm,Temperature_2m_C,Wind_Speed_2m,Rounded_Lat,Rounded_Lng,Precipitation(in),Temperature(F),Wind_Speed(mph)
0,2024-01-01,0.0,6.12,0.04,12.14,1.64,34.05,-118.25,0.001575,53.852,3.66868
1,2024-01-02,0.0,7.05,0.09,11.55,1.59,34.05,-118.25,0.003543,52.79,3.55683
2,2024-01-03,0.0,6.75,5.87,10.86,3.25,34.05,-118.25,0.231102,51.548,7.27025
3,2024-01-04,0.0,2.14,0.02,9.76,2.89,34.05,-118.25,0.000787,49.568,6.46493
4,2024-01-05,0.0,1.99,0.0,10.8,2.0,34.05,-118.25,0.0,51.44,4.474
5,2024-01-06,0.0,1.23,0.12,10.5,2.86,34.05,-118.25,0.004724,50.9,6.39782
6,2024-01-07,0.0,-0.09,0.26,7.77,5.29,34.05,-118.25,0.010236,45.986,11.83373


In [None]:
# Load the US Accidents CSV (path is relative to the notebook's working
# directory) and preview the first rows.
us_accidents = pd.read_csv('US_Accidents_March23.csv')
us_accidents.head()

In [None]:
# Columns that prepare_data() removes from the US Accidents data.
# Names that are commented out below are the ones that are kept.
drop_cols = ['ID',
            'Source',
            # 'Severity', # Severity = target column, 1-4, where 1 indicates the least impact on traffic
            'Start_Time',
            'End_Time',
            'Start_Lat',  
            'Start_Lng', 
            'End_Lat',
            'End_Lng',
            'Distance(mi)', # Distance(mi) = target column?, length of road extent affected by accident in miles
            'Description', # Description = human description of accident
            'Street', 
            'City', 
            'County',
            'State',
            'Zipcode',
            'Country',
            'Timezone',
            'Airport_Code',
            'Weather_Timestamp', # Weather_Timestamp = shows time-stamp of weather observation record (in local time)
            # 'Temperature(F)',
            'Wind_Chill(F)',
            'Humidity(%)',
            'Pressure(in)',
            'Visibility(mi)',
            'Wind_Direction',
            # 'Wind_Speed(mph)',
            # 'Precipitation(in)',
            'Weather_Condition',
            'Amenity',
            'Bump',
            'Crossing',
            'Give_Way',
            'Junction',
            'No_Exit',
            'Railway',
            'Roundabout',
            'Station',
            'Stop',
            'Traffic_Calming',
            'Traffic_Signal',
            'Turning_Loop',
            'Sunrise_Sunset', # day or night based on sunrise/sunset
            'Civil_Twilight', # day or night based on civil twilight
            'Nautical_Twilight', # day or night based on nautical twilight
            'Astronomical_Twilight'] # day or night based on astronomical twilight

In [None]:
# Name of the text-valued column that has to be encoded numerically before it
# can be used as an ML predictor, if it is kept.
# NOTE(review): nothing visible in this notebook reads this variable;
# prepare_data() hard-codes the 'Weather_Condition' name instead.
str_features = 'Weather_Condition'

In [None]:
def prepare_data(df, split, predictors=None, target=None):
    """
    Reduces the raw US Accidents data to LA-area rows and the modelling columns.

    Processing steps, in order:
      1. keep rows whose start coordinates fall inside the LA County bounding box
      2. drop exact duplicate rows
      3. drop rows whose 'Start_Time' cannot be parsed as a timestamp
      4. add 'Rounded_Lat' / 'Rounded_Lng' (start coordinates rounded to 2 decimals)
      5. remove every column listed in the notebook-level ``drop_cols``
      6. drop rows that still contain NaN in the remaining columns
      7. label-encode 'Weather_Condition' if that column was kept

    The frame passed in is never modified; all work happens on copies.

    Args:
    - df (DataFrame): Raw US Accidents dataset. Must contain 'Start_Time',
      'Start_Lat', 'Start_Lng' and every column named in ``drop_cols``.
    - split (boolean): if true, return predictor and target data separately.
    - predictors (list, optional): column names returned as X when ``split``
      is true. Defaults to an empty selection.
    - target (str or list, optional): column name(s) returned as y when
      ``split`` is true. Defaults to an empty selection.

    Returns:
    - DataFrame with the columns that survive ``drop_cols`` plus 'Rounded_Lat'
      and 'Rounded_Lng' when ``split`` is false.
    - Tuple (X, y) = (df[predictors], df[target]) when ``split`` is true.
    """
    # None sentinels instead of mutable [] defaults shared between calls
    predictors = [] if predictors is None else predictors
    target = [] if target is None else target

    # Filter for coordinates within LA County first: it is a cheap numeric test
    # and leaves far fewer rows for the more expensive steps below. The result
    # is the same as de-duplicating first, because duplicate rows share their
    # coordinates and therefore pass or fail the filter together.
    in_la_county = (df['Start_Lat'].between(33.7, 34.8)
                    & df['Start_Lng'].between(-119.0, -117.6))

    # .copy() detaches the working frame from the caller's data, so the column
    # assignments below neither alter the input nor raise SettingWithCopyWarning.
    df = df[in_la_county].drop_duplicates().copy()

    # Rows whose timestamp cannot be parsed become NaT and are removed
    df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
    df = df[df['Start_Time'].notnull()].copy()

    # Round latitude and longitude to 2 decimal places for approximate matching
    df['Rounded_Lat'] = df['Start_Lat'].round(2)
    df['Rounded_Lng'] = df['Start_Lng'].round(2)

    # Keep only relevant columns BEFORE dropping NaN rows, so that a missing
    # value in a column that is thrown away anyway does not cost a usable row.
    df = df.drop(drop_cols, axis=1).dropna()

    # Encode the text feature only if it is actually kept, and only after NaN
    # rows are gone so the encoder sees nothing but strings.
    if 'Weather_Condition' in df.columns:
        le = preprocessing.LabelEncoder()
        df = df.assign(Weather_Condition=le.fit_transform(df['Weather_Condition']))

    # TODO: earlier experiments derived accident-frequency and severity ranks per
    # (Rounded_Lat, Rounded_Lng, hour) and per (location, weather) group, plus a
    # combined time/weather risk score. They were disabled and are not applied.

    # for machine learning clean and prep
    if split:
        X = df[predictors]
        y = df[target]
        return X, y

    return df

Trying new neural network based off of https://www.kaggle.com/code/kelixirr/us-accidents-severity-prediction-end-to-end#Preparing-Our-Data-For-The-Model

(Note: I had to install TensorFlow to run this.)

`prepare_data` was changed to keep only the location and three weather attributes (the same ones that can be obtained from NASA).

In [None]:
from __future__ import absolute_import, division, print_function
import os
import sys
import tensorflow as tf

In [None]:
# Why this cell used to fail intermittently with
#   "ValueError: The least populated class in y has only 1 member ..."
# The first split below was unseeded, so the number of rows of the rarest
# Severity class that survived into `train` (and then through prepare_data's
# filtering) changed on every run. Whenever too few were left, the stratified
# splits further down could not place that class on both sides of a split.
# The split is now seeded (same result on every run) and classes that are too
# small to stratify are removed explicitly, with a message, instead of crashing.
from sklearn.model_selection import train_test_split

SPLIT_SEED = 30
# With test_size=0.2 a class needs at least 10 rows to contribute the 2 rows
# that the second stratified split requires from every class.
MIN_ROWS_PER_CLASS = 10

train, test = train_test_split(us_accidents, test_size=0.2, random_state=SPLIT_SEED)
df_train = prepare_data(train, split=False)
# predictors: ['Rounded_Lat','Rounded_Lng','Temperature(F)','Wind_Speed(mph)','Precipitation(in)']
X = df_train.drop(columns=['Severity']) 
# predicting severity of an accident at this location with current weather conditions
y = df_train['Severity'] 

# Remove Severity classes that are too small to be stratified across both splits
class_counts = y.value_counts()
rare_classes = class_counts[class_counts < MIN_ROWS_PER_CLASS].index
if len(rare_classes) > 0:
    print(f"Dropping Severity classes with fewer than {MIN_ROWS_PER_CLASS} rows: {sorted(rare_classes)}")
    enough_rows = ~y.isin(rare_classes)
    X, y = X[enough_rows], y[enough_rows]

# make training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=SPLIT_SEED, stratify=y_temp)

In [None]:
# Preview the predictor columns that the network will be trained on.
X_train.head()

In [None]:
# List the distinct Severity labels in the held-out split. At this point the
# labels are still the raw dataset values; they are shifted down by one in the
# later cells that build the TensorFlow datasets.
y_values = y_test.unique()
print(y_values)

In [None]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Shift Severity down by one so the class ids start at 0, as the sparse
# categorical loss expects. The shifted labels are always rebuilt from `y`
# (the untouched labels of the split cell) rather than from the current
# y_train / y_valid, so running this cell a second time does not shift the
# labels again. This relies on the row index being unique.
y_train = y.loc[y_train.index] - 1
y_valid = y.loc[y_valid.index] - 1

# Fit the scaler on the training predictors only; validation data is merely
# transformed, so no information from it leaks into the scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train))
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid_scaled, y_valid))

# Only the training data is shuffled; both sets are served in batches of 64
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(64)
valid_dataset = valid_dataset.batch(64)

In [None]:
# Distinct Severity labels in the raw training split (before prepare_data's
# filtering), used to confirm how many classes the classifier has to output.
unique_values = train['Severity'].unique()
print(unique_values) # 4 unique class for accident severity

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler


# Fully connected classifier: four hidden stages of decreasing width, each made
# of Dense(relu) -> BatchNormalization -> Dropout(0.3), then a softmax output.
hidden_units = (256, 128, 64, 32)

model = Sequential()
model.add(Input(shape=(X_train_scaled.shape[1],)))  # one input per scaled predictor
for units in hidden_units:
    model.add(Dense(units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
model.add(Dense(4, activation='softmax'))  # 4 unique classes for accident severity

model.summary()

In [None]:
# Compile with Adam (learning rate 0.001). The sparse categorical cross-entropy
# loss takes integer class labels directly, so the targets are not one-hot encoded.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

def lr_schedule(epoch):
    """Step-decay learning rate: start at 0.001 and halve it every 10 epochs.

    Args:
    - epoch (int): zero-based epoch index.

    Returns:
    - Learning rate for that epoch. The number of halvings is
      floor((1 + epoch) / 10), so the first drop applies at epoch index 9.
    """
    base_rate, decay_factor, step_length = 0.001, 0.5, 10
    completed_steps = np.floor((1 + epoch) / step_length)
    return base_rate * (decay_factor ** completed_steps)

# Callbacks: write the model with the lowest validation loss to disk, stop once
# validation loss has not improved for 5 epochs (restoring the best weights),
# and apply the step-decay learning-rate schedule.
checkpoint_path = 'best_model.keras'
checkpoint_dir = os.path.dirname(checkpoint_path)
checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                             monitor='val_loss',
                             mode='min',
                             save_best_only=True,
                             verbose=1)

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=5,
                               restore_best_weights=True,
                               verbose=1)
lr_scheduler = LearningRateScheduler(lr_schedule)

training_callbacks = [checkpoint, early_stopping, lr_scheduler]

history = model.fit(train_dataset,
                    validation_data=valid_dataset,
                    epochs=50,
                    callbacks=training_callbacks)

In [None]:
# Sanity check that the checkpoint file exists in the working directory.
os.path.isfile('best_model.keras') # make sure true before loading saved models in later cells

In [None]:
# Reload the checkpoint from disk and score it on the validation batches.
saved_model = tf.keras.models.load_model("best_model.keras")
val_loss, val_accuracy = saved_model.evaluate(valid_dataset) 
print(f"Validation Accuracy: {val_accuracy}")

In [None]:
# Shift Severity down by one to match the class ids used for training. The
# shifted labels are rebuilt from `y` (the untouched labels of the split cell)
# rather than from the current y_test, so running this cell a second time does
# not shift the labels again. This relies on the row index being unique.
y_test = y.loc[y_test.index] - 1
# Reuse the scaler fitted on the training data; never refit it on test data
X_test_scaled = scaler.transform(X_test)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_scaled, y_test)) 
test_dataset = test_dataset.batch(64)

In [None]:
# Loss and accuracy of the reloaded model on the held-out test batches.
test_loss, test_accuracy = saved_model.evaluate(test_dataset) 


In [None]:
from sklearn.metrics import accuracy_score, classification_report
# predict() returns one row of class scores per sample; argmax picks the index
# of the highest-scoring class, which is then compared against y_test.
# NOTE(review): this assumes y_test has already been shifted down by one in the
# earlier test-set cell — confirm that cell ran first.
y_pred = np.argmax(saved_model.predict(X_test_scaled), axis=1) 
print(classification_report(y_test, y_pred))

In [None]:
df_test = get_nasa_power_data(34.05, -118.25, "20240101", "20240107")
# Select the NASA columns by the training frame's own column list instead of a
# hand-written one. The scaler and the network were fitted on X_train, so the
# features must arrive with the same names in the same order; a different order
# is rejected by recent scikit-learn and would otherwise scale each value with
# another feature's statistics.
df = df_test[list(X_train.columns)]
df.head()

In [None]:
# Scale the NASA features with the scaler fitted on the training data, then
# score them with the reloaded model.
# NOTE(review): the output is presumably one row of 4 class scores per day, with
# index 0-3 standing for Severity 1-4 — confirm against the model definition.
nasa_test = scaler.transform(df)
saved_model.predict(nasa_test)