# 0: Imports

In [1]:
# General imports
import pandas as pd

# Data cleaning pipeline imports
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# 1. Data Loading

In [2]:
import pandas as pd

#traffic = pd.read_csv('/content/sample_data/sampled_traffic.csv')
#traffic = pd.read_csv('/content/sample_data/full_traffic.csv')

#traffic = pd.read_csv('sampled_traffic.csv')
traffic = pd.read_csv('full_traffic.csv')

In [3]:
traffic.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


In [4]:
len(traffic)

7728394

In [5]:
traffic.isna().sum()

ID                             0
Source                         0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  3402762
End_Lng                  3402762
Distance(mi)                   0
Description                    5
Street                     10869
City                         253
County                         0
State                          0
Zipcode                     1915
Country                        0
Timezone                    7808
Airport_Code               22635
Weather_Timestamp         120228
Temperature(F)            163853
Wind_Chill(F)            1999019
Humidity(%)               174144
Pressure(in)              140679
Visibility(mi)            177098
Wind_Direction            175206
Wind_Speed(mph)           571233
Precipitation(in)        2203586
Weather_Condition         173459
Amenity                        0
Bump      

# 2. Data Cleaning and Munging

In [6]:
traffic['Weather_Timestamp'].head()

0    2016-02-08 05:58:00
1    2016-02-08 05:51:00
2    2016-02-08 06:56:00
3    2016-02-08 07:38:00
4    2016-02-08 07:53:00
Name: Weather_Timestamp, dtype: object

In [7]:
traffic['Weather_Condition'].head()

0       Light Rain
1       Light Rain
2         Overcast
3    Mostly Cloudy
4    Mostly Cloudy
Name: Weather_Condition, dtype: object

#### Missing Value Handling Plan:
- Because `End_Lat` and `End_Lng` each have 3.4M+ missing values (44.02935% of the full dataset), we chose to drop these columns to focus just on the location of the accident.
- `Wind_Speed` and `Precipitation` also have a significant number of missing values. These columns cannot be simply or reliably imputed because of their dependence on time, location, and data collection agencies. However, we did not want to drop these columns because of their possible significance to traffic accident severity. We decided to investigate their relationship with accident severity level using statistical methods (box-plots and ANOVA).
- `Street`was dropped from the dataset. Given the granularity of the Lat/Lng coordinates, we did not think we needed this column for our analysis. We figured it could be added back somehow at the end of the analysis, perhaps by creating a unique ID code for each row given Lat/Lng to easily do this but we did not explore this step for this analysis.
- `City`, `Zipcode`, `Timezone`, `Airport_Code` can be easily imputed using python packages given the Lat/Lng coordinates. However, we do not see the need to keep all of these variables. We chose to retain `City` and `Zipcode` and impute their missing values in case we wanted to examine traffic accident severity with these columns.
- `Weather_Timestamp`, `Temperature`, `Humidity`, `Pressure`, `Visibility`, `Wind_Direction` each do not have excessive missingness. Of these variables, `Visibility` has the most missing values (177,098 missing, 2.29150% of total rows), so we decided to drop rows with missing values in these columns.
- `Sunrise_Sunset`, `Civil_Twilight`, `Nautical_Twilight`, `Astronomical_Twilight` may have some impact on traffic activity. These variables also likely have some impact on the drowsiness of drivers. However, we chose to drop these columns because they are time columns and to our knowledge, not simply ingested by machine learning algorithms. In the future, we could impute these values using the python `astral` package and figure out how to use this data in an analysis but we chose to focus our analysis on other variables.
- `Weather_Condition` is an important variable with 173,459 missing values (2.2444% of the full dataset). We considered imputing these values with a nearest-neighbors approach after grouping by city, but were unsure of how this approach would affect the truth in our dataset. We chose to impute mode by city into 'Weather_Condition' instead, understanding that we may be amplifying the effect of certain weather conditions.
- `Wind_Speed` has 36,987 missing values. This is not a significant portion of the dataset. We chose to drop na rows from this column.

In [8]:
traffic.columns

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time
import numpy as np

def get_city(lat, lon):
    if pd.isna(lat) or pd.isna(lon):
        return None
    try:
        lat = float(lat)
        lon = float(lon)
    except ValueError:
        return None
    if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
        return None
    geolocator = Nominatim(user_agent="my_agent")
    try:
        location = geolocator.reverse(f"{lat}, {lon}")
        if location and 'address' in location.raw:
            address = location.raw['address']
            return address.get('city') or address.get('town') or address.get('village')
    except (GeocoderTimedOut, GeocoderServiceError, ValueError):
        time.sleep(1)  # Wait for 1 second before retrying
    return None

def fill_missing_city(df):
    mask = df['City'].isna()
    missing_cities = df[mask]
    for idx, row in missing_cities.iterrows():
        city = get_city(row['Start_Lat'], row['Start_Lng'])
        if city:
            df.at[idx, 'City'] = city
    return df

columns_to_drop = ['End_Lat', 'End_Lng', 'Street', 'Timezone', 'Airport_Code', 
                   'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 
                   'Astronomical_Twilight', 'Wind_Chill(F)', 'Precipitation(in)']

def preprocess_data(X):
    X = X.drop(columns=columns_to_drop)
    X = fill_missing_city(X)
    X = X.dropna(subset=['City', 'Description'])
    X['Weather_Condition'] = X.groupby('City')['Weather_Condition'].transform(
        lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else 'Unknown')
    )
    columns_to_check_na = ['Weather_Timestamp', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)']
    X = X.dropna(subset=columns_to_check_na)
    return X

pipeline = Pipeline([
    ('preprocess', FunctionTransformer(preprocess_data, validate=False))
])

result = pipeline.fit_transform(traffic)
result_df = pd.DataFrame(result, columns=[col for col in traffic.columns if col not in columns_to_drop])
print("Shape before processing:", traffic.shape)
print("Shape after processing:", result_df.shape)

Shape before processing: (7728394, 46)
Shape after processing: (7095556, 35)


In [13]:
result_df.isna().sum()

ID                   0
Source               0
Severity             0
Start_Time           0
End_Time             0
Start_Lat            0
Start_Lng            0
Distance(mi)         0
Description          0
City                 0
County               0
State                0
Zipcode              0
Country              0
Weather_Timestamp    0
Temperature(F)       0
Humidity(%)          0
Pressure(in)         0
Visibility(mi)       0
Wind_Direction       0
Wind_Speed(mph)      0
Weather_Condition    0
Amenity              0
Bump                 0
Crossing             0
Give_Way             0
Junction             0
No_Exit              0
Railway              0
Roundabout           0
Station              0
Stop                 0
Traffic_Calming      0
Traffic_Signal       0
Turning_Loop         0
dtype: int64

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Assuming your dataframe is named results_df and the target variable is 'severity'

# Separate features and target
X = result_df.drop('Severity', axis=1)
y = result_df['Severity']

# Identify numerical and categorical columns
numerical_columns = X.select_dtypes(include=[np.number]).columns
categorical_columns = X.select_dtypes(exclude=[np.number]).columns

print("Numerical columns:", numerical_columns.tolist())
print("Categorical columns:", categorical_columns.tolist())

# Perform stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Combine features and target for each split
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Shuffle the rows of each dataframe
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the full datasets
train_df.to_csv('train_data_full.csv', index=False)
test_df.to_csv('test_data_full.csv', index=False)

# Save numerical and categorical datasets separately
train_df[numerical_columns].to_csv('train_data_numerical.csv', index=False)
test_df[numerical_columns].to_csv('test_data_numerical.csv', index=False)
train_df[categorical_columns].to_csv('train_data_categorical.csv', index=False)
test_df[categorical_columns].to_csv('test_data_categorical.csv', index=False)

# Save target variable separately
train_df['severity'].to_csv('train_target.csv', index=False)
test_df['severity'].to_csv('test_target.csv', index=False)

print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)
print("Datasets have been saved as CSV files")

Numerical columns: ['Start_Lat', 'Start_Lng', 'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)']
Categorical columns: ['ID', 'Source', 'Start_Time', 'End_Time', 'Description', 'City', 'County', 'State', 'Zipcode', 'Country', 'Weather_Timestamp', 'Wind_Direction', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']


KeyError: 'severity'