In [37]:
import requests
import pandas as pd
import sys
import os

In [38]:
# Download collision records from the data.cityofnewyork.us API endpoint and
# store the raw response as '../data/API_data.csv' for the cleaning step.

# URL of the dataset (API endpoint)
url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"

# Parameters to limit the response to 200,000 records
# NOTE(review): no ordering parameter is sent -- confirm the API returns rows
# in a stable order if this download needs to be reproducible.
params = {
    "$limit": 200000
}

# Send a GET request to the API.
# requests applies no timeout by default, so a stalled connection would hang
# the kernel indefinitely; bound it with (connect, read) timeouts in seconds.
response = requests.get(url, params=params, timeout=(10, 120))

# Check if the request was successful
if response.status_code == 200:
    data = response.json()  # Decode the JSON payload
    df = pd.DataFrame(data)  # One row per returned record
    print(df.head())  # Display the first few records
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs('../data', exist_ok=True)
    # Save the DataFrame to a CSV file
    df.to_csv('../data/API_data.csv', index=False, encoding='utf-8')
else:
    # If the request fails, print the error code
    print(f"Error in the request: {response.status_code}")

                crash_date crash_time           on_street_name  \
0  2021-09-11T00:00:00.000       2:39    WHITESTONE EXPRESSWAY   
1  2022-03-26T00:00:00.000      11:45  QUEENSBORO BRIDGE UPPER   
2  2022-06-29T00:00:00.000       6:55       THROGS NECK BRIDGE   
3  2021-09-11T00:00:00.000       9:35                      NaN   
4  2021-12-14T00:00:00.000       8:13          SARATOGA AVENUE   

  off_street_name number_of_persons_injured number_of_persons_killed  \
0       20 AVENUE                         2                        0   
1             NaN                         1                        0   
2             NaN                         0                        0   
3             NaN                         0                        0   
4  DECATUR STREET                         0                        0   

  number_of_pedestrians_injured number_of_pedestrians_killed  \
0                             0                            0   
1                             0           

In [40]:
# Clean the raw CSV at '../data/API_data.csv' (dates, times, boroughs,
# duplicates, sparse columns, incomplete rows) and write the result to
# '../data/API_data_Cleaned.csv'.

# Get the absolute path of the file
file_path = os.path.abspath('../data/API_data.csv')

# Load the CSV file using pandas
data = pd.read_csv(file_path)

# Set pandas to display all columns
pd.set_option('display.max_columns', None)

# Convert `crash_date` to datetime; invalid values become NaT (Not a Time)
data['crash_date'] = pd.to_datetime(data['crash_date'], errors='coerce')

# Parse `crash_time` as HH:MM. Parsing a bare time yields timestamps dated
# 1900-01-01, so keep only the time-of-day; otherwise that placeholder date
# ends up in the cleaned CSV. Unparseable values stay null and are removed by
# the dropna() further down.
data['crash_time'] = pd.to_datetime(
    data['crash_time'], format='%H:%M', errors='coerce'
).dt.time

# Fix inconsistent values (strip whitespace and normalise capitalization in the `borough` column)
data['borough'] = data['borough'].str.strip().str.title()

# Filter data to keep only rows with valid crash dates and from the year 2021 or later
data = data[data['crash_date'].notna() & (data['crash_date'].dt.year >= 2021)]

# Remove duplicates based on the `collision_id` column (assuming it's unique for each accident)
data = data.drop_duplicates(subset='collision_id')

# Convert `crash_date` to just the date (drop the time part)
data['crash_date'] = data['crash_date'].dt.date

# Drop unnecessary columns. errors='ignore' keeps the cell working when the
# raw file does not contain one of these columns.
data = data.drop(columns=['vehicle_type_code_5', 'contributing_factor_vehicle_5',
                          'vehicle_type_code_4', 'contributing_factor_vehicle_4',
                          'vehicle_type_code_3', 'contributing_factor_vehicle_3',
                          'cross_street_name'], errors='ignore')

print("FILTERED AND CLEANED DATA: \n")

# Drop rows with any missing values
data = data.dropna()

# Add a `city` column with the value "New York"
data['city'] = "New York"

# Print summary information about null values and duplicates
print(f"The total of Null data is: \n{data.isnull().sum()}\n")
print(f"The total of duplicated data is: {data.duplicated().sum()}\n")
print(f"Data: {data.shape[0]} rows\n")

# Save the cleaned data to a new CSV file
data.to_csv('../data/API_data_Cleaned.csv', index=False, encoding='utf-8')
print("File Cleaned Successfully")

FILTERED AND CLEANED DATA: 

The total of Null data is: 
crash_date                       0
crash_time                       0
on_street_name                   0
off_street_name                  0
number_of_persons_injured        0
number_of_persons_killed         0
number_of_pedestrians_injured    0
number_of_pedestrians_killed     0
number_of_cyclist_injured        0
number_of_cyclist_killed         0
number_of_motorist_injured       0
number_of_motorist_killed        0
contributing_factor_vehicle_1    0
contributing_factor_vehicle_2    0
collision_id                     0
vehicle_type_code1               0
vehicle_type_code2               0
borough                          0
zip_code                         0
latitude                         0
longitude                        0
location                         0
city                             0
dtype: int64

The total of duplicated data is: 0

Data: 46436 rows

File Cleaned Successfully
