In [37]:
import requests
import pandas as pd
import sys
import os

In [38]:
# Dataset URL (JSON endpoint).
url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"

# Query parameters: cap the response at 200,000 records.
params = {
    "$limit": 200000
}

# Send the GET request to the API.
# A (connect, read) timeout is set because requests waits forever by default,
# which would freeze the kernel if the server stalls mid-download.
response = requests.get(url, params=params, timeout=(10, 120))

# Only build and persist the DataFrame when the request succeeded.
if response.status_code == 200:
    data = response.json()  # Decode the JSON payload
    df = pd.DataFrame(data)  # Build a pandas DataFrame from the decoded payload
    print(df.head())  # Preview the first rows
    # Persist the raw download so the cleaning step can re-read it from disk.
    df.to_csv('../data/API_data.csv', index=False, encoding='utf-8')
else:
    print(f"Error en la solicitud: {response.status_code}")

                crash_date crash_time           on_street_name  \
0  2021-09-11T00:00:00.000       2:39    WHITESTONE EXPRESSWAY   
1  2022-03-26T00:00:00.000      11:45  QUEENSBORO BRIDGE UPPER   
2  2022-06-29T00:00:00.000       6:55       THROGS NECK BRIDGE   
3  2021-09-11T00:00:00.000       9:35                      NaN   
4  2021-12-14T00:00:00.000       8:13          SARATOGA AVENUE   

  off_street_name number_of_persons_injured number_of_persons_killed  \
0       20 AVENUE                         2                        0   
1             NaN                         1                        0   
2             NaN                         0                        0   
3             NaN                         0                        0   
4  DECATUR STREET                         0                        0   

  number_of_pedestrians_injured number_of_pedestrians_killed  \
0                             0                            0   
1                             0           

In [39]:
# Clean the raw download and write the result to ../data/API_data_Cleaned.csv.
# The path is relative to the notebook's working directory; abspath only
# normalises it.
file_path = os.path.abspath('../data/API_data.csv')

# Load the CSV file with pandas.
data = pd.read_csv(file_path)

pd.set_option('display.max_columns', None)

# Columns removed from the cleaned output.
columns_to_drop = [
    'vehicle_type_code_5', 'contributing_factor_vehicle_5',
    'vehicle_type_code_4', 'contributing_factor_vehicle_4',
    'vehicle_type_code_3', 'contributing_factor_vehicle_3',
    'cross_street_name',
]

# 1. Parse `crash_date` and `crash_time`; unparseable values become NaT.
parsed_date = pd.to_datetime(data['crash_date'], errors='coerce')
parsed_time = pd.to_datetime(data['crash_time'], format='%H:%M', errors='coerce')

data = data.assign(
    crash_date=parsed_date,
    # Keep only the time of day: parsing with '%H:%M' attaches a dummy
    # 1900-01-01 date that would otherwise end up in the cleaned CSV.
    crash_time=parsed_time.dt.time,
    # 2. Normalise `borough`: trim surrounding whitespace, use title case.
    borough=data['borough'].str.strip().str.title(),
)

# 3. Keep rows with a valid crash date in 2021 or later.
is_recent = data['crash_date'].notna() & (data['crash_date'].dt.year >= 2021)

data = (
    data[is_recent]
    # 4. Drop duplicates on `collision_id` (assumed unique per collision).
    .drop_duplicates(subset='collision_id')
    # 5. Reduce `crash_date` to a plain date (no time component).
    .assign(crash_date=lambda frame: frame['crash_date'].dt.date)
    # 6. Drop the unused columns; errors='ignore' tolerates a download in
    #    which some of these columns are missing.
    .drop(columns=columns_to_drop, errors='ignore')
)

print("FILTRATED AND SORTED DATA: \n")

# 7. Drop every row that still has a missing value in any remaining column.
data = data.dropna()

print(f"The total of Null data is: \n{data.isnull().sum()}\n")
print(f"The total of duplicated data is: {data.duplicated().sum()}\n")
print(f"Data: {data.shape[0]} \n")
data.to_csv('../data/API_data_Cleaned.csv', index=False, encoding='utf-8')
print("File Cleaned Correctly")

FILTRATED AND SORTED DATA: 

The total of Null data is: 
crash_date                       0
crash_time                       0
on_street_name                   0
off_street_name                  0
number_of_persons_injured        0
number_of_persons_killed         0
number_of_pedestrians_injured    0
number_of_pedestrians_killed     0
number_of_cyclist_injured        0
number_of_cyclist_killed         0
number_of_motorist_injured       0
number_of_motorist_killed        0
contributing_factor_vehicle_1    0
contributing_factor_vehicle_2    0
collision_id                     0
vehicle_type_code1               0
vehicle_type_code2               0
borough                          0
zip_code                         0
latitude                         0
longitude                        0
location                         0
dtype: int64

The total of duplicated data is: 0

Data: 46436 

File Cleaned Correctly
