In [14]:
import pandas as pd

In [31]:
# Load the CSV files
API_merge = pd.read_csv('data/API_data_Cleaned.csv')
db_merge = pd.read_csv('data/us_accidents_cleaned.csv')

# Convert date columns to datetime format and use only the date
API_merge['crash_date'] = pd.to_datetime(API_merge['crash_date']).dt.date  # Use only the date
db_merge['start_time'] = pd.to_datetime(db_merge['start_time']).dt.date  # Use only the date

# Filter both datasets for rows where the city is 'New York'
api_data_ny = API_merge[API_merge['city'] == 'New York']
us_accidents_ny = db_merge[db_merge['city'] == 'New York']

# Merge the two datasets based on the date (inner join)
merged_df = pd.merge(api_data_ny, us_accidents_ny, left_on='crash_date', right_on='start_time', how='inner')

# Drop duplicate city columns ('city_x' and 'city_y')
merged_df = merged_df.drop(columns=['city_x', 'city_y'])

# 1. Asegúrate de que la columna de fecha esté en formato datetime
merged_df['crash_date'] = pd.to_datetime(merged_df['crash_date'], errors='coerce')

# 2. Crear una nueva columna que contenga el mes y el año
# Aquí se formatea como "YYYY-MM"
merged_df['crash_date'] = merged_df['crash_date'].dt.to_period('M')

# Add a new column 'city' with the value "New York"
merged_df['city'] = "New York"

merged_df = merged_df.sort_values(by='crash_date', ascending=True)

# Move the 'city' column to the beginning of the DataFrame
cols = ['city'] + [col for col in merged_df.columns if col != 'city']
merged_df = merged_df[cols]


In [32]:
# Convertimos la columna a datetime si no lo está
merged_df['crash_time'] = pd.to_datetime(merged_df['crash_time'], errors='coerce')

# Extraer solo la hora
merged_df['crash_time'] = merged_df['crash_time'].dt.strftime('%H:%M')

In [36]:
# Convertir 'number_of_persons_injured' a tipo entero
merged_df['number_of_persons_injured'] = pd.to_numeric(merged_df['number_of_persons_injured'], errors='coerce').fillna(0).astype(int)

In [34]:
# 3. Borrar 'Colishion_id'
merged_df.drop(columns=['collision_id'], inplace=True)

# 4. Borrar 'Factor contribuyente 2'
merged_df.drop(columns=['contributing_factor_vehicle_2'], inplace=True)

# 5. Borrar 'vehicle_type_code2'
merged_df.drop(columns=['vehicle_type_code2'], inplace=True)

# 6. Mezclar 'codigo postal' con 'distrito'
# Supongamos que 'codigo_postal' y 'distrito' son las columnas en merged_clean
merged_df['borough'] = merged_df['borough'] + ' - ' + merged_df['zip_code'].astype(str)

# 7. Borrar latitud y longitud
merged_df.drop(columns=['latitude', 'longitude'], inplace=True)

# 8. Borrar 'start time' y 'end time'
merged_df.drop(columns=['start_time', 'end_time'], inplace=True)

# 9. Borrar 'start latitud' y 'end latitud'
merged_df.drop(columns=['start_lat', 'start_lng'], inplace=True)

# 10. Borrar 'distancia en millas'
merged_df.drop(columns=['distance_mi'], inplace=True)

# 11. Borrar 'county'
merged_df.drop(columns=['county'], inplace=True)

# 12. Mezclar 'state' con 'city'
merged_df['city'] = merged_df['city'] + ', ' + merged_df['state']

# 13. Borrar 'zipcode'
merged_df.drop(columns=['zipcode'], inplace=True)

# 14. Borrar columnas innecesarias
columns_to_drop = [
    'airport_code', 'amenity', 'bump', 'crossing', 'give_way', 'junction', 
    'no_exit', 'railway', 'roundabout', 'station', 'stop', 'traffic_calming', 
    'traffic_signal', 'turning_loop'
]

merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [38]:
# 13. Borrar
merged_df.drop(columns=['zip_code','state','weather_timestamp'], inplace=True)


In [40]:
merged_df['id'] = range(1, len(merged_df) + 1)

# Mover la columna 'id' al principio
cols = ['id'] + [col for col in merged_df.columns if col != 'id']
merged_df = merged_df[cols]

In [41]:
# Opcional: Verificar las primeras filas del DataFrame limpio
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
merged_df.head(4)

Unnamed: 0,id,city,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,vehicle_type_code1,borough,location,severity,street,timezone,temperature_f,wind_chill_f,humidity_percent,pressure_in,visibility_mi,wind_direction,wind_speed_mph,precipitation_in,weather_condition,sunrise_sunset
48000,1,"New York, NY",2021-01,19:10,5 AVENUE,EAST 101 STREET,0,0,0,0,0,0,0,0,Backing Unsafely,Station Wagon/Sport Utility Vehicle,Manhattan - 10029.0,"{'latitude': '40.791084', 'longitude': '-73.95...",2,Broome St,US/Eastern,32.0,24.0,29.0,30.14,10.0,WNW,9.0,0.0,Fair,Day
18196,2,"New York, NY",2021-01,22:55,4 AVENUE,54 STREET,0,0,0,0,0,0,0,0,Driver Inattention/Distraction,Sedan,Brooklyn - 11220.0,"{'latitude': '40.64423', 'longitude': '-74.014...",2,Henry Hudson Pkwy N,US/Eastern,38.0,34.0,76.0,29.84,10.0,NNE,5.0,0.0,Cloudy,Night
18195,3,"New York, NY",2021-01,19:43,EAST 46 STREET,FOSTER AVENUE,1,0,0,0,0,0,1,0,Driver Inexperience,Station Wagon/Sport Utility Vehicle,Brooklyn - 11203.0,"{'latitude': '40.639553', 'longitude': '-73.93...",3,Amsterdam Ave,US/Eastern,36.0,32.0,82.0,29.9,10.0,N,5.0,0.0,Cloudy,Day
18194,4,"New York, NY",2021-01,19:43,EAST 46 STREET,FOSTER AVENUE,1,0,0,0,0,0,1,0,Driver Inexperience,Station Wagon/Sport Utility Vehicle,Brooklyn - 11203.0,"{'latitude': '40.639553', 'longitude': '-73.93...",2,Henry Hudson Pkwy N,US/Eastern,38.0,34.0,76.0,29.84,10.0,NNE,5.0,0.0,Cloudy,Night


In [42]:
# Check the number of rows after the merge
merged_count = merged_df.shape[0]
print(f"Number of rows after the merge: {merged_count}")

# Check for null values in the merged DataFrame
print(f"Null values: \n{merged_df.isnull().sum()}\n")

# Save the merged result to a CSV file
merged_df.to_csv('../data/merged_data.csv', index=False, encoding='utf-8')


Number of rows after the merge: 48001
Null values: 
id                               0
city                             0
crash_date                       0
crash_time                       0
on_street_name                   0
off_street_name                  0
number_of_persons_injured        0
number_of_persons_killed         0
number_of_pedestrians_injured    0
number_of_pedestrians_killed     0
number_of_cyclist_injured        0
number_of_cyclist_killed         0
number_of_motorist_injured       0
number_of_motorist_killed        0
contributing_factor_vehicle_1    0
vehicle_type_code1               0
borough                          0
location                         0
severity                         0
street                           0
timezone                         0
temperature_f                    0
wind_chill_f                     0
humidity_percent                 0
pressure_in                      0
visibility_mi                    0
wind_direction                   0
win

# DATA ANALYSIS

In [49]:
# 2. Distribución por hora del día
distribucion_horas = merged_df.groupby('crash_time').size().reset_index(name='count')
print("\n2. Distribución por hora del día:")
print(distribucion_horas.sort_values(by='count', ascending=False))


2. Distribución por hora del día:
     crash_time  count
0         00:00    693
829       14:00    462
889       15:00    445
1009      17:00    418
769       13:00    400
...         ...    ...
423       07:13      1
298       05:02      1
363       06:12      1
221       03:42      1
1273      21:24      1

[1429 rows x 2 columns]


In [50]:
# 2. Distribución por tipo de vehículo involucrado
vehicle_distribution = merged_df['vehicle_type_code1'].value_counts()
print("\nDistribución por tipo de vehículo:")
print(vehicle_distribution)


Distribución por tipo de vehículo:
vehicle_type_code1
Sedan                                  22789
Station Wagon/Sport Utility Vehicle    16465
Taxi                                    1414
Bus                                     1122
Pick-up Truck                           1022
                                       ...  
TRACTOR                                    1
MTA bus                                    1
Vanette                                    1
REFG                                       1
Van Camper                                 1
Name: count, Length: 151, dtype: int64


In [53]:
# 3. Relación entre condiciones climáticas y gravedad del accidente
climate_vs_injuries = merged_df.groupby('weather_condition')['number_of_persons_injured'].sum()
print("\nRelación entre clima y número de personas heridas:")
print(climate_vs_injuries.sort_values(ascending=False))


Relación entre clima y número de personas heridas:
weather_condition
Fair             18005
Cloudy            6287
Light Rain        1848
Mostly Cloudy     1775
Partly Cloudy     1332
Heavy Rain         298
Rain               261
Fog                175
Light Snow         128
Haze                64
Snow                15
Name: number_of_persons_injured, dtype: int32


In [54]:
# 4. Comparación entre accidentes de día y de noche
accidents_day_night = merged_df['sunrise_sunset'].value_counts()
print("\nComparación de accidentes entre día y noche:")
print(accidents_day_night)


Comparación de accidentes entre día y noche:
sunrise_sunset
Day      32483
Night    15518
Name: count, dtype: int64


In [55]:
# 5. Factores contribuyentes más comunes en accidentes graves (con heridos)
factors_in_grave_accidents = merged_df[merged_df['number_of_persons_injured'] > 0]['contributing_factor_vehicle_1'].value_counts()
print("\nFactores contribuyentes más comunes en accidentes graves:")
print(factors_in_grave_accidents)


Factores contribuyentes más comunes en accidentes graves:
contributing_factor_vehicle_1
Driver Inattention/Distraction                           5940
Unspecified                                              3639
Failure to Yield Right-of-Way                            2880
Traffic Control Disregarded                              1757
Following Too Closely                                    1264
Unsafe Speed                                              837
Passing or Lane Usage Improper                            774
Turning Improperly                                        634
Other Vehicular                                           502
Pedestrian/Bicyclist/Other Pedestrian Error/Confusion     434
Driver Inexperience                                       367
Unsafe Lane Changing                                      338
Alcohol Involvement                                       321
View Obstructed/Limited                                   311
Passing Too Closely                        

In [56]:
# 6. Accidentes por ubicación geográfica (borough)
accidents_by_borough = merged_df['borough'].value_counts()
print("\nAccidentes por barrio (borough):")
print(accidents_by_borough)


Accidentes por barrio (borough):
borough
Brooklyn - 11207.0     1442
Brooklyn - 11236.0      936
Brooklyn - 11234.0      839
Queens - 11434.0        782
Brooklyn - 11208.0      780
                       ... 
Manhattan - 10168.0       5
Manhattan - 10115.0       4
Manhattan - 10069.0       4
Queens - 11109.0          2
Manhattan - 10169.0       1
Name: count, Length: 186, dtype: int64


In [58]:
# 7. Correlación entre número de vehículos involucrados y número de heridos
vehicles_vs_injuries = merged_df.groupby('vehicle_type_code1')['number_of_persons_injured'].sum()
print("\nCorrelación entre tipo de vehículo y número de personas heridas:")
print(vehicles_vs_injuries.sort_values(ascending=False))


Correlación entre tipo de vehículo y número de personas heridas:
vehicle_type_code1
Sedan                                  14918
Station Wagon/Sport Utility Vehicle    10434
Taxi                                     989
Bike                                     631
Pick-up Truck                            544
                                       ...  
MINI BUS                                   0
Lunch Wagon                                0
LOCOMOTIVE                                 0
Garbage Tr                                 0
van                                        0
Name: number_of_persons_injured, Length: 151, dtype: int32


In [60]:
# 9. Accidentes por dirección del viento
accidents_wind_direction = merged_df['wind_speed_mph'].value_counts()
print("\nAccidentes por velocidad del viento:")
print(accidents_wind_direction)


Accidentes por velocidad del viento:
wind_speed_mph
0.000000     9210
3.000000     8691
5.000000     7890
6.000000     7278
7.000000     3840
7.681347     2529
8.000000     2501
9.000000     2075
10.000000    1358
13.000000     709
12.000000     687
15.000000     430
18.000000     308
16.000000     307
14.000000     188
Name: count, dtype: int64


In [62]:
# 10. Impacto de la precipitación en accidentes
precipitation_impact = merged_df.groupby('precipitation_in')['number_of_persons_injured'].sum()
print("\nImpacto de la precipitación en accidentes:")
print(precipitation_impact.sort_values(ascending=False))


Impacto de la precipitación en accidentes:
precipitation_in
0.00    27827
0.01      935
0.02      360
0.18      162
0.21      118
0.04      104
0.05      104
0.96      100
0.16       64
0.10       60
0.11       55
0.63       52
0.07       47
0.30       47
0.13       44
0.17       42
0.24       39
0.03       28
Name: number_of_persons_injured, dtype: int32


In [66]:
# 11. Comparación entre personas,peatones, ciclistas y motoristas heridos
injury_comparison = {
    'Personas Heridas': merged_df['number_of_persons_injured'].sum(),
    'Peatones Heridos': merged_df['number_of_pedestrians_injured'].sum(),
    'Ciclistas Heridos': merged_df['number_of_cyclist_injured'].sum(),
    'Motoristas Heridos': merged_df['number_of_motorist_injured'].sum(),
}
print("\nComparación entre peatones, ciclistas y motoristas heridos:")
print(injury_comparison)


Comparación entre peatones, ciclistas y motoristas heridos:
{'Personas Heridas': 30188, 'Peatones Heridos': 220, 'Ciclistas Heridos': 4189, 'Motoristas Heridos': 23810}


In [67]:
# 12. Factores que afectan la seguridad de ciclistas
factors_affecting_cyclists = merged_df[merged_df['number_of_cyclist_injured'] > 0]['contributing_factor_vehicle_1'].value_counts()
print("\nFactores que afectan la seguridad de ciclistas:")
print(factors_affecting_cyclists)


Factores que afectan la seguridad de ciclistas:
contributing_factor_vehicle_1
Driver Inattention/Distraction                           1330
Unspecified                                               601
Failure to Yield Right-of-Way                             586
Pedestrian/Bicyclist/Other Pedestrian Error/Confusion     352
Traffic Control Disregarded                               289
Passing or Lane Usage Improper                            182
Turning Improperly                                        125
Following Too Closely                                     101
View Obstructed/Limited                                    90
Unsafe Speed                                               90
Other Vehicular                                            61
Driver Inexperience                                        53
Passing Too Closely                                        45
Passenger Distraction                                      38
Unsafe Lane Changing                                 

In [70]:
# 13. Correlación entre temperatura y accidentes
temperature_vs_accidents = merged_df.groupby('temperature_f')['id'].count()
print("\nCorrelación entre temperatura y número de accidentes:")
print(temperature_vs_accidents.sort_values(ascending=False))


Correlación entre temperatura y número de accidentes:
temperature_f
74.000000    2347
80.000000    1604
77.000000    1572
73.000000    1464
66.000000    1301
75.000000    1229
71.000000    1184
54.000000    1178
70.000000    1130
65.000000    1114
83.000000    1063
78.000000    1034
51.000000     983
79.000000     973
81.000000     948
60.000000     944
84.000000     900
55.000000     893
62.000000     874
56.000000     862
86.000000     861
63.000000     856
76.000000     856
69.000000     839
67.000000     796
42.000000     795
49.000000     775
36.000000     765
52.000000     736
44.000000     710
50.000000     678
82.000000     671
72.000000     640
40.000000     639
61.000000     623
57.000000     617
88.000000     608
68.000000     591
43.000000     553
41.000000     539
46.000000     539
85.000000     537
59.000000     526
32.000000     507
89.000000     484
87.000000     469
53.000000     465
27.000000     460
91.000000     428
64.000000     395
38.000000     389
28.000000    

In [71]:
# 14. Número de accidentes por precipitación (lluvia/nieve)
accidents_by_precipitation = merged_df.groupby('precipitation_in')['id'].count()
print("\nNúmero de accidentes por precipitación:")
print(accidents_by_precipitation)


Número de accidentes por precipitación:
precipitation_in
0.00    44325
0.01     1464
0.02      546
0.03       72
0.04      209
0.05      188
0.07       88
0.10       87
0.11       82
0.13       75
0.16       82
0.17       64
0.18      255
0.21      128
0.24       60
0.30       81
0.63       75
0.96      120
Name: id, dtype: int64


In [72]:
# 15. Factores más comunes en accidentes sin heridos
factors_in_minor_accidents = merged_df[merged_df['number_of_persons_injured'] == 0]['contributing_factor_vehicle_1'].value_counts()
print("\nFactores más comunes en accidentes sin heridos:")
print(factors_in_minor_accidents)


Factores más comunes en accidentes sin heridos:
contributing_factor_vehicle_1
Driver Inattention/Distraction                           6889
Unspecified                                              4704
Failure to Yield Right-of-Way                            1914
Passing or Lane Usage Improper                           1720
Following Too Closely                                    1608
Passing Too Closely                                      1269
Traffic Control Disregarded                              1113
Turning Improperly                                       1072
Backing Unsafely                                          913
Unsafe Speed                                              799
Other Vehicular                                           712
Alcohol Involvement                                       603
Driver Inexperience                                       548
Unsafe Lane Changing                                      532
Reaction to Uninvolved Vehicle                       