In [1]:
import pandas as pd

In [59]:
# Load the cleaned API crash data and the cleaned US accidents data.
API_merge = pd.read_csv('../data/API_data_Cleaned.csv')
db_merge = pd.read_csv('../data/us_accidents_cleaned.csv')

# Reduce both timestamp columns to calendar dates so the two sources can be
# matched per day, regardless of the time of day stored in each record.
API_merge['crash_date'] = pd.to_datetime(API_merge['crash_date']).dt.date
db_merge['start_time'] = pd.to_datetime(db_merge['start_time']).dt.date

# Keep only the rows whose city is exactly 'New York' in each dataset.
api_data_ny = API_merge[API_merge['city'] == 'New York']
us_accidents_ny = db_merge[db_merge['city'] == 'New York']

# Inner-join the two New York subsets on the calendar date.
# Rows with a missing date are excluded from both inputs first: pandas matches
# null join keys against each other (unlike SQL), which would otherwise pair
# every undated API row with every undated accident row.
# NOTE(review): if a date occurs more than once on either side, this is a
# many-to-many join and each API row is paired with every accident row of the
# same day - confirm that this row multiplication is intended.
merged_df = pd.merge(
    api_data_ny.dropna(subset=['crash_date']),
    us_accidents_ny.dropna(subset=['start_time']),
    left_on='crash_date',
    right_on='start_time',
    how='inner',
)

# Both inputs carry a 'city' column, so the merge emits 'city_x' and 'city_y'.
# After the filter above they hold the same constant, so drop both ...
merged_df = merged_df.drop(columns=['city_x', 'city_y'])

# ... and replace them with a single 'city' column.
merged_df['city'] = "New York"

# Reorder the columns so that 'city' comes first.
cols = ['city'] + [col for col in merged_df.columns if col != 'city']
merged_df = merged_df[cols]


In [None]:
# Report how many rows the merged frame contains.
merged_count = len(merged_df)
print(f"Number of rows after the merge: {merged_count}")

# Report the number of missing values in each column of the merged frame.
null_counts = merged_df.isna().sum()
print(f"Null values: \n{null_counts}\n")

# Persist the merged frame as UTF-8 CSV, leaving out the index column.
merged_df.to_csv(
    '../data/merged_data.csv',
    index=False,
    encoding='utf-8',
)
