In [1]:
import pandas as pd
import glob
from pathlib import Path

In [2]:
# Create a list of CSV file paths to combine
file_paths = glob.glob("Files/*.csv")

In [3]:
# Initialize an empty list to store DataFrames
dataframes = []

In [4]:
# Read each CSV file into a DataFrame and append it to the list
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)
        dataframes.append(df)
    except pd.errors.EmptyDataError:
        print(f"Warning: Empty file found at {file_path}")
    except pd.errors.ParserError as e:
        print(f"Error reading file {file_path}: {str(e)}")

In [5]:
# Check if there are valid DataFrames to concatenate
if dataframes:
    # Concatenate all DataFrames into one
    all_combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Save the combined DataFrame to a new CSV file
    all_combined_df.to_csv('all_combined_data.csv', index=False)
else:
    print("No valid DataFrames to concatenate.")

In [6]:
# Concatenate all DataFrames into one
combined_new_files_df = pd.concat(dataframes, ignore_index=True)

In [7]:
combined_new_files_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,121DD7DD23CB1335,docked_bike,2021-02-03 23:11:28,2021-02-03 23:18:28,Hoboken Ave at Monmouth St,JC105,Christ Hospital,JC034,40.735208,-74.046964,40.734786,-74.050444,member
1,FD73FB85F008349D,docked_bike,2021-02-27 16:34:05,2021-02-27 16:56:40,Newport Pkwy,JC008,Marin Light Rail,JC013,40.728744,-74.032108,40.714584,-74.042817,member
2,39F9E6663CB5FDF6,docked_bike,2021-02-26 23:16:04,2021-02-26 23:22:25,Journal Square,JC103,Baldwin at Montgomery,JC020,40.733670,-74.062500,40.723659,-74.064194,member
3,A64745CB0792EC6F,docked_bike,2021-02-24 16:51:50,2021-02-24 17:16:09,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,casual
4,75CC76EB9543764A,docked_bike,2021-02-24 20:44:16,2021-02-24 20:44:46,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2078980,944F5CD711E7688E,classic_bike,2023-07-25 18:21:52,2023-07-25 18:34:38,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718489,-74.047727,40.728745,-74.032108,member
2078981,4665B542F79F8C43,electric_bike,2023-07-26 22:22:20,2023-07-26 22:26:20,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member
2078982,789D45FDEBC19E83,classic_bike,2023-07-31 07:57:29,2023-07-31 08:03:58,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718422,-74.047691,40.728745,-74.032108,member
2078983,FFBE463288D36C2A,classic_bike,2023-07-07 17:52:09,2023-07-07 18:00:14,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member


In [8]:
cleaned_combined_new = combined_new_files_df.drop(columns=['ride_id', 'rideable_type'])
cleaned_combined_new.head()

Unnamed: 0,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,2021-02-03 23:11:28,2021-02-03 23:18:28,Hoboken Ave at Monmouth St,JC105,Christ Hospital,JC034,40.735208,-74.046964,40.734786,-74.050444,member
1,2021-02-27 16:34:05,2021-02-27 16:56:40,Newport Pkwy,JC008,Marin Light Rail,JC013,40.728744,-74.032108,40.714584,-74.042817,member
2,2021-02-26 23:16:04,2021-02-26 23:22:25,Journal Square,JC103,Baldwin at Montgomery,JC020,40.73367,-74.0625,40.723659,-74.064194,member
3,2021-02-24 16:51:50,2021-02-24 17:16:09,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,casual
4,2021-02-24 20:44:16,2021-02-24 20:44:46,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,member


In [9]:
cleaned_new = cleaned_combined_new

In [10]:
cleaned_new['started_at'] = pd.to_datetime(cleaned_new['started_at'],
                                                format='%Y-%m-%d %H:%M:%S.%f')
cleaned_new['ended_at'] = pd.to_datetime(cleaned_new['ended_at'],
                                                format='%Y-%m-%d %H:%M:%S.%f')

In [21]:
cleaned_new

Unnamed: 0,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,2021-02-03 23:11:28,2021-02-03 23:18:28,Hoboken Ave at Monmouth St,JC105,Christ Hospital,JC034,40.735208,-74.046964,40.734786,-74.050444,member
1,2021-02-27 16:34:05,2021-02-27 16:56:40,Newport Pkwy,JC008,Marin Light Rail,JC013,40.728744,-74.032108,40.714584,-74.042817,member
2,2021-02-26 23:16:04,2021-02-26 23:22:25,Journal Square,JC103,Baldwin at Montgomery,JC020,40.733670,-74.062500,40.723659,-74.064194,member
3,2021-02-24 16:51:50,2021-02-24 17:16:09,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,casual
4,2021-02-24 20:44:16,2021-02-24 20:44:46,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,member
...,...,...,...,...,...,...,...,...,...,...,...
2078980,2023-07-25 18:21:52,2023-07-25 18:34:38,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718489,-74.047727,40.728745,-74.032108,member
2078981,2023-07-26 22:22:20,2023-07-26 22:26:20,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member
2078982,2023-07-31 07:57:29,2023-07-31 08:03:58,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718422,-74.047691,40.728745,-74.032108,member
2078983,2023-07-07 17:52:09,2023-07-07 18:00:14,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member


In [12]:
#create path to old csv file:
old_data= Path("Resources/merged_old.csv")
old_data_df=pd.read_csv(old_data)

In [20]:
old_data_df

Unnamed: 0,started_at,ended_at,start_station_name,start_lat,start_lng,end_station_name,end_lat,end_lng,member_casual
0,2020-07-01 00:23:19.822,2020-07-01 01:02:13.298,Jackson Square,40.711130,-74.078900,Marin Light Rail,40.714584,-74.042817,casual
1,2020-07-01 00:27:45.242,2020-07-01 00:47:49.196,Hoboken Ave at Monmouth St,40.735208,-74.046964,Newport PATH,40.727224,-74.033759,member
2,2020-07-01 00:34:17.868,2020-07-01 00:47:24.631,Columbus Dr at Exchange Pl,40.716870,-74.032810,Baldwin at Montgomery,40.723659,-74.064194,member
3,2020-07-01 00:51:07.531,2020-07-01 01:15:56.020,Glenwood Ave,40.727551,-74.071061,McGinley Square,40.725340,-74.067622,member
4,2020-07-01 00:51:16.388,2020-07-01 01:35:58.459,Union St,40.718211,-74.083639,Lincoln Park,40.724605,-74.078406,member
...,...,...,...,...,...,...,...,...,...
210454,2021-01-31 20:16:05.470,2021-01-31 20:57:22.011,Warren St,40.721124,-74.038051,Montgomery St,40.719420,-74.050990,member
210455,2021-01-31 21:05:05.209,2021-01-31 21:07:32.866,Grove St PATH,40.719586,-74.043117,Warren St,40.721124,-74.038051,member
210456,2021-01-31 21:06:23.588,2021-01-31 21:17:23.842,Newark Ave,40.721525,-74.046305,Marin Light Rail,40.714584,-74.042817,member
210457,2021-01-31 21:16:37.822,2021-01-31 21:23:02.124,JC Medical Center,40.716540,-74.049638,Lafayette Park,40.713464,-74.062859,member


In [13]:
# Merge the old and new dataframes 
all_merged = pd.concat([cleaned_new, old_data_df], ignore_index=True)

In [14]:
all_merged

Unnamed: 0,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,2021-02-03 23:11:28,2021-02-03 23:18:28,Hoboken Ave at Monmouth St,JC105,Christ Hospital,JC034,40.735208,-74.046964,40.734786,-74.050444,member
1,2021-02-27 16:34:05,2021-02-27 16:56:40,Newport Pkwy,JC008,Marin Light Rail,JC013,40.728744,-74.032108,40.714584,-74.042817,member
2,2021-02-26 23:16:04,2021-02-26 23:22:25,Journal Square,JC103,Baldwin at Montgomery,JC020,40.733670,-74.062500,40.723659,-74.064194,member
3,2021-02-24 16:51:50,2021-02-24 17:16:09,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,casual
4,2021-02-24 20:44:16,2021-02-24 20:44:46,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,member
...,...,...,...,...,...,...,...,...,...,...,...
2289439,2021-01-31 20:16:05.470,2021-01-31 20:57:22.011,Warren St,,Montgomery St,,40.721124,-74.038051,40.719420,-74.050990,member
2289440,2021-01-31 21:05:05.209,2021-01-31 21:07:32.866,Grove St PATH,,Warren St,,40.719586,-74.043117,40.721124,-74.038051,member
2289441,2021-01-31 21:06:23.588,2021-01-31 21:17:23.842,Newark Ave,,Marin Light Rail,,40.721525,-74.046305,40.714584,-74.042817,member
2289442,2021-01-31 21:16:37.822,2021-01-31 21:23:02.124,JC Medical Center,,Lafayette Park,,40.716540,-74.049638,40.713464,-74.062859,member


In [24]:
all_merged['trip_id'] = all_merged.index
all_merged

Unnamed: 0,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,trip_id
0,2021-02-03 23:11:28,2021-02-03 23:18:28,Hoboken Ave at Monmouth St,JC105,Christ Hospital,JC034,40.735208,-74.046964,40.734786,-74.050444,member,0
1,2021-02-27 16:34:05,2021-02-27 16:56:40,Newport Pkwy,JC008,Marin Light Rail,JC013,40.728744,-74.032108,40.714584,-74.042817,member,1
2,2021-02-26 23:16:04,2021-02-26 23:22:25,Journal Square,JC103,Baldwin at Montgomery,JC020,40.733670,-74.062500,40.723659,-74.064194,member,2
3,2021-02-24 16:51:50,2021-02-24 17:16:09,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,casual,3
4,2021-02-24 20:44:16,2021-02-24 20:44:46,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,member,4
...,...,...,...,...,...,...,...,...,...,...,...,...
2289439,2021-01-31 20:16:05.470,2021-01-31 20:57:22.011,Warren St,,Montgomery St,,40.721124,-74.038051,40.719420,-74.050990,member,2289439
2289440,2021-01-31 21:05:05.209,2021-01-31 21:07:32.866,Grove St PATH,,Warren St,,40.719586,-74.043117,40.721124,-74.038051,member,2289440
2289441,2021-01-31 21:06:23.588,2021-01-31 21:17:23.842,Newark Ave,,Marin Light Rail,,40.721525,-74.046305,40.714584,-74.042817,member,2289441
2289442,2021-01-31 21:16:37.822,2021-01-31 21:23:02.124,JC Medical Center,,Lafayette Park,,40.716540,-74.049638,40.713464,-74.062859,member,2289442


In [15]:
#drop null values and reset index
all_merged_cleaned = all_merged.dropna()
all_merged_cleaned = all_merged_cleaned.reset_index(drop=True)
all_merged_cleaned

Unnamed: 0,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,2021-02-03 23:11:28,2021-02-03 23:18:28,Hoboken Ave at Monmouth St,JC105,Christ Hospital,JC034,40.735208,-74.046964,40.734786,-74.050444,member
1,2021-02-27 16:34:05,2021-02-27 16:56:40,Newport Pkwy,JC008,Marin Light Rail,JC013,40.728744,-74.032108,40.714584,-74.042817,member
2,2021-02-26 23:16:04,2021-02-26 23:22:25,Journal Square,JC103,Baldwin at Montgomery,JC020,40.733670,-74.062500,40.723659,-74.064194,member
3,2021-02-24 16:51:50,2021-02-24 17:16:09,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,casual
4,2021-02-24 20:44:16,2021-02-24 20:44:46,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046963,40.735208,-74.046964,member
...,...,...,...,...,...,...,...,...,...,...,...
2070492,2023-07-25 18:21:52,2023-07-25 18:34:38,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718489,-74.047727,40.728745,-74.032108,member
2070493,2023-07-26 22:22:20,2023-07-26 22:26:20,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member
2070494,2023-07-31 07:57:29,2023-07-31 08:03:58,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718422,-74.047691,40.728745,-74.032108,member
2070495,2023-07-07 17:52:09,2023-07-07 18:00:14,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member


In [26]:
# Write the merged data to CSV
output_file_path= Path('Resources\merged_citi_files.csv') 
all_merged.to_csv(output_file_path, index=False)