In [69]:
import pandas as pd
import numpy as np
# Directory handed to load_data(), which globs it for the input *.csv files.
path = 'data/'
import os
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
# Prefix (directory incl. trailing slash) prepended to figure file names in
# analyse_data_statistics() when the plots are saved.
viz_path = 'viz/'

In [71]:
def load_data(path):
    """Load every CSV file directly inside ``path`` into one sorted DataFrame.

    Parameters
    ----------
    path : str
        Directory searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh integer index and then sorted by
        the ``started_at`` column. The index labels assigned by the
        concatenation are kept, so they are not monotonic after sorting.

    Raises
    ------
    ValueError
        If ``path`` contains no CSV files.
    """
    # glob() yields files in arbitrary, filesystem-dependent order; sorting the
    # names makes the concatenation order reproducible across machines.
    all_files = sorted(glob(os.path.join(path, "*.csv")))
    if not all_files:
        # pd.concat would also raise ValueError here, but with an opaque
        # "No objects to concatenate" message.
        raise ValueError(f"No CSV files found in {path!r}")

    frames = [pd.read_csv(csv_file) for csv_file in all_files]
    # A stable sort keeps rows with identical timestamps in file order, so the
    # result is deterministic.
    return pd.concat(frames, ignore_index=True).sort_values(by='started_at', kind='stable')

def preprocess(df):
    """Parse the ride timestamps and add calendar columns for the start time.

    ``started_at`` and ``ended_at`` are converted to datetimes (ISO 8601
    parsing), then ``year_started``, ``month_started``, ``day_started``,
    ``weekday_started``, ``hour_started``, ``minute_started`` and
    ``second_started`` are derived from ``started_at``.

    The frame is modified in place and the same object is returned.
    """
    for stamp_col in ('started_at', 'ended_at'):
        df[stamp_col] = pd.to_datetime(df[stamp_col], format='ISO8601')

    start_parts = df['started_at'].dt
    # Each name is both the ``.dt`` attribute to read and the column prefix.
    for part in ('year', 'month', 'day', 'weekday', 'hour', 'minute', 'second'):
        df[f'{part}_started'] = getattr(start_parts, part)

    return df


def filter_top_station(df, number = 10):
    """Keep only the rides that start at the ``number`` most frequent start stations.

    Parameters
    ----------
    df : pandas.DataFrame
        Rides with a ``start_station_name`` column.
    number : int, default 10
        How many of the most frequent start stations to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels, as an independent
        copy of ``df``.
    """
    top_stations = df['start_station_name'].value_counts().head(number).index
    # Return an explicit copy: later steps add columns to the result, which on
    # a boolean-mask slice triggers pandas' SettingWithCopyWarning.
    return df[df['start_station_name'].isin(top_stations)].copy()

def analyse_data_statistics(df):
    """Aggregate ride statistics from a preprocessed frame and save them as bar charts.

    ``df`` must provide ``month_started``, ``weekday_started``, ``hour_started``,
    ``started_at``, ``ended_at`` and ``start_station_name``.

    Side effects
    ------------
    * ``df`` is modified in place: ``started_at`` / ``ended_at`` are passed
      through ``pd.to_datetime`` and a ``duration`` column (seconds) is added.
    * Seven PNG files, each named after its chart title, are written with the
      ``viz_path`` prefix; the target directory is created when missing.
    * The figures are left open so a notebook still renders them inline.

    Returns
    -------
    None
    """
    def popular_start_station(df):
        """Ride count per (start month, start station) as a long table."""
        return df.groupby('month_started')['start_station_name'].value_counts().reset_index(name='count')

    def popular_end_station(df):
        """Ride count per (start month, end station); currently not plotted."""
        return df.groupby('month_started')['end_station_name'].value_counts().reset_index(name='count')

    def duration(df):
        """Add ``duration`` (seconds) to ``df`` and total it per month and per weekday."""
        df['started_at'] = pd.to_datetime(df['started_at'])
        df['ended_at'] = pd.to_datetime(df['ended_at'])
        df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds()

        month_duration = df.groupby('month_started')['duration'].sum().reset_index(name='sum_duration')
        day_duration = df.groupby('weekday_started')['duration'].sum().reset_index(name='sum_duration').sort_values(by='weekday_started')

        return month_duration, day_duration

    def seasonality_count(df):
        """Ride counts per month, per weekday, per hour and per (weekday, hour)."""
        month_count = df.groupby('month_started').size().reset_index(name='count')
        day_count = df.groupby('weekday_started').size().reset_index(name='count').sort_values(by='weekday_started')
        hour_count = df.groupby('hour_started').size().reset_index(name='count')
        day_hour_count = df.groupby(['weekday_started', 'hour_started']).size().reset_index(name='count')
        return month_count, day_count, hour_count, day_hour_count

    def save_barplot(data, x, y, title, hue=None):
        """Draw one bar chart and save it as ``<viz_path><title>.png``."""
        out_path = f'{viz_path}{title}.png'
        out_dir = os.path.dirname(out_path)
        if out_dir:
            # savefig does not create missing directories.
            os.makedirs(out_dir, exist_ok=True)

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=x, y=y, hue=hue, data=data, ax=ax)
        ax.set_title(title)
        ax.tick_params(axis='x', labelrotation=90)
        sns.despine(ax=ax)
        # tight_layout must run after drawing; on an empty figure it has no
        # effect and the rotated tick labels get clipped in the saved file.
        fig.tight_layout()
        fig.savefig(out_path, dpi=300, transparent=True)

    def plotting(df):
        """Compute each table once and render all charts."""
        sns.set(style="white")

        # Most popular start station
        save_barplot(popular_start_station(df), 'start_station_name', 'count',
                     'Most popular start station', hue='month_started')

        # Duration
        month_duration, day_duration = duration(df)
        save_barplot(month_duration, 'month_started', 'sum_duration',
                     'Duration of rides per month')
        save_barplot(day_duration, 'weekday_started', 'sum_duration',
                     'Duration of rides per weekday')

        # Seasonality
        month_count, day_count, hour_count, day_hour_count = seasonality_count(df)
        save_barplot(month_count, 'month_started', 'count',
                     'Seasonality of rides per month')
        save_barplot(day_count, 'weekday_started', 'count',
                     'Seasonality of rides per weekday')
        save_barplot(hour_count, 'hour_started', 'count',
                     'Seasonality of rides per hour')
        save_barplot(day_hour_count, 'hour_started', 'count',
                     'Seasonality of rides per hour and weekday', hue='weekday_started')

    plotting(df)
    

def main():
    """Run the pipeline end to end: load the CSVs under ``path``, preprocess, plot."""
    rides = preprocess(load_data(path))
    analyse_data_statistics(rides)
    
    
    


In [78]:
# Load all rides, derive the calendar columns, then keep only the rides from
# the ten most frequent start stations (filter_top_station's default).
df = filter_top_station(preprocess(load_data(path)))


In [56]:
# NOTE(review): analyse_data_statistics has no return statement, so
# duration_df is bound to None; the call matters only for the figures it saves.
duration_df = analyse_data_statistics(df)

In [79]:
# Inspect the rides whose start month is 5.
df.loc[df['month_started'].eq(5)]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,year_started,month_started,day_started,weekday_started,hour_started,minute_started,second_started
1116805,8DDD0552D490C5E3,classic_bike,2022-05-01 00:04:41.000,2022-05-01 00:08:06.000,Grove St PATH,JC005,JC Medical Center,JC011,40.719586,-74.043117,40.716540,-74.049638,member,2022,5,1,6,0,4,41
1097385,9087BA335BADAC37,classic_bike,2022-05-01 00:06:31.000,2022-05-01 00:13:00.000,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Columbus Park - Clinton St & 9 St,HB501,40.735938,-74.030305,40.748161,-74.032453,member,2022,5,1,6,0,6,31
1068326,08B5EC5023920804,classic_bike,2022-05-01 00:06:33.000,2022-05-01 00:09:55.000,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Madison St & 1 St,HB402,40.735938,-74.030305,40.738790,-74.039300,member,2022,5,1,6,0,6,33
1087220,D2EEE2EF94703015,electric_bike,2022-05-01 00:07:06.000,2022-05-01 00:12:53.000,Hoboken Terminal - Hudson St & Hudson Pl,HB101,11 St & Washington St,HB502,40.735938,-74.030305,40.749985,-74.027150,member,2022,5,1,6,0,7,6
1095479,2C4761AC7C78D95F,electric_bike,2022-05-01 00:07:24.000,2022-05-01 00:14:29.000,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Southwest Park - Jackson St & Observer Hwy,HB401,40.735938,-74.030305,40.737551,-74.041664,casual,2022,5,1,6,0,7,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423723,DE3E512C705435C3,classic_bike,2024-05-31 23:53:09.852,2024-06-01 00:04:19.105,Hoboken Terminal - River St & Hudson Pl,HB102,14 St Ferry - 14 St & Shipyard Ln,HB202,40.736068,-74.029127,40.752961,-74.024353,casual,2024,5,31,4,23,53,9
335408,269CAE1A61663378,classic_bike,2024-05-31 23:55:31.802,2024-06-01 00:00:50.426,Hoboken Terminal - River St & Hudson Pl,HB102,Madison St & 1 St,HB402,40.736068,-74.029127,40.738790,-74.039300,member,2024,5,31,4,23,55,31
428618,40AAF58F03CBD1F0,classic_bike,2024-05-31 23:57:20.995,2024-06-01 00:06:48.031,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Pershing Field,JC024,40.735938,-74.030305,40.742677,-74.051789,member,2024,5,31,4,23,57,20
404012,E7535E475E9EC93E,classic_bike,2024-05-31 23:58:12.780,2024-06-01 00:04:56.895,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,12 St & Sinatra Dr N,HB201,40.736982,-74.027781,40.750604,-74.024020,casual,2024,5,31,4,23,58,12


In [32]:
# Reload the full data set and count rides per (month, start station).
# The reload goes through preprocess() so that `df` keeps the *_started
# columns: the later cells group on 'month_started' and would raise KeyError
# on a top-to-bottom run if `df` were rebound to the raw frame.
df = preprocess(load_data(path))
df['month'] = df['started_at'].dt.month
grouped_df = df.groupby('month')['start_station_name'].value_counts().reset_index(name='count')

In [77]:
# Show the current contents of df.
df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,year_started,month_started,day_started,weekday_started,hour_started,minute_started,second_started
295511,E26BB19DBC834216,classic_bike,2022-01-01 00:10:30.000,2022-01-01 00:24:30.000,Grove St PATH,JC005,Newport Pkwy,JC008,40.719586,-74.043117,40.728745,-74.032108,member,2022,1,1,5,0,10,30
318313,95D98C7AE35192B6,classic_bike,2022-01-01 00:13:31.000,2022-01-01 00:18:56.000,Newport PATH,JC066,Hamilton Park,JC009,40.727224,-74.033759,40.727596,-74.044247,member,2022,1,1,5,0,13,31
318589,B42E6C17AF007611,classic_bike,2022-01-01 00:19:27.000,2022-01-01 00:25:03.000,Hoboken Terminal - River St & Hudson Pl,HB102,Adams St & 2 St,HB407,40.736068,-74.029127,40.739814,-74.036904,member,2022,1,1,5,0,19,27
312278,1D1BEA82F36906ED,electric_bike,2022-01-01 00:21:06.000,2022-01-01 00:57:48.000,City Hall - Washington St & 1 St,HB105,Adams St & 2 St,HB407,40.737360,-74.030970,40.739814,-74.036904,casual,2022,1,1,5,0,21,6
305682,8AFC43B9B89D8FA7,electric_bike,2022-01-01 00:29:37.000,2022-01-01 15:12:36.000,City Hall - Washington St & 1 St,HB105,Newark Ave,JC032,40.737360,-74.030970,40.721525,-74.046305,casual,2022,1,1,5,0,29,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2204561,6B72D199FC6E7531,classic_bike,2024-09-30 23:21:30.195,2024-09-30 23:42:05.631,Hamilton Park,JC009,14 St Ferry - 14 St & Shipyard Ln,HB202,40.727596,-74.044247,40.752961,-74.024353,member,2024,9,30,0,23,21,30
2156270,AD2D5926699F812C,classic_bike,2024-09-30 23:23:07.279,2024-09-30 23:31:18.570,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Southwest Park - Jackson St & Observer Hwy,HB401,40.735938,-74.030305,40.737551,-74.041664,casual,2024,9,30,0,23,23,7
2183275,71E7EC271CC0711F,classic_bike,2024-09-30 23:23:49.085,2024-09-30 23:29:09.650,Hoboken Terminal - Hudson St & Hudson Pl,HB101,7 St & Monroe St,HB304,40.735938,-74.030305,40.746413,-74.037977,member,2024,9,30,0,23,23,49
2161339,93DF3C30DDCF5F8F,classic_bike,2024-09-30 23:24:21.718,2024-09-30 23:31:36.647,City Hall - Washington St & 1 St,HB105,6 St & Grand St,HB302,40.737360,-74.030970,40.744398,-74.034501,member,2024,9,30,0,23,24,21


In [44]:
# Names of the ten most frequent start stations, most frequent first.
df['start_station_name'].value_counts().index[:10].tolist()

['Hoboken Terminal - River St & Hudson Pl',
 'Grove St PATH',
 'City Hall - Washington St & 1 St',
 'Hoboken Terminal - Hudson St & Hudson Pl',
 'Newport PATH',
 'South Waterfront Walkway - Sinatra Dr & 1 St',
 'Exchange Pl',
 'Hamilton Park',
 'Bergen Ave & Sip Ave',
 'Newport Pkwy']

In [61]:
# Non-null count of every column per start month
# (month_count is reassigned by the following cell).
month_count = (
    df.groupby('month_started')
    .count()
    .reset_index()
)

In [67]:
# Number of rides per start month as a two-column table (month_started, count).
month_count = (
    df.groupby('month_started')
    .size()
    .rename('count')
    .reset_index()
)

In [68]:
# Show the rides-per-month table.
# NOTE(review): the saved output lists only 8 rides for month 5 while the other
# months have 14k-31k; looks like a data gap or a stale run, verify.
month_count

Unnamed: 0,month_started,count
0,1,14060
1,2,15386
2,3,18095
3,4,21767
4,5,8
5,6,30342
6,7,30299
7,8,29032
8,9,30910
