In [18]:
import pandas as pd
import numpy as np
path = 'data/'
import os
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns

In [55]:
def load_data(path):
    """Load every ``*.csv`` file in ``path`` into one frame ordered by ``started_at``.

    Parameters
    ----------
    path : str
        Directory that contains the CSV files (only the top level is searched).

    Returns
    -------
    pandas.DataFrame
        All rows of all files, sorted by the ``started_at`` column. The index
        holds the positions the rows had after concatenation (it is not reset
        after sorting).

    Raises
    ------
    ValueError
        If ``path`` contains no CSV files.
    KeyError
        If the concatenated data has no ``started_at`` column.
    """
    # glob() returns files in filesystem order; sorting makes the result
    # reproducible across machines and runs.
    all_files = sorted(glob(os.path.join(path, "*.csv")))
    if not all_files:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No CSV files found in {path!r}")

    df_from_each_file = (pd.read_csv(f) for f in all_files)
    combined = pd.concat(df_from_each_file, ignore_index=True)
    # A stable sort keeps rows with equal timestamps in file order.
    return combined.sort_values(by='started_at', kind='stable')

def preprocess(df):
    """Parse the ride timestamps and derive the start-time components.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``started_at`` and ``ended_at`` columns holding ISO 8601
        timestamps.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``started_at`` and ``ended_at`` are datetimes and
        the columns ``month_started``, ``day_started``, ``hour_started``,
        ``minute_started`` and ``second_started`` have been appended.
        The input frame is left untouched, so re-running a cell that calls
        this function always starts from the same data.
    """
    started = pd.to_datetime(df['started_at'], format='ISO8601')
    ended = pd.to_datetime(df['ended_at'], format='ISO8601')

    # assign() returns a new frame: existing columns are replaced in place,
    # new ones are appended in keyword order.
    return df.assign(
        started_at=started,
        ended_at=ended,
        month_started=started.dt.month,
        day_started=started.dt.day,
        hour_started=started.dt.hour,
        minute_started=started.dt.minute,
        second_started=started.dt.second,
    )


def filter_top_station(df, number = 10):
    """Keep only the rides that start at the ``number`` busiest start stations.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``start_station_name`` column.
    number : int, default 10
        How many of the most frequent start stations to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels, as an independent
        copy. Returning a copy (rather than a boolean-mask slice) lets callers
        add columns to the result without ``SettingWithCopyWarning`` and
        without any ambiguity about whether ``df`` is affected.
    """
    top_stations = df['start_station_name'].value_counts().head(number).index
    return df[df['start_station_name'].isin(top_stations)].copy()

def analyse_data_statistics(df, plot=False):
    """Count ride durations per start month and optionally draw summary charts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``started_at``, ``ended_at`` and ``month_started``.
        ``start_station_name`` and ``end_station_name`` are also required
        when ``plot`` is true.
    plot : bool, default False
        When true, show three bar charts (start stations, end stations and
        duration counts). The default keeps the previous behaviour of this
        function, which never drew anything.

    Returns
    -------
    pandas.DataFrame
        Columns ``month_started``, ``duration`` and ``count``: how often each
        exact ride duration occurs within each month.

    Notes
    -----
    Side effect: ``df`` is modified in place. ``started_at`` and ``ended_at``
    are converted to datetimes and a ``duration`` column is added.
    """
    def popular_start_station(df):
        """Ride counts per (month_started, start_station_name)."""
        return df.groupby('month_started')['start_station_name'].value_counts().reset_index(name='count')

    def popular_end_station(df):
        """Ride counts per (month_started, end_station_name)."""
        return df.groupby('month_started')['end_station_name'].value_counts().reset_index(name='count')

    def duration(df):
        """Add ``duration`` to ``df`` and count its values per month."""
        # These writes are deliberately in place: the ``duration`` column on
        # the caller's frame is part of this function's existing behaviour.
        df['started_at'] = pd.to_datetime(df['started_at'])
        df['ended_at'] = pd.to_datetime(df['ended_at'])
        df['duration'] = df['ended_at'] - df['started_at']
        return df.groupby('month_started')['duration'].value_counts().reset_index(name='count')

    def bar_chart(table, x, title, hue=None):
        """Draw one 10x6 bar chart of ``count`` against column ``x``."""
        plt.figure(figsize=(10, 6))
        sns.barplot(x=x, y='count', hue=hue, data=table)
        plt.title(title)
        plt.xticks(rotation=90)
        sns.despine()
        # tight_layout only has an effect once the artists exist, so it must
        # run after drawing rather than right after figure creation.
        plt.tight_layout()
        plt.show()

    def plotting(df, duration_counts):
        """Show the start-station, end-station and duration charts."""
        sns.set(style="white")

        # Most popular start station
        bar_chart(popular_start_station(df), 'start_station_name',
                  'Most popular start station', hue='month_started')

        # Most popular end station
        bar_chart(popular_end_station(df), 'end_station_name',
                  'Most popular end station', hue='month_started')

        # Duration
        # NOTE(review): barplot aggregates y with the mean by default, so each
        # bar is the mean count per distinct duration in that month - confirm
        # this is the intended "Duration" figure.
        bar_chart(duration_counts, 'month_started', 'Duration')

    # Compute once and reuse for both the optional chart and the return value.
    duration_counts = duration(df)
    if plot:
        plotting(df, duration_counts)
    return duration_counts

    
    


In [50]:
# Working frame: read every CSV, derive the start-time parts, then keep only
# the rides that begin at the ten busiest start stations.
df = (
    load_data(path)
    .pipe(preprocess)
    .pipe(filter_top_station)
)


In [56]:
# Per-month frequency table of exact ride durations; this call draws no charts.
duration_df = analyse_data_statistics(df)

In [57]:
# NOTE(review): the recorded output runs to ~126k rows and its tail rows all
# have count 1, because durations are counted at full (sub-second) resolution.
# Binning the durations would likely give a more useful table - TODO confirm.
duration_df

Unnamed: 0,month_started,duration,count
0,1,0 days 00:04:36,49
1,1,0 days 00:04:24,46
2,1,0 days 00:04:42,46
3,1,0 days 00:04:46,46
4,1,0 days 00:04:27,45
...,...,...,...
125893,9,1 days 00:59:55.186000,1
125894,9,1 days 00:59:55.575000,1
125895,9,1 days 00:59:55.905000,1
125896,9,1 days 00:59:56.512000,1


In [32]:
# Monthly ride counts per start station over ALL stations.
# A dedicated name is used for the raw load so the preprocessed, filtered `df`
# is not overwritten: later cells group `df` by `month_started`, a column the
# raw frame does not have, and would fail under Restart & Run All.
rides_all = load_data(path)
rides_all_started = pd.to_datetime(rides_all['started_at'], format='ISO8601')
grouped_df = (
    rides_all.assign(month=rides_all_started.dt.month)
    .groupby('month')['start_station_name']
    .value_counts()
    .reset_index(name='count')
)

In [35]:
# Show the per-month start-station ride counts.
grouped_df

Unnamed: 0,month,start_station_name,count
0,1,Hoboken Terminal - River St & Hudson Pl,2741
1,1,Grove St PATH,2334
2,1,Hoboken Terminal - Hudson St & Hudson Pl,1363
3,1,Bergen Ave & Sip Ave,1338
4,1,Newport PATH,1282
...,...,...,...
785,9,5 Corners Library,363
786,9,Jackson Square,253
787,9,Union St,214
788,9,Bergen Ave & Stegman St,165


In [44]:
# Names of the ten most frequent start stations, busiest first.
df['start_station_name'].value_counts().index[:10].tolist()

['Hoboken Terminal - River St & Hudson Pl',
 'Grove St PATH',
 'City Hall - Washington St & 1 St',
 'Hoboken Terminal - Hudson St & Hudson Pl',
 'Newport PATH',
 'South Waterfront Walkway - Sinatra Dr & 1 St',
 'Exchange Pl',
 'Hamilton Park',
 'Bergen Ave & Sip Ave',
 'Newport Pkwy']

In [61]:
# NOTE(review): .count() gives one non-null count per column for each month,
# and this value of month_count is overwritten by the .size() version in the
# next cell - this cell looks like a leftover experiment.
month_count = df.groupby('month_started').count().reset_index()

In [67]:
# Number of rides per start month, as columns (month_started, count).
month_count = (
    df.groupby('month_started')
    .size()
    .reset_index(name='count')
)

In [68]:
# NOTE(review): the recorded output shows only 8 rides for month 5 against
# 14k-31k for every other month - check that the May CSV is present and
# complete in the data directory before drawing conclusions.
month_count

Unnamed: 0,month_started,count
0,1,14060
1,2,15386
2,3,18095
3,4,21767
4,5,8
5,6,30342
6,7,30299
7,8,29032
8,9,30910
