In [1]:
import pandas as pd
import numpy as np
import time

### Loading Data

In [2]:
# Load the raw trip records and derive the calendar columns that the
# filtering and statistics cells below rely on.
print('Loading Data ....')
df = pd.read_csv('./data/new_york_city.csv')

# Parse both timestamp columns so the .dt accessor can be used on them.
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['End Time'] = pd.to_datetime(df['End Time'])

# month_name()/day_name() default to English names, whereas strftime('%B') /
# strftime('%A') follow the process locale. The later filter compares against
# lower-case English literals, so the names are lower-cased here.
df['month'] = df['Start Time'].dt.month_name().str.lower()
df['weekday'] = df['Start Time'].dt.day_name().str.lower()

# Hour of day (0-23) at which each ride started.
df['hour'] = df['Start Time'].dt.hour

print('Data Loaded !', '\n')
df.head()

Loading Data ....
Data Loaded ! 



Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,month,weekday,hour
0,5688089,2017-06-11 14:55:05,2017-06-11 15:08:21,795,Suffolk St & Stanton St,W Broadway & Spring St,Subscriber,Male,1998.0,june,sunday,14
1,4096714,2017-05-11 15:30:11,2017-05-11 15:41:43,692,Lexington Ave & E 63 St,1 Ave & E 78 St,Subscriber,Male,1981.0,may,thursday,15
2,2173887,2017-03-29 13:26:26,2017-03-29 13:48:31,1325,1 Pl & Clinton St,Henry St & Degraw St,Subscriber,Male,1987.0,march,wednesday,13
3,3945638,2017-05-08 19:47:18,2017-05-08 19:59:01,703,Barrow St & Hudson St,W 20 St & 8 Ave,Subscriber,Female,1986.0,may,monday,19
4,6208972,2017-06-21 07:49:16,2017-06-21 07:54:46,329,1 Ave & E 44 St,E 53 St & 3 Ave,Subscriber,Male,1992.0,june,wednesday,7


In [None]:
def show_data(df, chunk_size=5):
    """Interactively page through the rows of ``df`` on stdout.

    The user is first asked whether raw data should be shown at all. If the
    answer is 'y' or 'yes', rows are printed ``chunk_size`` at a time and the
    user is asked before each further chunk. Answering 'n' or 'no' stops the
    paging; any other answer continues.

    Args:
        df: DataFrame whose rows are displayed.
        chunk_size: Number of rows printed per page (defaults to 5).

    Returns:
        None. Output goes to stdout and answers are read with ``input``.
    """
    show_data_option = input('Care to see some raw data from city data? (y/n): ').strip().lower()
    if show_data_option not in ('yes', 'y'):
        return

    total_rows = df.shape[0]
    # Start at row 0 and step by chunk_size so the final (possibly shorter)
    # chunk is shown too, and frames smaller than one chunk still print.
    for start in range(0, total_rows, chunk_size):
        stop = start + chunk_size
        print(df.iloc[start:stop])
        if stop >= total_rows:
            # Nothing left to show, so do not ask for more.
            break
        more_data = input('Want to see more data (y/n): ').strip().lower()
        if more_data in ('no', 'n'):
            break

# Interactive cell: show_data reads answers with input() before printing rows.
show_data(df)

### Removing NaNs

In [10]:
#removing NaN's value
def removing_nan(df):
    """Drop every row of ``df`` that contains a missing value, in place.

    The frame passed in is modified directly and nothing is returned; the
    number of remaining rows is printed.
    """
    df.dropna(axis='index', how='any', inplace=True)
    remaining_rows = len(df)
    print("Row Counts without nan's", remaining_rows)

# Shape of the frame before any rows with missing values are dropped.
print('Before DF shape ', df.shape)

Before DF shape  (300000, 12)


In [11]:
# removing_nan drops rows from df in place (dropna(..., inplace=True)), so the
# shape printed afterwards is that of the same, now-modified frame.
removing_nan(df)
print('After NaN removal shape ', df.shape)

Row Counts without nan's 270102
After NaN removal shape  (270102, 12)


### Filtering by month & day

In [None]:
# Keep only the rides that started in the chosen month AND on the chosen
# weekday. Both values are lower-case names, matching the derived columns.
month = 'february'
weekday = 'tuesday'

# A single combined boolean mask selects the same rows as filtering twice.
df = df[(df['month'] == month) & (df['weekday'] == weekday)]

print(df.shape)
df.head()

### User Stats

In [13]:
def user_stats(df):
    """Displays statistics on bikeshare users.

    Prints, in order: the count of each user type, the count of each gender
    (when a 'Gender' column exists), and the earliest, most recent and most
    common year of birth (when a 'Birth Year' column exists and holds at
    least one non-missing value). The elapsed wall-clock time is printed last.

    Args:
        df: DataFrame with a 'User Type' column and, optionally, 'Gender'
            and 'Birth Year' columns.

    Returns:
        None. All results are written to stdout.
    """

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # Display counts of user types
    print('The No. of Users & User types riding are: ')
    print(df['User Type'].value_counts(), '\n')

    # Display counts of gender
    if 'Gender' in df.columns:
        print('Gender Counts are: \n',
              df['Gender'].value_counts())
    else:
        print('No Gender Data available')

    print()
    # Display earliest, most recent, and most common year of birth.
    # Missing values are dropped first: min()/max() of an empty or all-NaN
    # column is NaN, and int(NaN) raises ValueError.
    birth_years = df['Birth Year'].dropna() if 'Birth Year' in df.columns else None
    if birth_years is None or birth_years.empty:
        print('No birth year data available')
    else:
        # These values are years of birth, not ride dates, so label them so.
        print('Earliest year of birth: ',
              int(birth_years.min()), '\n')

        print('Most recent year of birth: ',
              int(birth_years.max()), '\n')

        # mode() can return several values when there is a tie.
        print('Riders riding with common birth years: ',
              tuple(map(int, birth_years.mode())),
              '\n')

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-' * 40)

In [14]:
# Print the user-type, gender and birth-year summary for the current df.
user_stats(df)


Calculating User Stats...

The No. of Users & User types riding are: 
Subscriber    265737
Customer        4365
Name: User Type, dtype: int64 

Gender Counts are: 
 Male      203535
Female     66567
Name: Gender, dtype: int64

First ride taken can seen back to year:  1886 

Most updated ride taken on year:  2001 

Riders riding with common birth years:  (1989,) 


This took 0.22958159446716309 seconds.
----------------------------------------


### Trip Duration stats

In [None]:
# 'Trip Duration' is stored in seconds: the sample ride that starts at
# 14:55:05 and ends at 15:08:21 has a duration of 795.

# display total travel time, converted from seconds to hours
tot_travel_time = df['Trip Duration'].sum()
print('Total Travel time of all riders took nearly: {:.1f} hrs'.format(
      tot_travel_time / 3600))

# display mean travel time, converted from seconds to minutes (a single ride
# is far shorter than an hour, so minutes is the readable unit here)
avg_trav_time = df['Trip Duration'].mean()
print('The Average Ride time for each person is: {:.1f} mins'.format(avg_trav_time / 60))

### Times of Travel Stats

In [15]:
def time_stats(df):
    """Print the most frequent month, weekday and start hour found in ``df``.

    Reads the 'month', 'weekday' and 'hour' columns and prints the mode(s) of
    each as an array, followed by the elapsed wall-clock time. Returns None.
    """

    print('\nCalculating The Most Frequent Times of Travel...\n')
    began = time.time()

    # (label, column) pairs, reported in this order. print() supplies the
    # single space between the label and the array of modes.
    reports = (
        ('Riders mostly seen to ride in months: ', 'month'),
        ('Riders rides mostly on weekdays: ', 'weekday'),
        ('Common riding hour:', 'hour'),
    )
    for label, column in reports:
        print(label, df[column].mode().values)

    elapsed = time.time() - began
    print("\nThis took %s seconds." % elapsed)
    print('-' * 40)

In [16]:
# Print the most frequent month, weekday and start hour for the current df.
time_stats(df)


Calculating The Most Frequent Times of Travel...

Riders mostly seen to ride in months:  ['june']
Riders rides mostly on weekdays:  ['wednesday']
Common riding hour: [17]

This took 0.15843677520751953 seconds.
----------------------------------------


### Station stats

In [19]:
# Station(s) where rides most often begin.
print('Most popular Start Station is: \n',
      df['Start Station'].mode().values, '\n')

# Station(s) where rides most often finish.
print('Most popular rider end in station: \n',
      df['End Station'].mode().values, '\n')

# Most frequent (start, end) pair. The pairs are counted with groupby because
# DataFrame.mode() on the two columns works column by column, which gives the
# most common start and the most common end independently rather than the
# most common trip.
pair_counts = df.groupby(['Start Station', 'End Station']).size()
ranked_pairs = pair_counts.sort_values(ascending=False).reset_index()
# First row after the descending sort: start station, end station, count.
freq_pair_dest = ranked_pairs.to_numpy()[0]
start_point, end_point, trip_count = freq_pair_dest
print('Frequent combination of Start and End Destination: \n',
      ' - Start point: {} \n  - End point: {} \n  - count: {}'.format(
          start_point, end_point, trip_count))

Most popular Start Station is: 
 ['Pershing Square North'] 

Most popular rider end in station: 
 ['Pershing Square North'] 

Frequent combination of Start and End Destination: 
  - Start point: E 7 St & Avenue A 
  - End point: Cooper Square & E 7 St 
  - count: 166


In [None]:
# Load the trip file of each city; the unpacking order matches the path order.
# NOTE(review): these paths use './bikeshare-project/' while the first cell of
# the notebook reads from './data/'. Confirm which directory is current.
df_chicago, df_ny, df_wash = map(pd.read_csv, (
    './bikeshare-project/chicago.csv',
    './bikeshare-project/new_york_city.csv',
    './bikeshare-project/washington.csv',
))

In [None]:
# Compare the number of rows holding at least one NaN before and after the
# rows are dropped. Both lines use the same row-wise count so the figures are
# directly comparable; the count after dropna is expected to be 0.
print('Prev: ', df_ny.isnull().any(axis=1).sum(), df_ny.shape)
df_without_nan = df_ny.dropna(axis=0)
print('After nan removal: ', df_without_nan.isnull().any(axis=1).sum(), df_without_nan.shape)

In [None]:
# Sanity check on the Chicago data: the number of rows left after dropna must
# equal the total row count minus the rows that contain at least one NaN.
rows_with_nan = df_chicago.isnull().any(axis=1).sum()

print('rows count with nan\'s: ', df_chicago.shape[0])
print('total nan counts: ', rows_with_nan)
# The expected figure is derived from df_chicago itself, not from the
# unrelated frame `df` loaded earlier in the notebook.
print('without nan, rows count MUST be: ', df_chicago.shape[0] - rows_with_nan)

df_nan_removed = df_chicago.dropna(axis=0)
print('without nan, rows CAME to be: ', df_nan_removed.shape[0])

In [None]:
# Column-wise modes: the most common start station and the most common end
# station, each computed independently (not the most common start/end pair).
df_chicago.loc[:, ['Start Station', 'End Station']].mode().to_numpy()