## Process
We will be processing the data for further analysis and visualizations.


In [1]:
# Begin by importing the required libraries and setting up the pandas display options.
import numpy as np
import pandas as pd
import os
import gc
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# pandas' SettingWithCopyWarning, so a silent run is not proof of a clean run.
warnings.filterwarnings("ignore")

# Widen the pandas display limits: up to 9999 rows per frame and 999 characters per cell.
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 999)

In [2]:
# Load the complete trips dataset that was prepared during the previous step.
# NOTE(review): `path` is an absolute, machine-specific directory and the working
# directory is changed as a side effect; adjust it before running elsewhere.
path = r"C:\Users\utsav\OneDrive\Documents\ML\Google Data Analytics Course\Course 8 - Capstone\Case Study 1\data\Unzipped"
os.chdir(path)

combined_csv = 'combined_trips_data.csv'
trip_data = pd.read_csv(combined_csv)

In [3]:
# List every source file that contributed rows to the combined dataset.
trip_data.loc[:, 'file'].unique()

array(['Divvy_Trips_2014_Q1Q2.csv', 'Divvy_Trips_2015-Q1.csv',
       'Divvy_Trips_2016_Q1.csv', 'Divvy_Trips_2017_Q1.csv',
       'Divvy_Trips_2014-Q4.csv', 'Divvy_Trips_2015_Q4.csv',
       'Divvy_Trips_2016_Q4.csv', 'Divvy_Trips_2017_Q4.csv',
       'Divvy_Trips_2013.csv', 'Divvy_Trips_2018_Q1.csv',
       'Divvy_Trips_2018_Q2.csv', 'Divvy_Trips_2018_Q3.csv',
       'Divvy_Trips_2018_Q4.csv', 'Divvy_Trips_2019_Q1.csv',
       'Divvy_Trips_2019_Q2.csv', 'Divvy_Trips_2019_Q3.csv',
       'Divvy_Trips_2019_Q4.csv', 'Divvy_Trips_2020_Q1.csv',
       '202004-divvy-tripdata.csv', '202005-divvy-tripdata.csv',
       '202006-divvy-tripdata.csv', '202007-divvy-tripdata.csv',
       '202008-divvy-tripdata.csv', '202009-divvy-tripdata.csv',
       '202010-divvy-tripdata.csv', '202011-divvy-tripdata.csv',
       '202012-divvy-tripdata.csv', '202101-divvy-tripdata.csv',
       '202102-divvy-tripdata.csv', '202103-divvy-tripdata.csv',
       '202104-divvy-tripdata.csv', '202105-divvy-tripdata.csv

In [4]:
# Restrict the analysis to the six most recent monthly files (Dec 2020 - May 2021):
# 1. We primarily want to analyze the current trend.
# 2. Tableau Public limits the number of data points (15 million), so a smaller subset is used.
# The earlier 2020 files ('Divvy_Trips_2020_Q1.csv' and the monthly files '202004' through
# '202011') are deliberately left out to keep the extract small.
files_filter = [
    '202012-divvy-tripdata.csv', '202101-divvy-tripdata.csv',
    '202102-divvy-tripdata.csv', '202103-divvy-tripdata.csv',
    '202104-divvy-tripdata.csv', '202105-divvy-tripdata.csv',
]

# .copy() makes the subset an independent frame, so the columns added and dropped
# later are written to this subset itself rather than to a slice of trip_data.
filtered_trip_data = trip_data[trip_data['file'].isin(files_filter)].copy()

In [5]:
# Preview 15 randomly chosen rows (no seed is set, so the selection differs between runs).
filtered_trip_data.sample(n=15)

Unnamed: 0.1,Unnamed: 0,ride_id,started_at,start_station_id,start_station_name,start_lat,start_lng,ended_at,end_station_id,end_station_name,end_lat,end_lng,bikeid,birthyear,usertype,tripduration,gender,rideable_type,file
15378419,3816652,38ED1C24758FDC29,2021-04-21 08:26:19,TA1307000107,Sheridan Rd & Montrose Ave,41.96167,-87.65464,2021-04-21 08:30:34,TA1309000041,Sheridan Rd & Lawrence Ave,41.96952,-87.65469,,,member,,,classic_bike,202104-divvy-tripdata.csv
15967816,3924388,8DF37970029230A5,2021-05-29 16:00:18,KA1503000064,Dusable Harbor,41.88698,-87.61281,2021-05-29 18:23:21,KA1503000044,Rush St & Hubbard St,41.89017,-87.62618,,,casual,,,docked_bike,202105-divvy-tripdata.csv
15147981,3400726,40323C988D3ABDEB,2021-03-30 18:06:15,TA1305000029,State St & Randolph St,41.88473,-87.62773,2021-03-30 18:13:24,TA1307000062,Sedgwick St & Huron St,41.89467,-87.63844,,,member,,,classic_bike,202103-divvy-tripdata.csv
15942900,4181699,2F76C62C1F7CB1C3,2021-05-27 15:39:57,TA1306000011,Wells St & Walton St,41.89993,-87.63443,2021-05-27 15:41:40,13017,Franklin St & Chicago Ave,41.89675,-87.63567,,,member,,,classic_bike,202105-divvy-tripdata.csv
15748213,3966489,BED76D6352C2335D,2021-05-18 08:55:14,WL-012,Clinton St & Washington Blvd,41.88338,-87.64117,2021-05-18 09:05:20,TA1308000029,Stetson Ave & South Water St,41.88684,-87.62232,,,member,,,classic_bike,202105-divvy-tripdata.csv
14995062,3468237,044E07A7C911B7E0,2021-03-11 16:01:23,TA1307000005,Indiana Ave & 26th St,41.84571,-87.62258,2021-03-11 16:13:48,,,41.87,-87.63,,,member,,,electric_bike,202103-divvy-tripdata.csv
15897698,3893903,12BEE9A54102F3F1,2021-05-25 06:53:33,TA1307000159,Paulina St & 18th St,41.8579,-87.66874,2021-05-25 07:01:50,TA1309000064,Wolcott Ave & Polk St,41.87126,-87.67369,,,member,,,classic_bike,202105-divvy-tripdata.csv
15725033,4296775,DD8F636593761E5F,2021-05-16 17:16:26,KA1504000133,Rush St & Cedar St,41.90231,-87.62769,2021-05-16 20:13:08,KA1504000133,Rush St & Cedar St,41.90231,-87.62769,,,casual,,,classic_bike,202105-divvy-tripdata.csv
15780344,3990855,C55AA43DA684FE0F,2021-05-20 13:14:48,WL-012,Clinton St & Washington Blvd,41.88338,-87.64117,2021-05-20 13:16:33,TA1305000032,Clinton St & Madison St,41.88224,-87.64107,,,member,,,classic_bike,202105-divvy-tripdata.csv
15483840,3621828,18ADFCEC9B1313FC,2021-04-30 09:58:57,13036,Michigan Ave & Madison St,41.88213,-87.62512,2021-04-30 10:21:57,13431,Adler Planetarium,41.8661,-87.60727,,,member,,,classic_bike,202104-divvy-tripdata.csv


In [6]:
# Preliminary information on the data columns: dtype and non-null count for each one.
filtered_trip_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1375388 entries, 14650208 to 16025595
Data columns (total 19 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Unnamed: 0          1375388 non-null  int64  
 1   ride_id             1375388 non-null  object 
 2   started_at          1375388 non-null  object 
 3   start_station_id    1256370 non-null  object 
 4   start_station_name  1256370 non-null  object 
 5   start_lat           1375388 non-null  float64
 6   start_lng           1375388 non-null  float64
 7   ended_at            1375388 non-null  object 
 8   end_station_id      1243421 non-null  object 
 9   end_station_name    1243421 non-null  object 
 10  end_lat             1374074 non-null  float64
 11  end_lng             1374074 non-null  float64
 12  bikeid              0 non-null        float64
 13  birthyear           0 non-null        float64
 14  usertype            1375388 non-null  object 
 15  tripdur

In [7]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
filtered_trip_data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0,1375388.0,,,,3670916.5,397040.460358,2983223.0,3327069.75,3670916.5,4014763.25,4358610.0
ride_id,1375388.0,1375388.0,077C210F6FA90834,1.0,,,,,,,
started_at,1375388.0,1222518.0,2021-05-30 13:00:38,6.0,,,,,,,
start_station_id,1256370.0,689.0,13022,16497.0,,,,,,,
start_station_name,1256370.0,696.0,Streeter Dr & Grand Ave,16497.0,,,,,,,
start_lat,1375388.0,,,,41.90133,0.045461,41.64,41.88132,41.89776,41.92883,42.07
start_lng,1375388.0,,,,-87.64468,0.026732,-87.78,-87.65912,-87.64107,-87.62689,-87.52
ended_at,1375388.0,1221489.0,2021-05-29 14:14:57,14.0,,,,,,,
end_station_id,1243421.0,689.0,13022,16854.0,,,,,,,
end_station_name,1243421.0,696.0,Streeter Dr & Grand Ave,16854.0,,,,,,,


In [8]:
# The data types of some variables need to be corrected (e.g. the date-time columns).
# Apart from that, the filtered data looks pretty clean.

In [9]:
# Number of missing values in each column.
filtered_trip_data.isna().sum()

Unnamed: 0                  0
ride_id                     0
started_at                  0
start_station_id       119018
start_station_name     119018
start_lat                   0
start_lng                   0
ended_at                    0
end_station_id         131967
end_station_name       131967
end_lat                  1314
end_lng                  1314
bikeid                1375388
birthyear             1375388
usertype                    0
tripduration          1375388
gender                1375388
rideable_type               0
file                        0
dtype: int64

## Removing Unwanted Columns.

In [10]:
# The following columns do not help us in answering the question, so we will drop them:
# * gender
# * birthyear
# * bikeid
# * ride_id
# * Row number ("Unnamed: 0")

In [11]:
# Drop the columns that are not needed for the analysis. Reassigning the result
# (rather than inplace=True) makes the data lineage explicit.
unwanted_columns = ['gender', 'birthyear', 'bikeid', 'ride_id', "Unnamed: 0"]
filtered_trip_data = filtered_trip_data.drop(columns=unwanted_columns)

## Transforming Columns

In [12]:
# Parse the 'started_at' and 'ended_at' text columns (YYYY-MM-DD HH:MM:SS) into pandas
# datetimes, so that durations and weekdays can be derived and Tableau reads them as date-times.
timestamp_format = '%Y-%m-%d %H:%M:%S'
for column in ('started_at', 'ended_at'):
    filtered_trip_data[column] = pd.to_datetime(filtered_trip_data[column], format=timestamp_format)

## Creating New Columns

In [13]:
# Create a new column for the distance travelled.
from math import radians, cos, sin, asin, sqrt
# vectorized haversine function
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    Great-circle (haversine) distance between two sets of points on a sphere.

    Slightly modified version of http://stackoverflow.com/a/29546836/2901002

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like of numbers
        Coordinates of the start and end points. Scalars and arrays/Series may be
        mixed as long as their shapes broadcast together.
    to_radians : bool, default True
        When True the inputs are treated as decimal degrees and converted to
        radians first; pass False if they are already in radians.
    earth_radius : float, default 6371
        Radius of the sphere. The result is expressed in the same unit
        (6371 is the mean Earth radius in kilometres).

    Returns
    -------
    With ``to_radians=True`` a NumPy array (or a NumPy scalar for scalar inputs);
    with ``to_radians=False`` the type follows the inputs. NaN coordinates give NaN.
    """
    if to_radians:
        # Convert each argument on its own so scalars and arrays can be mixed;
        # stacking all four into one list only works for equal-length inputs.
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(coord, dtype=float))
            for coord in (lat1, lon1, lat2, lon2)
        )

    a = np.sin((lat2 - lat1) / 2.0) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2

    # Rounding can push `a` marginally outside [0, 1] (e.g. for near-antipodal points),
    # which would make sqrt/arcsin return NaN; clamp it back into the valid range.
    a = np.clip(a, 0.0, 1.0)

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

In [14]:
# Great-circle distance between each trip's start and end coordinates
# (haversine with its default arguments).
coordinate_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
filtered_trip_data['distance'] = haversine(
    *(filtered_trip_data[column] for column in coordinate_columns)
)

In [15]:
# Derive the trip duration (in seconds) and the day of the week (Monday=0, Sunday=6).
# Hypothesis: riders behave differently on different days of the week and ride at different speeds.
#
# .dt.total_seconds() returns float seconds on every pandas version. In contrast,
# .astype('timedelta64[s]') yields a timedelta column on pandas >= 2.0, which would break
# the numeric comparisons (e.g. tripduration > 60) made further down. The timestamps have
# whole-second resolution, so the values are unchanged.
filtered_trip_data['tripduration'] = (
    filtered_trip_data['ended_at'] - filtered_trip_data['started_at']
).dt.total_seconds()
filtered_trip_data['weekday'] = filtered_trip_data['started_at'].dt.dayofweek

filtered_trip_data.sample(10)

Unnamed: 0,started_at,start_station_id,start_station_name,start_lat,start_lng,ended_at,end_station_id,end_station_name,end_lat,end_lng,usertype,tripduration,rideable_type,file,distance,weekday
14745751,2020-12-20 13:15:26,TA1306000029,Lake Shore Dr & Ohio St,41.89257,-87.61449,2020-12-20 14:17:52,TA1306000029,Lake Shore Dr & Ohio St,41.89257,-87.61449,casual,3746.0,classic_bike,202012-divvy-tripdata.csv,0.0,6
14754668,2020-12-22 12:15:45,TA1306000003,Desplaines St & Kinzie St,41.88872,-87.64445,2020-12-22 12:21:02,13053,Green St & Randolph St,41.88367,-87.64867,casual,317.0,classic_bike,202012-divvy-tripdata.csv,0.66133,1
14766794,2020-12-26 16:59:47,13263,Columbus Dr & Randolph St,41.88473,-87.61952,2020-12-26 17:09:48,TA1307000120,Green St & Madison St,41.88189,-87.64879,member,601.0,classic_bike,202012-divvy-tripdata.csv,2.443617,5
15961559,2021-05-29 12:43:42,LF-005,Lake Shore Dr & North Blvd,41.91172,-87.6268,2021-05-29 13:08:26,13221,Wood St & Milwaukee Ave,41.90766,-87.67255,casual,1484.0,docked_bike,202105-divvy-tripdata.csv,3.812685,5
15573111,2021-05-05 19:47:47,13091,Western Ave & 21st St,41.85402,-87.68585,2021-05-05 20:55:26,,,41.86,-87.68,member,4059.0,electric_bike,202105-divvy-tripdata.csv,0.822731,2
15289760,2021-04-12 11:55:43,TA1309000058,Lincoln Ave & Fullerton Ave,41.92593,-87.64928,2021-04-12 12:04:53,13154,Sheffield Ave & Kingsbury St,41.91082,-87.6532,member,550.0,electric_bike,202104-divvy-tripdata.csv,1.711175,0
14751892,2020-12-21 17:04:28,TA1307000117,Wabash Ave & Grand Ave,41.89113,-87.62694,2020-12-21 17:11:21,15534,Field Blvd & South Water St,41.88617,-87.61675,member,413.0,electric_bike,202012-divvy-tripdata.csv,1.007816,0
15502869,2021-05-01 14:08:55,13042,Michigan Ave & Oak St,41.90096,-87.62378,2021-05-01 14:35:25,TA1309000049,Lake Shore Dr & Belmont Ave,41.94078,-87.63919,casual,1590.0,classic_bike,202105-divvy-tripdata.csv,4.60769,5
15393234,2021-04-22 19:58:16,,,41.91,-87.65,2021-04-22 20:06:41,13193,Larrabee St & Webster Ave,41.92171,-87.64418,member,505.0,electric_bike,202104-divvy-tripdata.csv,1.38829,3
15700917,2021-05-15 16:49:47,657,Wood St & Augusta Blvd,41.89918,-87.6722,2021-05-15 17:05:06,13059,Bissell St & Armitage Ave,41.91844,-87.65222,casual,919.0,classic_bike,202105-divvy-tripdata.csv,2.705589,5


## Cleaning Data

In [16]:
# As seen before, the data was pretty clean. Let's explore the newly added fields:
filtered_trip_data.loc[:, ['tripduration', 'weekday']].describe()

Unnamed: 0,tripduration,weekday
count,1375388.0,1375388.0
mean,924.3571,3.195281
std,30554.54,2.048481
min,-1742998.0,0.0
25%,411.0,1.0
50%,738.0,3.0
75%,1373.0,5.0
max,3235296.0,6.0


In [17]:
# A few trips have a negative trip duration (ended_at is earlier than started_at).
# Inspect those rows first, before looking at other very short durations.
filtered_trip_data.loc[filtered_trip_data['tripduration'] < 0]

Unnamed: 0,started_at,start_station_id,start_station_name,start_lat,start_lng,ended_at,end_station_id,end_station_name,end_lat,end_lng,usertype,tripduration,rideable_type,file,distance,weekday
14650288,2020-12-01 05:45:22,TA1307000005,Indiana Ave & 26th St,41.84569,-87.62248,2020-12-01 05:44:49,13001,Michigan Ave & Washington St,41.88398,-87.62468,member,-33.0,docked_bike,202012-divvy-tripdata.csv,4.26155,1
14650518,2020-12-01 07:14:28,TA1307000070,Damen Ave & Thomas St (Augusta Blvd),41.90131,-87.67741,2020-12-01 07:13:38,KA1504000104,Paulina St & Flournoy St,41.87295,-87.66913,member,-50.0,docked_bike,202012-divvy-tripdata.csv,3.227118,1
14650527,2020-12-01 07:16:21,638,Clinton St & Jackson Blvd,41.87812,-87.63984,2020-12-01 07:15:48,13285,Wood St & Taylor St (Temp),41.86927,-87.67373,member,-33.0,docked_bike,202012-divvy-tripdata.csv,2.973572,1
14650536,2020-12-01 07:17:34,TA1307000128,Dearborn Pkwy & Delaware Pl,41.89897,-87.62991,2020-12-01 07:17:21,331,Halsted St & Clybourn Ave,41.90967,-87.64813,member,-13.0,docked_bike,202012-divvy-tripdata.csv,1.920731,1
14650546,2020-12-01 07:19:01,13268,Damen Ave & Wellington Ave,41.93588,-87.67842,2020-12-01 07:18:12,13268,Damen Ave & Wellington Ave,41.93588,-87.67842,member,-49.0,docked_bike,202012-divvy-tripdata.csv,0.0,1
14650557,2020-12-01 07:21:16,TA1307000150,Pine Grove Ave & Waveland Ave,41.94947,-87.64645,2020-12-01 07:20:43,13063,Sheridan Rd & Irving Park Rd,41.95424,-87.65441,member,-33.0,docked_bike,202012-divvy-tripdata.csv,0.845361,1
14650586,2020-12-01 07:28:05,TA1307000041,Lake Shore Dr & Wellington Ave,41.93669,-87.63683,2020-12-01 07:27:46,632,Clark St & Newport St,41.94454,-87.65468,member,-19.0,docked_bike,202012-divvy-tripdata.csv,1.715124,1
14650831,2020-12-01 08:15:23,TA1305000035,State St & Van Buren St,41.87718,-87.62784,2020-12-01 08:14:52,TA1307000005,Indiana Ave & 26th St,41.84569,-87.62248,member,-31.0,docked_bike,202012-divvy-tripdata.csv,3.529551,1
14651166,2020-12-01 09:37:26,KA1504000079,Larrabee St & Division St,41.90349,-87.64335,2020-12-01 09:37:06,15534,Field Blvd & South Water St,41.88635,-87.61752,member,-20.0,docked_bike,202012-divvy-tripdata.csv,2.864129,1
14651186,2020-12-01 09:47:03,TA1307000005,Indiana Ave & 26th St,41.84569,-87.62248,2020-12-01 09:46:33,13036,Michigan Ave & Madison St,41.88213,-87.62512,member,-30.0,docked_bike,202012-divvy-tripdata.csv,4.057837,1


In [18]:
# Non-null count per column among the trips with a negative duration
# (445 such rows in the recorded run).
filtered_trip_data.loc[filtered_trip_data['tripduration'] < 0].count()

started_at            445
start_station_id      401
start_station_name    401
start_lat             445
start_lng             445
ended_at              445
end_station_id        391
end_station_name      391
end_lat               443
end_lng               443
usertype              445
tripduration          445
rideable_type         445
file                  445
distance              443
weekday               445
dtype: int64

In [19]:
# Build one row mask per cleaning rule, then apply all of them in a single step.

# Trips that start or end at Cyclistic's headquarters ("HQ QR") are not real rides.
# Rows whose station name is missing compare as "not equal" and are therefore kept.
not_hq = (
    (filtered_trip_data['start_station_name'] != "HQ QR")
    & (filtered_trip_data['end_station_name'] != "HQ QR")
)

# Trips with the same start and end station, or covering no distance, may be errors
# in the data collection software. A missing distance also fails the `> 0` test.
moved = (
    (filtered_trip_data['end_station_name'] != filtered_trip_data['start_station_name'])
    & (filtered_trip_data['distance'] > 0)
)

# Keep only rows that have end-point coordinates.
has_end_point = filtered_trip_data['end_lat'].notnull()

# Remove erroneous durations: values at or below -86400 s (a day or more "backwards")
# and trips whose absolute duration is 60 s or less.
duration = filtered_trip_data['tripduration']
plausible_duration = (duration > -86400) & (duration.abs() > 60)

filtered_trip_data = filtered_trip_data[not_hq & moved & has_end_point & plausible_duration]

# Negative trip durations that survive the filters above (2 rows in the recorded run).
filtered_trip_data[filtered_trip_data['tripduration'] < 0].count()

started_at            2
start_station_id      1
start_station_name    1
start_lat             2
start_lng             2
ended_at              2
end_station_id        1
end_station_name      1
end_lat               2
end_lng               2
usertype              2
tripduration          2
rideable_type         2
file                  2
distance              2
weekday               2
dtype: int64

In [20]:
# Show the rows that still have a negative trip duration.
filtered_trip_data.loc[filtered_trip_data['tripduration'] < 0]

Unnamed: 0,started_at,start_station_id,start_station_name,start_lat,start_lng,ended_at,end_station_id,end_station_name,end_lat,end_lng,usertype,tripduration,rideable_type,file,distance,weekday
14686741,2020-12-08 00:05:18,,,41.76,-87.56,2020-12-08 00:01:07,KA1503000020,South Shore Dr & 74th St,41.76223,-87.55912,member,-251.0,electric_bike,202012-divvy-tripdata.csv,0.258484,1
14797790,2021-01-06 18:33:12,TA1306000010,Daley Center Plaza,41.88394,-87.62931,2021-01-06 18:31:07,,,41.88,-87.63,member,-125.0,electric_bike,202101-divvy-tripdata.csv,0.441816,2


In [21]:
# Distribution of the remaining negative-duration trips (2 in the recorded run)
# across source file, user type and weekday.
negative_trips = filtered_trip_data[filtered_trip_data['tripduration'] < 0]
for column in ('file', 'usertype', 'weekday'):
    print(negative_trips[column].value_counts())



202101-divvy-tripdata.csv    1
202012-divvy-tripdata.csv    1
Name: file, dtype: int64
member    2
Name: usertype, dtype: int64
1    1
2    1
Name: weekday, dtype: int64


In [22]:
# Only a handful of negative-duration trips remain and they are not concentrated in any
# single file, user type or weekday, so removing them should not bias the data.
# Keep strictly positive trip durations only.
filtered_trip_data = filtered_trip_data.loc[filtered_trip_data['tripduration'] > 0]

In [23]:
# Now let's review the rides lasting longer than one day (86400 s):
# their share of the non-null values in each column.
longer_than_a_day = filtered_trip_data['tripduration'] > 86400
filtered_trip_data[longer_than_a_day].count() / filtered_trip_data.count()

started_at            0.000327
start_station_id      0.000356
start_station_name    0.000356
start_lat             0.000327
start_lng             0.000327
ended_at              0.000327
end_station_id        0.000340
end_station_name      0.000340
end_lat               0.000327
end_lng               0.000327
usertype              0.000327
tripduration          0.000327
rideable_type         0.000327
file                  0.000327
distance              0.000327
weekday               0.000327
dtype: float64

In [24]:
# Rides longer than one day make up less than 0.05% of the data, so we can remove these
# outliers to get clearer insights. Trips of 60 s or less will be removed as well.
filtered_trip_data.loc[filtered_trip_data['tripduration'] > 86400].count() / filtered_trip_data.count()

started_at            0.000327
start_station_id      0.000356
start_station_name    0.000356
start_lat             0.000327
start_lng             0.000327
ended_at              0.000327
end_station_id        0.000340
end_station_name      0.000340
end_lat               0.000327
end_lng               0.000327
usertype              0.000327
tripduration          0.000327
rideable_type         0.000327
file                  0.000327
distance              0.000327
weekday               0.000327
dtype: float64

In [25]:
# Keep trips lasting more than 60 s and less than one day (86400 s); both bounds are exclusive.
duration = filtered_trip_data['tripduration']
filtered_trip_data = filtered_trip_data[(duration < 86400) & (duration > 60)]

filtered_trip_data.describe()

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,tripduration,distance,weekday
count,1249719.0,1249719.0,1249719.0,1249719.0,1249719.0,1249719.0,1249719.0
mean,41.90219,-87.64508,41.90253,-87.64535,1134.76,2.326445,3.181439
std,0.04383618,0.02594937,0.04389278,0.02605858,2147.711,1.898573,2.038057
min,41.64,-87.78,41.54,-88.07,61.0,0.0008270698,0.0
25%,41.88199,-87.65914,41.88213,-87.65964,418.0,1.050132,1.0
50%,41.89859,-87.6417,41.89897,-87.64254,721.0,1.742714,3.0
75%,41.92889,-87.62785,41.92916,-87.62833,1281.0,2.987281,5.0
max,42.07,-87.52,42.15,-87.51,86313.0,33.80032,6.0


In [26]:
# Let's explore the distance column next: average speed (distance / duration * 3600) of every
# ride with distance > 15. Presumably km/h, given the default haversine radius (TODO confirm).
long_rides = filtered_trip_data[filtered_trip_data['distance'] > 15]
long_rides['distance'] / long_rides['tripduration'] * 60 * 60

14667876    19.821994
14668491    18.417966
14670990    14.102553
14676006    15.091125
14676259    12.584851
14692516    14.072153
14693005    10.649704
14693008    10.669111
14693476     7.398984
14694037    10.833327
14695178     6.626416
14695792    12.524940
14713278    11.518530
14713701    10.972350
14713766    11.644969
14714170    13.700087
14715350    18.830269
14723883    15.665462
14734206    14.376163
14738402    16.774622
14740188    15.820796
14745754    15.489083
14754495    20.653168
14763807    15.144522
14767169    20.203773
14772633    12.940065
14785579     7.700553
14785580     7.709805
14794044    14.068643
14798098    10.315208
14807321     7.930203
14808257    10.012639
14808497     6.717019
14809072    12.645942
14810685    19.922337
14812160    19.536595
14819318     6.660000
14828214    11.391463
14828218    11.529847
14830880    18.928763
14834703    18.782536
14835135     5.369895
14840729     5.142840
14847376    19.744566
14847487    11.371834
14851013  

In [27]:
# The speeds are within usual cycling limits, so the distances are consistent with the trip durations.
# Long-distance rides are therefore not outliers and should be kept in our analysis.

## Exploring Missing Data

In [28]:
# Number of missing values in each column after cleaning.
filtered_trip_data.isna().sum()

started_at                 0
start_station_id      101465
start_station_name    101465
start_lat                  0
start_lng                  0
ended_at                   0
end_station_id        112064
end_station_name      112064
end_lat                    0
end_lng                    0
usertype                   0
tripduration               0
rideable_type              0
file                       0
distance                   0
weekday                    0
dtype: int64

In [29]:
# Most of the missing values were removed while correcting the tripduration values.
# Start and end station ids and names are still missing for many data points, so it is
# not advisable to remove those rows. They could be synthesized from our spreadsheets
# (especially the station data), but we skip that in this analysis: the latitudes and
# longitudes are used to capture the riders' behaviour instead, and the station columns
# are dropped to reduce the file size.
station_columns = ['start_station_id', 'start_station_name', 'end_station_id', 'end_station_name']
filtered_trip_data = filtered_trip_data.drop(columns=station_columns)

In [30]:
# Final step: export the cleaned and processed data (index included) for further analysis and visualizations.
# NOTE(review): the output location is an absolute, machine-specific path.
output_csv = r"C:\Users\utsav\OneDrive\Documents\ML\Google Data Analytics Course\Course 8 - Capstone\Case Study 1\data\Unzipped\filtered_trip_data.csv"
filtered_trip_data.to_csv(output_csv, index=True)

In [31]:
# Release the two large frames and run a garbage-collection pass.
del trip_data, filtered_trip_data
gc.collect()

20