## Chicago bikeshare data

In [1]:
import pandas as pd

filename = 'chicago.csv'

# load data file into a dataframe
df = pd.read_csv(filename)

In [2]:
df.head()

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 9 columns):
Unnamed: 0       300000 non-null int64
Start Time       300000 non-null object
End Time         300000 non-null object
Trip Duration    300000 non-null int64
Start Station    300000 non-null object
End Station      300000 non-null object
User Type        300000 non-null object
Gender           238948 non-null object
Birth Year       238981 non-null float64
dtypes: float64(1), int64(2), object(6)
memory usage: 20.6+ MB


In [4]:
df.isnull().sum()

Unnamed: 0           0
Start Time           0
End Time             0
Trip Duration        0
Start Station        0
End Station          0
User Type            0
Gender           61052
Birth Year       61019
dtype: int64

In [5]:
# convert the Start Time column to datetime
df['Start Time'] = pd.to_datetime(df['Start Time'])

# extract hour from the Start Time column to create an hour column
df['hour'] = df['Start Time'].dt.hour

# find the most common hour (from 0 to 23)
df.groupby(['hour'])
# popular_hour = 
    
# print('Most Frequent Start Hour:', popular_hour)


<pandas.core.groupby.DataFrameGroupBy object at 0x10ab917b8>

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,hour
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0,15
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0,18
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,8
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0,13
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,14


#### 1. Most popular start hour 

In [9]:
df['hour'].value_counts().idxmax()

17

In [11]:
# another method
df['hour'].mode()[0]

17

In [15]:
df['month'] = df['Start Time'].dt.month

In [19]:
df['month'].value_counts()

6    98081
5    66755
4    51659
2    32057
3    29639
1    21809
Name: month, dtype: int64

In [21]:
months = ['january', 'february', 'march', 'april', 'may', 'june']
month = months.index('march') + 1 
month

3

In [33]:
df['day_of_week'] = df['Start Time'].dt.dayofweek

In [34]:
df.head()

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,hour,month,day_of_week
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0,15,6,4
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0,18,5,3
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,8,1,2
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0,13,3,0
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,14,1,1


In [44]:
df['Birth Year'].max()

2016.0

#### 2. Breakdown of user type

In [13]:
user_types = df['User Type'].value_counts()
user_types

Subscriber    238889
Customer       61110
Dependent          1
Name: User Type, dtype: int64

#### 3: Load and Filter the Dataset
This is a bit of a bigger task, which involves choosing a dataset to load and filtering it based on a specified month and day. In the quiz below, you'll implement the load_data() function, which you can use directly in your project. There are four steps:

1.Load the dataset for the specified city. Index the global CITY_DATA dictionary object to get the corresponding filename for the given city name.  
2.Create month and day_of_week columns. Convert the "Start Time" column to datetime and extract the month number and weekday name into separate columns using the datetime module.  
3.Filter by month. Since the month parameter is given as the name of the month, you'll need to first convert this to the corresponding month number. Then, select rows of the dataframe that have the specified month and reassign this as the new dataframe.  
4.Filter by day of week. Select rows of the dataframe that have the specified day of week and reassign this as the new dataframe. (Note: Capitalize the day parameter with the title() method to match the title case used in the day_of_week column!)

In [None]:
import pandas as pd

CITY_DATA = { 'chicago': 'chicago.csv',
              'new york city': 'new_york_city.csv',
              'washington': 'washington.csv' }

def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - pandas DataFrame containing city data filtered by month and day
    """
    
    # load data file into a dataframe
    df = pd.read_csv(CITY_DATA[city])

    # convert the Start Time column to datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])

    # extract month and day of week from Start Time to create new columns
    df['month'] = df['Start Time'].dt.month
    df['day_of_week'] = df['Start Time'].dt.dayofweek


    # filter by month if applicable
    if month != 'all':
        # use the index of the months list to get the corresponding int
        months = ['january', 'february', 'march', 'april', 'may', 'june']
        month = months.index(month) + 1
    
        # filter by month to create the new dataframe
        df = df[df['month']==month]

    # filter by day of week if applicable
    if day != 'all':
        # use the input day to get index 
        days = ['monday','tuesday','wednesday','thursday','friday','saturday','sunday']
        dayofweek = days.index(day)
        # filter by day of week to create the new dataframe
        df = df[df['day_of_week']==dayofweek]
    
    return df
    
df = load_data('chicago', 'march', 'friday')


In [39]:
df.head()

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,month,day_of_week
37,395803,2017-03-24 15:35:55,2017-03-24 15:46:10,615,Dearborn St & Erie St,State St & Van Buren St,Subscriber,Male,1989.0,3,4
93,395735,2017-03-24 15:32:04,2017-03-24 15:52:53,1249,Sedgwick St & Webster Ave,Western Ave & Winnebago Ave,Subscriber,Female,1964.0,3,4
175,395402,2017-03-24 15:10:29,2017-03-24 15:19:44,555,Franklin St & Monroe St,Aberdeen St & Monroe St,Subscriber,Male,1987.0,3,4
190,393400,2017-03-24 12:29:30,2017-03-24 12:48:56,1166,Southport Ave & Wellington Ave,Lake Shore Dr & North Blvd,Subscriber,Female,1984.0,3,4
198,427496,2017-03-31 08:25:53,2017-03-31 08:39:09,796,Clinton St & Jackson Blvd,Racine Ave (May St) & Fulton St,Subscriber,Male,1983.0,3,4


In [40]:
df['Trip Duration'].sum()

4344099

In [42]:
df['Trip Duration'].mean()

747.3075864441769