# Practice and Data Exploration

## Explore chicago.csv

In [4]:
import pandas as pd

df = pd.read_csv('chicago.csv')  # load chicago.csv into a pandas DataFrame
print(df.head())  # print the first 5 rows as plain text to get a feel for the columns

   Unnamed: 0           Start Time             End Time  Trip Duration  \
0     1423854  2017-06-23 15:09:32  2017-06-23 15:14:53            321   
1      955915  2017-05-25 18:19:03  2017-05-25 18:45:53           1610   
2        9031  2017-01-04 08:27:49  2017-01-04 08:34:45            416   
3      304487  2017-03-06 13:49:38  2017-03-06 13:55:28            350   
4       45207  2017-01-17 14:53:07  2017-01-17 15:02:01            534   

                   Start Station                   End Station   User Type  \
0           Wood St & Hubbard St       Damen Ave & Chicago Ave  Subscriber   
1            Theater on the Lake  Sheffield Ave & Waveland Ave  Subscriber   
2             May St & Taylor St           Wood St & Taylor St  Subscriber   
3  Christiana Ave & Lawrence Ave  St. Louis Ave & Balmoral Ave  Subscriber   
4         Clark St & Randolph St  Desplaines St & Jackson Blvd  Subscriber   

   Gender  Birth Year  
0    Male      1992.0  
1  Female      1992.0  
2    Male     

In [5]:
df.columns  # list the column labels of the DataFrame

Index(['Unnamed: 0', 'Start Time', 'End Time', 'Trip Duration',
       'Start Station', 'End Station', 'User Type', 'Gender', 'Birth Year'],
      dtype='object')

In [6]:
df.describe()  # summary statistics (count, mean, std, quartiles) for the numeric columns

Unnamed: 0.1,Unnamed: 0,Trip Duration,Birth Year
count,300000.0,300000.0,238981.0
mean,776345.8,936.23929,1980.858223
std,448146.4,1548.792767,11.003329
min,4.0,60.0,1899.0
25%,387136.8,393.0,1975.0
50%,777103.5,670.0,1984.0
75%,1164065.0,1125.0,1989.0
max,1551500.0,86224.0,2016.0


In [7]:
df.info()  # show each column's dtype and non-null count, plus memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     300000 non-null  int64  
 1   Start Time     300000 non-null  object 
 2   End Time       300000 non-null  object 
 3   Trip Duration  300000 non-null  int64  
 4   Start Station  300000 non-null  object 
 5   End Station    300000 non-null  object 
 6   User Type      300000 non-null  object 
 7   Gender         238948 non-null  object 
 8   Birth Year     238981 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 20.6+ MB


In [8]:
# number of trips per gender; rows with a missing Gender are not counted
df['Gender'].value_counts()

Male      181190
Female     57758
Name: Gender, dtype: int64

In [9]:
# number of trips per rider birth year, most common year first
df['Birth Year'].value_counts()

1989.0    14666
1988.0    12490
1991.0    12397
1990.0    12378
1987.0    11939
          ...  
2002.0        2
1909.0        2
2003.0        2
1931.0        1
2004.0        1
Name: Birth Year, Length: 79, dtype: int64

In [10]:
# number of trips that begin at each station, busiest station first
df['Start Station'].value_counts()

Streeter Dr & Grand Ave         6911
Clinton St & Washington Blvd    4306
Lake Shore Dr & Monroe St       4289
Clinton St & Madison St         3744
Canal St & Adams St             3443
                                ... 
Ellis Ave & 83rd St                1
Laramie Ave & Kinzie St            1
Throop St & 52nd St                1
Racine Ave & 65th St               1
Greenwood Ave & 79th St            1
Name: Start Station, Length: 568, dtype: int64

In [11]:
# number of trips that finish at each station, busiest station first
df['End Station'].value_counts()

Streeter Dr & Grand Ave         7512
Clinton St & Washington Blvd    4166
Lake Shore Dr & Monroe St       4016
Clinton St & Madison St         4014
Lake Shore Dr & North Blvd      3863
                                ... 
Stony Island Ave & 82nd St         1
Calumet Ave & 71st St              1
Ashland Ave & 69th St              1
Cicero Ave & Quincy St             1
Cicero Ave & Flournoy St           1
Name: End Station, Length: 572, dtype: int64

## Practice 1: Compute the Most Popular Start Hour

* Use pandas to load chicago.csv as a dataframe 
* Find the most frequent hour when people start travelling
    * create an hour column by extracting from 'Start Time' column
    * use mode to find the most common value 

In [15]:
filename = 'chicago.csv'

# load data file into a dataframe
df = pd.read_csv(filename)

# convert the Start Time column to datetime so the .dt accessor can be used on it
df['Start Time'] =pd.to_datetime(df['Start Time']) 

# extract the hour of day from the Start Time column to create an hour column
df['hour'] = df['Start Time'].dt.hour

# find the most common hour (from 0 to 23)
popular_hour = df['hour'].mode()[0]  # mode() returns a Series of the most frequent value(s); [0] takes the first
popular_hour  # last expression of the cell, so the notebook displays it

# alternative that prints a labelled result instead of displaying the bare value:
# print('Most Frequent Start Hour:', popular_hour)

17

## Practice 2: Display a breakdown of user types 

Count how many trips belong to each user type in the 'User Type' column, and store the counts as a pandas Series in the `user_types` variable.

In [17]:
filename = 'chicago.csv'

# load data file into a dataframe
df = pd.read_csv(filename) 

# count how many trips were made by each user type (a pandas Series indexed by user type)
user_types = df['User Type'].value_counts()

# print the per-type counts
print(user_types)

Subscriber    238889
Customer       61110
Dependent          1
Name: User Type, dtype: int64


## Practice 3: Load and filter the dataset

Load data, create month and day_of_week, filter by month, filter by day of week 

In [43]:
import pandas as pd

# Maps the city name accepted by load_data to the CSV file holding that city's trips.
CITY_DATA = { 'chicago': 'chicago.csv',
              'new york city': 'new_york_city.csv',
              'washington': 'washington.csv' }

def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    The month and day arguments are matched case-insensitively and surrounding
    whitespace is ignored, so 'March', 'march' and ' MARCH ' are equivalent.

    Args:
        (str) city - name of the city to analyze (must be a key of CITY_DATA)
        (str) month - name of the month to filter by (january through june),
                      or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day,
             with the added columns 'month' (int, 1 = January) and
             'day_of_week' (English day name, e.g. 'Friday')
    Raises:
        KeyError - if city is not a key of CITY_DATA
        ValueError - if month is neither "all" nor one of the supported month names
    """
    # only these months can be selected; the position in the list gives the month number
    months = ['january', 'february', 'march', 'april', 'may', 'june']

    # normalize the user-supplied filters once so every comparison below is case-insensitive
    month_key = month.strip().lower()
    day_key = day.strip().lower()

    # load data file into a dataframe
    df = pd.read_csv(CITY_DATA[city])

    # convert the Start Time column to datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])

    # extract month and day of week from Start Time to create new columns
    df['month'] = df['Start Time'].dt.month
    df['day_of_week'] = df['Start Time'].dt.day_name()

    # filter by month if applicable
    if month_key != 'all':
        if month_key not in months:
            raise ValueError(
                "month must be 'all' or one of {}; got {!r}".format(months, month)
            )
        # use the index of the months list to get the corresponding int
        month_number = months.index(month_key) + 1

        # filter by month to create the new dataframe
        df = df[df['month'] == month_number]

    # filter by day of week if applicable
    if day_key != 'all':
        # day_name() yields title-cased names ('Friday'), so compare in title case
        df = df[df['day_of_week'] == day_key.title()]

    return df

# load the Chicago trips, keeping only those whose Start Time falls on a Friday in March
df = load_data('chicago', 'march', 'friday')
df.head()  # preview the first rows of the filtered frame


Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,month,day_of_week
37,395803,2017-03-24 15:35:55,2017-03-24 15:46:10,615,Dearborn St & Erie St,State St & Van Buren St,Subscriber,Male,1989.0,3,Friday
93,395735,2017-03-24 15:32:04,2017-03-24 15:52:53,1249,Sedgwick St & Webster Ave,Western Ave & Winnebago Ave,Subscriber,Female,1964.0,3,Friday
175,395402,2017-03-24 15:10:29,2017-03-24 15:19:44,555,Franklin St & Monroe St,Aberdeen St & Monroe St,Subscriber,Male,1987.0,3,Friday
190,393400,2017-03-24 12:29:30,2017-03-24 12:48:56,1166,Southport Ave & Wellington Ave,Lake Shore Dr & North Blvd,Subscriber,Female,1984.0,3,Friday
198,427496,2017-03-31 08:25:53,2017-03-31 08:39:09,796,Clinton St & Jackson Blvd,Racine Ave (May St) & Fulton St,Subscriber,Male,1983.0,3,Friday


## Month distribution

In [9]:
# view month distribution in chicago.csv 

import pandas as pd

def month_distribution(filename):
    """
    Counts how many trips start in each calendar month.

    Args:
        (str) filename - path of the city's CSV file; it must have a 'Start Time' column
    Returns:
        Pandas Series of row counts per month number (1 = January),
        ordered from the most to the least frequent month
    """
    trips = pd.read_csv(filename)

    # derive the month number from the start timestamps and label the series 'month'
    start_months = pd.to_datetime(trips['Start Time']).dt.month.rename('month')

    return start_months.value_counts()

# Show how many Chicago trips started in each month.
# "\n" (backslash) adds a blank line after the heading; the original "/n" was printed literally.
print("chicago's month distribution\n")
month_distribution('chicago.csv')

# The same check for the other cities (uncomment to run):
# print("new york's month dist\n")
# month_distribution('new_york_city.csv')

# print("washington's month dist\n")
# month_distribution('washington.csv')


chicago's month distribution /n


6    98081
5    66755
4    51659
2    32057
3    29639
1    21809
Name: month, dtype: int64

# Jupyter Notebook Tips 

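Keyboard shortcuts (the single-key ones work in command mode; press Esc first):

- M to turn the cell into a markdown cell
- DD to delete the cell
- A / B to add a new cell above / below
- Ctrl+Enter to run the cell
- Enter to edit the cell
- Ctrl + / (forward slash) to comment out the selected lines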
