In [1]:
import pandas as pd
import numpy as np

## Questions on NY Subway Dataset

1. Does weather vary systematically with longitude, latitude, or hour of day? (Compare longitude and latitude using mean values, and hour of day using individual entries.)
2. How does the number of turnstile entries / exits vary with the hour of day, with whether or not it is a weekday, and with the day of the week?
3. Does the number of turnstile entries equal the number of turnstile exits?
4. Which days are outliers for turnstile entries and exits?
5. Correlations between weather variables and turnstile entries / exits?


## Pearson's correlation coefficient r

In [11]:
def correlation(x, y):
    '''
    Compute Pearson's correlation coefficient r between two variables.

    Each input is either a NumPy array or a Pandas Series.

    Both inputs are converted to standard units (distance from the mean,
    measured in population standard deviations) and r is the average of
    the element-wise product of the two standardised inputs.
    '''
    def to_standard_units(values):
        # ddof=0 is passed explicitly so that the Pandas std() uses the
        # population standard deviation rather than its sample default.
        return (values - values.mean()) / values.std(ddof=0)

    return (to_standard_units(x) * to_standard_units(y)).mean()

In [3]:
# Sample inputs for correlation(): y is x plus a constant, i.e. the two
# Series are perfectly linearly related.
x = pd.Series([1, 2, 3, 4])
y = pd.Series([10, 11, 12, 13])


In [12]:
correlation(x,y)

1.0

## DataFrame Vectorised Operations

In [26]:
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Convert cumulative entry/exit counts into per-row (hourly) counts.

    Takes a DataFrame with cumulative entries and exits (entries in the
    first column, exits in the second) and returns a DataFrame with the
    same layout in which every row holds the increase over the row
    before it.

    A missing previous value is treated as 0. In particular the first
    row, which has no predecessor, keeps its cumulative value unchanged.
    '''
    previous_counts = entries_and_exits.shift(1)
    # shift() produces NaN where there is no earlier row; use 0 as the
    # baseline there so the subtraction yields a number instead of NaN.
    baseline = previous_counts.fillna(0)
    return entries_and_exits.sub(baseline)

In [14]:
# Small example of cumulative counter readings (one row per reading) used
# to try out get_hourly_entries_and_exits().
entries_and_exits = pd.DataFrame(
    {'ENTRIESn': [10, 40, 60, 65, 85], 'EXITSn': [0, 10, 20, 60, 60]},
    index=[0, 1, 2, 3, 4]
)

In [18]:
entries_and_exits

Unnamed: 0,ENTRIESn,EXITSn
0,10,0
1,40,10
2,60,20
3,65,60
4,85,60


In [27]:
get_hourly_entries_and_exits(entries_and_exits)

Unnamed: 0,ENTRIESn,EXITSn
0,10.0,0.0
1,30.0,10.0
2,20.0,10.0
3,5.0,40.0
4,20.0,0.0


## DataFrame applymap()

In [29]:
# Example numerical grades (two identical columns), one value per letter
# band, used to exercise the letter-grade conversion.
grades = pd.DataFrame(
    {0: [95, 85, 75, 65, 55], 1: [95, 85, 75, 65, 55]},
    index=[0, 1, 2, 3, 4])

In [35]:
'''
    Convert numerical grades into letter grades.
    Conversion rule:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
'''
def convert_grade(grade):
    '''
    Convert a single numerical grade into a letter grade.

    Grades above 100 give 'NA'; 90-100 give 'A', 80-89 'B', 70-79 'C',
    60-69 'D' and anything below 60 'F'. A value that satisfies none of
    these comparisons (e.g. NaN) gives None.
    '''
    if grade > 100:
        return 'NA'
    # Lower bounds checked from the highest band down; the first band
    # whose floor the grade reaches determines the letter.
    for floor, letter in ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')):
        if grade >= floor:
            return letter
    if grade < 60:
        return 'F'
    return None
convert_grade(85)

'B'

In [38]:
# Convert the given DataFrame of numerical grades to letter grades. 
# Conversion rule stated in the previous cell.
def convert_grades(grades):
    '''
    Convert a DataFrame of numerical grades to letter grades.

    Conversion rule, applied to every element:
        above 100 -> NA
        90-100    -> A
        80-89     -> B
        70-79     -> C
        60-69     -> D
        below 60  -> F
    A value that satisfies none of the comparisons (e.g. NaN) becomes None.

    Returns a new DataFrame with the converted grades; the input is not
    modified.
    '''
    def convert_grade(grade):
        # The checks run top-down, so each branch only needs a lower bound.
        if grade > 100:
            return 'NA'
        elif 90 <= grade:
            return 'A'
        elif 80 <= grade:
            return 'B'
        elif 70 <= grade:
            return 'C'
        elif 60 <= grade:
            return 'D'
        elif grade < 60:
            return 'F'

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map and emits a FutureWarning there. Use the new name when
    # the installed pandas has it and fall back to applymap otherwise.
    # The class is inspected (not the instance) so a column that happens
    # to be named "map" cannot be mistaken for the method.
    if hasattr(type(grades), 'map'):
        return grades.map(convert_grade)
    return grades.applymap(convert_grade)

In [39]:
convert_grades(grades)

Unnamed: 0,0,1
0,A,A
1,B,B
2,C,C
3,D,D
4,F,F


In [47]:
# Standardise each column of the input Dataframe
# i.e. convert each value to the number of standard deviations it is above or below the mean.
def standardize(df):
    '''
    Standardise each column of the input DataFrame.

    Every value is replaced by the number of standard deviations it lies
    above or below its column's mean. The mean and standard deviation
    are computed on the column's underlying array (col.values).
    '''
    def to_z_scores(column):
        raw = column.values
        centred = column - raw.mean()
        return centred / raw.std()

    return df.apply(to_z_scores)

In [59]:
# Small example DataFrame with two identical columns; the bare `df` on the
# last line displays it as the cell output.
df = pd.DataFrame(
    {0: [95, 85, 75, 65, 55], 1: [95, 85, 75, 65, 55]},
    index=[0, 1, 2, 3, 4]
)
df

Unnamed: 0,0,1
0,95,95
1,85,85
2,75,75
3,65,65
4,55,55


In [48]:
standardize(df)

Unnamed: 0,0,1
0,1.414214,1.414214
1,0.707107,0.707107
2,0.0,0.0
3,-0.707107,-0.707107
4,-1.414214,-1.414214


In [63]:
# Returns the second-largest value of each column of the input DataFrame
def second_largest(df):
    '''
    Return the second-largest value of each column of the input DataFrame.

    Each column is sorted in descending order and the value in the second
    position is taken, so a column whose maximum occurs twice yields that
    maximum again.

    Returns a Series indexed by the column labels of df.
    Raises IndexError for a column with fewer than two rows.
    '''
    def sec_largest(col):
        ranked = col.sort_values(ascending=False)
        # sort_values keeps the original index labels, so ranked[1] would
        # fetch the row *labelled* 1 rather than the second row. Select by
        # position instead.
        return ranked.iloc[1]
    return df.apply(sec_largest)

In [64]:
second_largest(df)

0    85
1    85
dtype: int64