In [5]:
import numpy as np
import pandas as pd
import pickle
import datetime
import time
import matplotlib.pyplot as plt

In [6]:
mta_turn_all = pd.read_csv('turnstile_data.csv', low_memory=False)

In [7]:
mta_turn_all.rename(columns={mta_turn_all.columns.values[-1]: 'EXITS'}, inplace=True)

In [8]:
mta_turn_all.shape

(3548154, 11)

In [10]:
'''
Remove duplicate headers and pickle the file
'''

def remove_headers(df, header1, save_file):
    if save_file == True:
        drop_rows = [index for index, val in df.iterrows() if val[header1]==header1]
        df.drop(drop_rows, inplace=True)
        # open a file, where you ant to store the data
        file = open('raw_clean', 'wb')
        # dump information to that file
        pickle.dump(df, file)
        # close the file
        file.close()
    else:
        # open a file, where you stored the pickled data
        file = open('raw_clean', 'rb')
        # dump information to that file
        df = pickle.load(file)
        # close the file
        file.close()
    return df


'''
Make a dict that maps times to an integer mapping of time of the day (or standardized time of the day).
We can remove the non-whole number times or we will end up double counting some entries and exits.
'''

def standard_time(time_str_in):
    time_time = time.strptime(time_str_in, '%H:%M:%S')
    if (time_time.tm_min != 0) or (time_time.tm_sec != 0):
        return np.nan
    else:
        # Logic for standardizing the times to be generally consistent and comparable
        time1 = time.strptime('00:00:00', '%H:%M:%S')
        time2 = time.strptime('04:00:00', '%H:%M:%S')
        time3 = time.strptime('08:00:00', '%H:%M:%S')
        time4 = time.strptime('12:00:00', '%H:%M:%S')
        time5 = time.strptime('16:00:00', '%H:%M:%S')
        time6 = time.strptime('20:00:00', '%H:%M:%S')
        if (time_time >= time1) and (time_time < time2):
            time_str_out = '02:00:00'
        elif (time_time >= time2) and (time_time < time3):
            time_str_out = '06:00:00'
        elif (time_time >= time3) and (time_time < time4):
            time_str_out = '10:00:00'
        elif (time_time >= time4) and (time_time < time5):
            time_str_out = '14:00:00'
        elif (time_time >= time5) and (time_time < time6):
            time_str_out = '18:00:00'
        else:
            time_str_out = '22:00:00'
        return time_str_out

In [11]:
turn_df = remove_headers(mta_turn_all, 'C/A', save_file=True)
turn_df.shape

(3548137, 11)

In [12]:
'''
Cleaning the data
'''
turn_df['entries_int'] = pd.to_numeric(turn_df['ENTRIES'])
turn_df['exits_int'] = pd.to_numeric(turn_df['EXITS'])

In [13]:
turn_df['standard_time'] = turn_df['TIME'].apply(standard_time)

In [14]:
save_clean = True

if save_clean:
    # open a file, where you want to store the data
    file = open('raw_mod', 'wb')
    # dump information to that file
    pickle.dump(turn_df, file)
    # close the file
    file.close()
else:
    # open a file, where you stored the pickled data
    file = open('raw_mod', 'rb')
    # dump information to that file
    turn_df = pickle.load(file)
    # close the file
    file.close()

In [15]:
turn_df.dropna(inplace=True)

In [32]:
def get_day_of_week(date):
    return datetime.datetime.strptime(date, '%m/%d/%Y').weekday()

def get_month(date):
    return datetime.datetime.strptime(date, '%m/%d/%Y').month

def get_day(date):
    return datetime.datetime.strptime(date, '%m/%d/%Y').day

def get_week(date):
    return datetime.datetime.strptime(date, '%m/%d/%Y').isocalendar()[1]

# Here is the fix: added LINENAME to groupby
station_df = turn_df.groupby(by=['STATION', 'LINENAME', 'DATE','standard_time']).sum()
station_df['entries_diff'] = station_df['entries_int'].diff()
station_df['exits_diff'] = station_df['exits_int'].diff()
station_df.reset_index(inplace=True)

In [33]:
# Combine station and line into a unique ID
station_df.rename(columns={'STATION': 'station_old'}, inplace=True)
station_df['STATION'] = station_df['station_old'] + ' & ' + station_df['LINENAME']

In [34]:
'''
NaN out the rows where the station is changing as those values do not make sense
'''
station_chg_index = [i+1 for i, val in enumerate(station_df['STATION'][1:]) if val != station_df['STATION'][i]]
for i in station_chg_index:
    station_df.loc[i, 'entries_diff'] = np.nan
    station_df.loc[i, 'exits_diff'] = np.nan

In [35]:
station_df['day_of_week'] = station_df['DATE'].apply(get_day_of_week)
station_df['month'] = station_df['DATE'].apply(get_month)
station_df['day'] = station_df['DATE'].apply(get_day)
station_df['week'] = station_df['DATE'].apply(get_week)

In [36]:
# Drop nan that result from station transition
station_df.dropna(inplace=True)

In [38]:
# Remove negatives and the row after as well (because there is usually a reversal)

def clear_outliers(df, field):
    original_len = df.shape[0]
    clear_list = list(df[df[field] < 0].index)
    clear_list_plus1 = [i+1 for i in clear_list]
    clear_list.append(clear_list_plus1)
    for i in clear_list:
        if i in list(df.index):
            df.loc[i, field] = np.nan
    df.dropna(inplace=True)
    print(str(len(clear_list)) + ' rows dropped from df out of ' + str(original_len) + ' total')
    return df, len(clear_list)
    
station_df, rows_dropped_entries = clear_outliers(station_df, 'entries_diff')
station_df, rows_dropped_entries = clear_outliers(station_df, 'exits_diff')

1970 rows dropped from df out of 339336 total
499 rows dropped from df out of 337367 total


In [39]:
station_df['total_traffic'] = station_df['entries_diff'] + station_df['exits_diff']
grp_station_weekday = station_df.groupby(by=['STATION','day_of_week']).mean()['total_traffic']
grp_station = station_df.groupby(by=['STATION']).mean()['total_traffic']

In [41]:
grp_station.sort_values(ascending=False)[0:20]

STATION
42 ST-PORT AUTH & ACENQRS1237W    2.792975e+07
3 AV-149 ST & 25                  2.450819e+07
72 ST & 123                       2.266953e+07
CANAL ST & JNQRZ6W                1.286192e+07
34 ST-HERALD SQ & BDFMNQRW        1.243992e+07
125 ST & 456                      1.141896e+07
BROOKLYN BRIDGE & 456JZ           1.125392e+07
CHAMBERS ST & JZ456               1.099157e+07
190 ST & A                        1.011854e+07
1 AV & L                          1.004532e+07
COURT SQ & EMG                    9.003206e+06
BAY PKWY & N                      8.706366e+06
125 ST & 23                       8.534066e+06
59 ST & NRW                       7.562403e+06
18 AV & F                         7.522464e+06
183 ST & 4                        7.517503e+06
KINGS HWY & BQ                    7.088618e+06
225 ST & 25                       7.020078e+06
FLATBUSH AV-B.C & 25              6.861365e+06
FRANKLIN AV & 2345S               6.744224e+06
Name: total_traffic, dtype: float64

In [42]:
save_clean = True

if save_clean:
    # open a file, where you want to store the data
    file = open('final', 'wb')
    # dump information to that file
    pickle.dump(station_df, file)
    # close the file
    file.close()
else:
    # open a file, where you stored the pickled data
    file = open('final', 'rb')
    # dump information to that file
    station_df = pickle.load(file)
    # close the file
    file.close()