In [1]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import datetime
from bisect import bisect
import pickle
import csv
filename = '/home/williamcottrell72/github/sf18_ds11/class_lectures/week01-benson/02-git_viz/turnstiles_v2.pkl'
from IPython.display import Image

def fix_time(num):
    if len(str(num)) == 2:
        return str(num)
    else:
        return '0'+str(num)
    
def get_week_nums(month,yrs_back):
    week_list=[]
    ref_date=datetime.date(2018,6,30)
    weeks_back=yrs_back*52
    for i in range(weeks_back):
        week_shift=datetime.timedelta(-7*i)
        new=ref_date+week_shift
        yr=str(new.year)[-2:]
        mt=fix_time(new.month)
        day=fix_time(new.day)
        string=yr+mt+day
        if (int(mt)==month) or (int(mt) == (month+1)%12) or (int(mt) == (month-1)%12):
            week_list.append(int(string))
    return week_list

def scrape(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)

def scrape_pkl(week_nums,filename):
    try:
        with open(filename,'rb') as pklfile:
            df = pickle.load(pklfile)
    except:
        df = scrape(week_nums)

        with open(filename,'wb') as pklfile:
            df = pickle.dump(df, pklfile)
    return df

def clean_df(df):
    cols={x:x.strip() for x in df.columns}
    df_small=df.rename(columns=cols)
    df_small['datetime']=pd.to_datetime(df_small['DATE']+' '+df_small['TIME'],infer_datetime_format=True)
    df_small_clean=df_small[df_small.DIVISION!='PTH'].drop_duplicates(subset=['C/A','UNIT','SCP','STATION','LINENAME','datetime'])
    df_small_clean2=df_small_clean.groupby(['STATION','datetime'],as_index=False)[['EXITS']].sum()
    return df_small_clean2

def station_activity(df,station):
    df_station=df[df['STATION']==station]
    df_sort=df_station.sort_values(by=['datetime'])
    return df_sort


def month_filter(df,month):
    return df[df['datetime'].dt.month==month]

def construct_dct(df,mt):
    stations=df['STATION'].unique()
    station_diffs_dct={}
    for st in stations:
        sa=station_activity(df,st)
        sa['diffs']=sa['EXITS'].diff()
        sa_filt=month_filter(sa,mt)
        st_diffs=sa_filt.drop(['EXITS'],axis=1).dropna()
        st_diffs_clean=st_diffs[np.abs(st_diffs.diffs)<10*10**4]        
        st_diffs_clean['weekday']=st_diffs_clean['datetime'].dt.weekday
        st_diffs_clean['hour']=st_diffs_clean['datetime'].dt.hour
        st_diffs_clean2=st_diffs_clean.groupby(['weekday','hour'])['diffs'].mean()
        station_diffs_dct[st]=st_diffs_clean2
    return station_diffs_dct

def find_key(num,hours_list):
    sort=sorted(hours_list)
    pos=bisect(sort,num)
    if pos < len(sort):
        return sort[pos]
    else:
        return sort[0]
    
def activity_by_time(day,hour,dct):
    stations=list(set(dct.keys()))
    exits=[]
    for st in stations:
        try:
            """Below we are assuming that the hour list appearing is consistent for
            a given station. (I take the first element of the list as being represent-
            ative.) Should check this"""
            hours_st=dct[st][0].keys().values
            sh=sorted(hours_st)
            h_key=find_key(hour,sh)
            leaving=dct[st][day][h_key]/4
            """
            Below I apply np.abs to account for the possibility that the turnstile was
            reversed for some period of time.
            """             
            exits.append([st,np.abs(leaving)])
        except(KeyError,IndexError,AttributeError):
            pass
    sort_exits=sorted(exits,key=lambda x: x[1])[::-1]    
    return sort_exits

#We first aggregate the dictionaries which are busy in the afternoon

def make_big_df(dct):
    df_cum=pd.DataFrame(activity_by_time(0,12,dct)[:20])
    for i in range(1,12):
        for day in range(1,7):
            df2=pd.DataFrame(activity_by_time(day,12+i,dct)[:20])
            df_cum=pd.merge(df_cum,df2,on=0,how='outer')
    return df_cum

#clean_df_sum creates a cumulative dictionary of the top stations sorted by exits.

def clean_df_cum(dct):
    df_cum=make_big_df(dct)
    df_fill=df_cum.fillna(0)
    df_fill.insert(1,'sums',df_fill.sum(axis=1)/84)
    fin_result=df_fill[[0,'sums']].sort_values(['sums'],ascending=False).head(20)
    return fin_result.values


def main(month,yrs_back=3):
    week_nums=get_week_nums(month,yrs_back)
    df=scrape_pkl(week_nums,filename)
    df_c=clean_df(df)
    dct=construct_dct(df_c,month)
    return (dct,df)

In [2]:
dct,df = main(6,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
activity_by_time(3,8,dct)

[['34 ST-HERALD SQ', 9567.444444444445],
 ['47-50 STS ROCK', 9261.0],
 ['TIMES SQ-42 ST', 9203.85],
 ['LEXINGTON AV/53', 6073.727272727273],
 ['59 ST', 5491.613636363636],
 ['42 ST-BRYANT PK', 5389.795454545455],
 ['59 ST COLUMBUS', 5251.136363636364],
 ['CHAMBERS ST', 4529.895833333333],
 ['5 AV/53 ST', 4509.204545454545],
 ['14 ST-UNION SQ', 4459.5],
 ['50 ST', 3764.7291666666665],
 ['BOWLING GREEN', 3356.2954545454545],
 ['51 ST', 3004.35],
 ['JAY ST-METROTEC', 2305.5],
 ['49 ST', 2178.9375],
 ['BOROUGH HALL', 2113.85],
 ['BROOKLYN BRIDGE', 2023.4791666666667],
 ['72 ST-2 AVE', 1839.40625],
 ['ATL AV-BARCLAY', 1830.638888888889],
 ['FLUSHING-MAIN', 1737.65],
 ['GRAND ST', 1685.409090909091],
 ['168 ST', 1661.7083333333333],
 ['W 4 ST-WASH SQ', 1628.6875],
 ['68ST-HUNTER CO', 1517.4583333333333],
 ['5 AVE', 1463.1041666666667],
 ['33 ST', 1414.1363636363637],
 ['5 AV/59 ST', 1389.0],
 ['JAMAICA CENTER', 1343.7954545454545],
 ['COURT SQ', 1326.6875],
 ['1 AV', 1282.9545454545455],
 ["

In [4]:
clean_df_cum(dct)

array([['34 ST-HERALD SQ', 3869.36479365542],
       ['TIMES SQ-42 ST', 3236.395844028657],
       ['FLUSHING-MAIN', 2161.029783179783],
       ['JKSN HT-ROOSVLT', 1789.6775725431971],
       ['ATL AV-BARCLAY', 1699.575266804954],
       ['59 ST COLUMBUS', 1537.1845310939063],
       ['BEDFORD AV', 1495.3963182650682],
       ['14 ST-UNION SQ', 1471.6925182291252],
       ['W 4 ST-WASH SQ', 1465.7306126743629],
       ['145 ST', 1245.4798303548303],
       ['JAMAICA CENTER', 1224.4779879842379],
       ['50 ST', 1174.0540352009102],
       ['59 ST', 1094.1550266862766],
       ['72 ST', 1046.5545634920636],
       ['7 AV', 970.8408420283422],
       ['KEW GARDENS', 847.659641053391],
       ["B'WAY-LAFAYETTE", 780.9010957792208],
       ['CROWN HTS-UTICA', 726.4987130924632],
       ['42 ST-BRYANT PK', 676.8053883616384],
       ['JUNCTION BLVD', 568.6583614302365]], dtype=object)