In [5]:
import pandas as pd

def clean_data(df):

    import pandas as pd
    import datetime as dt
    import numpy as np
   
    # rename columns to get rid of the funny 'EXITS     ' issue
    df.columns = ['C/A','UNIT','SCP','STATION','LINENAME',\
                  'DIVISION','DATE','TIME','DESC','ENTRIES','EXITS']
    
    # remove index 180052 (project technology error)
    df.drop(df.index[180052])
    
    # convert to datetime/make turnstile column
    df['converted_time'] = pd.to_datetime(df['DATE']+' '+df['TIME'])
    df['turnstiles'] = df['C/A'] + '-' + df['UNIT'] + '-' + df['SCP'] + '-' + df['STATION']
    
    # sort by date and location
    df_sorted = df.sort_values(['turnstiles', 'converted_time'])
    
    # group by turnstile so we can get entry/exit differences
    turnstile_df = df_sorted.groupby('turnstiles')
    df_sorted['entries_diff'] = turnstile_df['ENTRIES'].diff()
    df_sorted['exits_diff'] = turnstile_df['EXITS'].diff()
    
    # calculates IQR for entries_diff
    Q3 = df_sorted['entries_diff'].quantile(0.75) 
    Q1 = df_sorted['entries_diff'].quantile(0.25)
    IQR = Q3 - Q1
    
    # calculates IQR range using outliers 
    IQR_range = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
    
    # removes values outside of lower and upper bounds
    df_sorted = df_sorted[df_sorted['entries_diff'].between(0, IQR_range[1])]
    
    # repeats process for exits_diff
    Q3_2 = df_sorted['exits_diff'].quantile(0.75) 
    Q1_2 = df_sorted['exits_diff'].quantile(0.25)
    IQR_2 = Q3_2 - Q1_2
    IQR_range_2 = (Q1_2 - 1.5 * IQR, Q3_2 + 1.5 * IQR)
    df_sorted = df_sorted[df_sorted['exits_diff'].between(0, IQR_range_2[1])]

    '''
    
    # removes negative values (ONLY NEEDED if no outliers removed)
    df_sorted = df_sorted[df_sorted['entries_diff'].between(0, np.inf)]
    df_sorted = df_sorted[df_sorted['exits_diff'].between(0, np.inf)]
    
    '''
    
    # created new column turnstile_turns with total turnstile interactions per turnstile
    df_sorted['turnstile_turns'] = df_sorted.entries_diff + df_sorted.exits_diff
    df_sorted.turnstile_turns.describe()

    # replaces NaN values with mean for entries_diff and exits_diff
    df_sorted.entries_diff = df_sorted.entries_diff.fillna(df_sorted.entries_diff.mean())
    df_sorted.exits_diff = df_sorted.exits_diff.fillna(df_sorted.exits_diff.mean())

    # provides column day_of_week that designates the day of the week 
    df_sorted['day_of_week']=df_sorted.converted_time.dt.dayofweek
    
    #Create new column to differentiate stations serving different subway lines but with identical names
    df_sorted['station_unique'] = df_sorted['STATION'] + '-' + df_sorted['LINENAME']
    
    return df_sorted

In [5]:
def clean_data_no_outliers(df2):

    import pandas as pd
    import datetime as dt
    import numpy as np
   
    # rename columns to get rid of the funny 'EXITS     ' issue
    df2.columns = ['C/A','UNIT','SCP','STATION','LINENAME',\
                  'DIVISION','DATE','TIME','DESC','ENTRIES','EXITS']
    
    
    
    # convert to datetime/make turnstile column
    df2['converted_time'] = pd.to_datetime(df2['DATE']+' '+df2['TIME'])
    df2['turnstiles'] = df2['C/A'] + '-' + df2['UNIT'] + '-' + df2['SCP'] + '-' + df2['STATION']
    
    # sort by date and location
    df2_sorted = df2.sort_values(['turnstiles', 'converted_time'])
    
    # group by turnstile so we can get entry/exit differences
    turnstile2_df = df2_sorted.groupby('turnstiles')
    df2_sorted['entries_diff'] = turnstile2_df['ENTRIES'].diff()
    df2_sorted['exits_diff'] = turnstile2_df['EXITS'].diff()
    
    
    '''
    # calculates IQR for entries_diff
    Q3 = df_sorted['entries_diff'].quantile(0.75) 
    Q1 = df_sorted['entries_diff'].quantile(0.25)
    IQR = Q3 - Q1
    
    # calculates IQR range using outliers 
    IQR_range = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
    
    # removes values outside of lower and upper bounds
    df_sorted = df_sorted[df_sorted['entries_diff'].between(0, IQR_range[1])]
    
    # repeats process for exits_diff
    Q3_2 = df_sorted['exits_diff'].quantile(0.75) 
    Q1_2 = df_sorted['exits_diff'].quantile(0.25)
    IQR_2 = Q3_2 - Q1_2
    IQR_range_2 = (Q1_2 - 1.5 * IQR, Q3_2 + 1.5 * IQR)
    df_sorted = df_sorted[df_sorted['exits_diff'].between(0, IQR_range_2[1])]

    '''
    
    # removes negative values (ONLY NEEDED if no outliers removed)
    df2_sorted = df2_sorted[df2_sorted['entries_diff'].between(0, np.inf)]
    df2_sorted = df2_sorted[df2_sorted['exits_diff'].between(0, np.inf)]
    
    # created new column turnstile_turns with total turnstile interactions per turnstile
    df2_sorted['turnstile_turns'] = df2_sorted.entries_diff + df2_sorted.exits_diff
    df2_sorted.turnstile_turns.describe()

    # replaces NaN values with mean for entries_diff and exits_diff
    df2_sorted.entries_diff = df2_sorted.entries_diff.fillna(df2_sorted.entries_diff.mean())
    df2_sorted.exits_diff = df2_sorted.exits_diff.fillna(df2_sorted.exits_diff.mean())

    # provides column day_of_week that designates the day of the week 
    df2_sorted['day_of_week']=df2_sorted.converted_time.dt.dayofweek
    
    return df2_sorted

In [11]:
sf = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_190803.txt')
sf = clean_data(sf)

In [15]:
new = sf.sort_values('turnstile_turns',ascending=False)

new[['turnstiles', 'STATION', 'DATE', 'DESC', 'TIME', 'day_of_week','turnstile_turns']].head(50).sort_values('TIME')
                                                                                                             

Unnamed: 0,turnstiles,STATION,DATE,DESC,TIME,day_of_week,turnstile_turns
141204,R151-R033-00-00-06-TIMES SQ-42 ST,TIMES SQ-42 ST,07/29/2019,REGULAR,00:00:00,0,985.0
141165,R151-R033-00-00-05-TIMES SQ-42 ST,TIMES SQ-42 ST,07/30/2019,REGULAR,00:00:00,1,1002.0
3388,A025-R023-01-06-01-34 ST-HERALD SQ,34 ST-HERALD SQ,07/30/2019,REGULAR,00:00:00,1,1002.0
141120,R151-R033-00-00-04-TIMES SQ-42 ST,TIMES SQ-42 ST,07/30/2019,REGULAR,00:00:00,1,975.0
168338,R258-R132-00-03-00-125 ST,125 ST,07/30/2019,REGULAR,09:00:00,1,975.0
168230,R258-R132-00-00-02-125 ST,125 ST,08/02/2019,REGULAR,09:00:00,4,976.0
138563,R138-R293-00-03-02-34 ST-PENN STA,34 ST-PENN STA,08/02/2019,REGULAR,10:00:00,4,976.0
196210,R604-R108-03-00-00-BOROUGH HALL,BOROUGH HALL,08/02/2019,REGULAR,12:00:00,4,983.0
46332,N020-R101-00-00-02-145 ST,145 ST,07/30/2019,REGULAR,12:00:00,1,978.0
4057,A030-R083-01-03-02-23 ST,23 ST,07/29/2019,REGULAR,12:00:00,0,995.0


In [64]:
station_df = no_outliers.groupby('STATION')
station_totals=station_df['turnstile_turns'].sum()

station_totals.sort_values(ascending=False).head(30)

STATION
HUNTS POINT AV     3.083585e+09
GRD CNTRL-42 ST    6.039917e+06
66 ST-LINCOLN      2.093391e+06
34 ST-PENN STA     1.898187e+06
34 ST-HERALD SQ    1.400539e+06
TIMES SQ-42 ST     1.196956e+06
23 ST              1.126268e+06
14 ST-UNION SQ     1.126184e+06
FULTON ST          1.074935e+06
42 ST-PORT AUTH    1.033920e+06
86 ST              9.170790e+05
MORGAN AV          8.788540e+05
125 ST             8.398200e+05
CANAL ST           8.030450e+05
59 ST COLUMBUS     7.660630e+05
47-50 STS ROCK     7.274570e+05
96 ST              7.102940e+05
59 ST              6.815550e+05
FLUSHING-MAIN      6.543050e+05
14 ST              6.229060e+05
PATH NEW WTC       6.013240e+05
CHAMBERS ST        5.391640e+05
50 ST              5.347020e+05
42 ST-BRYANT PK    5.282780e+05
72 ST              5.269290e+05
JKSN HT-ROOSVLT    5.256410e+05
28 ST              5.196520e+05
W 4 ST-WASH SQ     5.000020e+05
ATL AV-BARCLAY     4.767180e+05
WALL ST            4.639800e+05
Name: turnstile_turns, dtype: fl

In [16]:
import datetime as dt
df_sorted['day_of_week']=df_sorted.converted_time.dt.dayofweek

In [56]:
def reverseString(word):
    reversed = word[::-1]
    reversed_word = ""
    
    for i in reversed:
        reversed_word += reversed[i]
        
    return reversed_word

In [57]:
x = reverseString("hello")
print(x)

TypeError: string indices must be integers

In [60]:
# Question 1 - Drew H, Albert L, Nick W 

def reverse_String(s):
    l = [char for char in s]
    reverse = l[::-1]
    
    return "".join(reverse)

# Question 2 - Drew H, Albert L, Nick W



In [61]:
reverse_String("hello")

'olleh'

In [None]:
def CanYouSpell(s,word):
    for letter in word:
        if letter in s:
            return True
        if 