## Polynomial functions

In [276]:
def getPoly(X, Y, degree):
    '''
    Calculates least squares polynomial fit of 'degree' of the fitting polynomial
    
    Parameters
    ----------
    X : array-like, numeric or of a numpy `datetime64` dtype (any unit),
        e.g. a `pd.Series` of dates. Dates are converted to matplotlib
        date numbers before fitting.
    Y : array-like of X size
    degree : degree of the fitting polynomial, converted with `int()`
    
    Returns
    -------
    p : `numpy.poly1d` object built from the fitted coefficients
        (highest power first)
    '''
    import numpy as np
    import warnings

    #NumPy 2.0 exposes the warning as np.exceptions.RankWarning and no longer
    #as np.RankWarning; older releases only have the top-level name
    try:
        rank_warning = np.exceptions.RankWarning
    except AttributeError:
        rank_warning = np.RankWarning

    #convert dates to num values for poly function
    #any datetime64 unit is accepted, not only [ns]; inputs without a numpy
    #dtype (plain lists, extension arrays) are passed through unchanged
    x_dtype = getattr(X, 'dtype', None)
    if isinstance(x_dtype, np.dtype) and np.issubdtype(x_dtype, np.datetime64):
        #imported lazily so purely numeric fits do not require matplotlib
        import matplotlib.dates as mdates
        X_num = mdates.date2num(X)
    else:
        X_num = X

    #calculate Polynomial coefficients, highest power first
    #ndarray, shape (deg + 1,) or (deg + 1, K)
    with warnings.catch_warnings():
        #silence the rank warning for this fit only instead of changing the
        #process-wide warning filters on every call
        warnings.simplefilter('ignore', rank_warning)
        coefs = np.polyfit(X_num, Y, int(degree))

    #Construct the polynomial
    return np.poly1d(coefs)

def plotPoly(X, Y, p, show=True,x_label=None,y_label='',title='',Mtick=True,lw=2):
    '''
    Creates a Polynomial plot: the actual points plus the fitted model curve
    
    Parameters
    ----------
    X : array-like, numeric or of a numpy `datetime64` dtype (any unit)
    Y :  array-like of X size
    p : `numpy.poly1d` object (any callable evaluating the model at the
        numeric X values works)
    show : boolean, display figure at the end of function if True,
           otherwise the figure is closed before it is returned
    x_label, y_label, title : text for axis labels and plot title;
        ' Polynomial Regression' is appended to the title
    Mtick : Million tick, if True shows Y ticks in millions (value/1e6)
            and appends ' , Million' to the Y label
    lw : line width of the 'Actual' series
    
    Returns
    -------
    f : `matplotlib.figure.Figure`
    
    Notes
    -----
    `plt.style.use` is called, so the selected style stays active for
    figures created afterwards.
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import matplotlib.ticker as ticker

    #build the plot
    #Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
    #the old names stopped working in later releases; use whichever exists
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    f, ax = plt.subplots(figsize=(10,5))
    
    #set y axis scale to million
    if Mtick:
        scale_y = 1e6
        ticks_y = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x/scale_y))
        ax.yaxis.set_major_formatter(ticks_y)
        y_label = y_label + ' , Million'
    
    #convert dates to num values for poly function
    #any datetime64 unit is accepted, not only [ns]; inputs without a numpy
    #dtype are used as they are
    x_dtype = getattr(X, 'dtype', None)
    if isinstance(x_dtype, np.dtype) and np.issubdtype(x_dtype, np.datetime64):
        X_num = mdates.date2num(X)
    else:
        X_num = X
    
    #draw on the axes created above rather than on pyplot's implicit current axes
    ax.plot(X, Y, label='Actual', lw=lw, marker='o')
    ax.plot(X, p(X_num), "r-", label='Model') #p(X_num) evaluates the polynomial at X
    
    #clip the x axis to the data range; skipped for empty input where
    #min()/max() would raise
    if np.size(X_num) > 0:
        ax.set_xlim(min(X_num), max(X_num))
    
    ax.set_title(title+' Polynomial Regression', weight='bold')
    ax.set_ylabel(y_label, weight='bold')
    ax.set_xlabel(x_label, weight='bold')
    ax.legend()
    
    if show:
        plt.show()
    else:
        #closing releases the pyplot reference; the returned figure can still be saved
        plt.close(f)
    return f

def calcPoly(df,X='Date',degree=3,Mtick=False,lw=2):
    '''
    Returns a DF with calculated polynomial coeffs of every column against X
    
    For each degree from 2 up to `degree` and each column other than `X`,
    fits the column against `X` with `getPoly`, draws the fit with `plotPoly`
    and saves the figure as
    'results/plots/<column>_polynomial_x<degree>.png'.
    
    Parameters
    ----------
    df : Pandas DataFrame holding the `X` column (datetime or numeric);
         other columns should be of `numeric` dtype
    X : name of the independent-variable column, 'Date' by default
    degree : calculate up to degree of power (fits start at degree 2)
    Mtick : Million tick, if True shows Y ticks in millions (value/1e6)
    lw : line width of the 'Actual' series, forwarded to `plotPoly`
    
    Returns
    -------
    Dataframe with one row per '<column>_x_<degree>' fit; column k holds the
    coefficient of X**k, NaN where the fit has no such power
    
    Side effects
    ------------
    Saved PNGs (the 'results/plots/' directory is created if missing)
    '''
    #imported here so the function does not depend on a `pd` defined by
    #another notebook cell
    import os
    import pandas as pd

    plot_dir = 'results/plots/'
    os.makedirs(plot_dir, exist_ok=True)

    #fit every column except the independent variable itself, wherever it
    #sits in the frame (previously the first column was dropped blindly)
    columns = [col for col in df.columns if col != X]
    
    #collect the coefficient vectors and build the frame once at the end
    #instead of re-concatenating a growing DataFrame on every iteration
    coef_columns = {}
    
    #`deg` keeps the loop from shadowing the `degree` parameter
    for deg in range(2, int(degree)+1):
        for data in columns:
            #rows missing either value cannot take part in the fit
            temp = df[[X,data]].dropna(how='any')

            p = getPoly(temp[X], temp[data], deg)

            #reverse order of poly so column of DF represent power of X
            coef_columns[f'{data}_x_{deg}'] = pd.Series(p.coef[::-1])

            f = plotPoly(temp[X].values, temp[data], p, show=False,
                         x_label='Timeline', y_label=data,
                         title=f'{data}, x{deg}', Mtick=Mtick, lw=lw)
            path_to_plot = f'{plot_dir}{data}_polynomial_x{deg}.png'
            f.savefig(path_to_plot,dpi=150,transparent=True,bbox_inches='tight') 
    
    #Series of different lengths are aligned on their index, so lower-degree
    #fits get NaN in the missing higher-power slots
    return pd.DataFrame(coef_columns).T

def calcPolyY(df,Y,degree=3,Mtick=False,lw=2):
    '''
    Returns a DF with calculated polynomial coeffs for dependant Variable Y
    
    For each degree from 2 up to `degree` and each column other than `Y`,
    fits `Y` against that column with `getPoly`, draws the fit with
    `plotPoly` and saves the figure as
    'results/plots/<column>_<Y>_polynomial_x<degree>.png'.
    
    Parameters
    ----------
    df : Pandas DataFrame; every column other than `Y` is used in turn as
         the independent variable (datetime or `numeric` dtype)
    Y : name of the dependant variable column
    degree : calculate up to degree of power (fits start at degree 2)
    Mtick : Million tick, if True shows Y ticks in millions (value/1e6)
    lw : line width of the 'Actual' series, forwarded to `plotPoly`
    
    Returns
    -------
    Dataframe with one row per '<column>_<Y>_x_<degree>' fit; column k holds
    the coefficient of X**k, NaN where the fit has no such power
    
    Side effects
    ------------
    Saved PNGs (the 'results/plots/' directory is created if missing)
    '''
    #imported here so the function does not depend on a `pd` defined by
    #another notebook cell
    import os
    import pandas as pd

    plot_dir = 'results/plots/'
    os.makedirs(plot_dir, exist_ok=True)

    #set column names for the plot excluding Y column 
    columns = df.drop(columns=Y).columns.tolist()
    
    #collect the coefficient vectors and build the frame once at the end
    #instead of re-concatenating a growing DataFrame on every iteration
    coef_columns = {}
    
    #`deg` keeps the loop from shadowing the `degree` parameter
    for deg in range(2, int(degree)+1):
        for X in columns:
            #rows missing either value cannot take part in the fit
            temp = df[[X,Y]].dropna(how='any')

            p = getPoly(temp[X], temp[Y], deg)

            #reverse order of poly so column of DF represent power of X
            coef_columns[f'{X}_{Y}_x_{deg}'] = pd.Series(p.coef[::-1])

            f = plotPoly(temp[X].values, temp[Y], p, show=False,
                         x_label=X, y_label=Y,
                         title=f'{X}/{Y}, x{deg}', Mtick=Mtick, lw=lw)
            path_to_plot = f'{plot_dir}{X}_{Y}_polynomial_x{deg}.png'
            f.savefig(path_to_plot,dpi=150,transparent=True,bbox_inches='tight') 
    
    #Series of different lengths are aligned on their index, so lower-degree
    #fits get NaN in the missing higher-power slots
    return pd.DataFrame(coef_columns).T

Polynomial fits with all the metro KPIs

In [31]:
#uncomment when will have latest version of the functions
#from polynomial import getPoly
#from polynomial import plotPoly
#from polynomial import calcPoly
import matplotlib.pyplot as plt
import pandas as pd

In [246]:
#load the metro KPI table used for the coefficient calculations
path = 'results/metro_kpi.csv'

#keep only the columns used by the fits; .copy() makes the subset an
#independent frame so the column assignments below do not raise
#SettingWithCopyWarning
metro_kpi = pd.read_csv(path)[['Date', 'ROTP', 'RailReliability', 'MetroAccessOTP',
       'EscalatorAvail', 'ElevatorAvail', 'TotalInjuries', 'Crime', 'Metro']].copy()

metro_kpi['Date'] = pd.to_datetime(metro_kpi['Date'])

#convert % to float; vectorised so a missing value stays NaN instead of
#failing on x.strip
metro_kpi['ElevatorAvail'] = metro_kpi['ElevatorAvail'].str.strip('%').astype(float) / 100

metro_kpi.head()

Unnamed: 0,Date,ROTP,RailReliability,MetroAccessOTP,EscalatorAvail,ElevatorAvail,TotalInjuries,Crime,Metro
0,2011-01-01,0.879,48241,0.901,0.888,0.963,2.08,6.39,21082553
1,2011-02-01,0.887,37703,0.89,0.866,0.96,1.66,4.68,21228262
2,2011-03-01,0.91,50328,0.913,0.869,0.969,2.16,3.96,26170157
3,2011-04-01,0.909,39302,0.912,0.862,0.964,2.21,4.72,25656797
4,2011-05-01,0.909,37355,0.922,0.825,0.974,1.69,7.32,24342603


X: all KPIs, Y: Ridership

In [267]:
#create all the graphs and combined coeffs table for metro_kpi
kpi = calcPolyY(metro_kpi,'Metro',lw=0)
kpi.to_csv('results/metro_kpi_polyY_coefs.csv')
kpi

Unnamed: 0,0,1,2,3
Date_Metro_x_2,-405301700000.0,1104677.0,-0.7526739,
ROTP_Metro_x_2,29390100.0,-40958110.0,37218160.0,
RailReliability_Metro_x_2,27595670.0,-128.3823,0.0005319427,
MetroAccessOTP_Metro_x_2,-398686200.0,898064700.0,-478949400.0,
EscalatorAvail_Metro_x_2,-192305400.0,525094700.0,-317591900.0,
ElevatorAvail_Metro_x_2,6359250000.0,-13079630000.0,6748288000.0,
TotalInjuries_Metro_x_2,24260530.0,-1545259.0,132970.8,
Crime_Metro_x_2,17231000.0,1193201.0,-60616.66,
Date_Metro_x_3,18326610000000.0,-75310570.0,103.1571,-4.709894e-05
ROTP_Metro_x_3,-274662800.0,1144517000.0,-1489055000.0,649191500.0


X: Date, Y: all KPIs

In [268]:
#create all the graphs and combined coeffs table for metro_kpi
kpi = calcPoly(metro_kpi,'Date')
kpi.to_csv('results/metro_kpi_poly_coefs.csv')
kpi

Unnamed: 0,0,1,2,3
ROTP_x_2,-15260.05,0.04158172,-2.832453e-08,
RailReliability_x_2,1066577000.0,-2918.432,0.001996436,
MetroAccessOTP_x_2,-5930.024,0.01613998,-1.098049e-08,
EscalatorAvail_x_2,-12476.11,0.03389641,-2.302159e-08,
ElevatorAvail_x_2,-28.61333,7.833497e-05,-5.182301e-11,
TotalInjuries_x_2,-32555.87,0.08842777,-6.004279e-08,
Crime_x_2,-289938.7,0.7892631,-5.371164e-07,
Metro_x_2,-405301700000.0,1104677.0,-0.7526739,
ROTP_x_3,-46531890.0,189.8022,-0.0002580656,1.1696e-10
RailReliability_x_3,-5463743000000.0,22290310.0,-30.31242,1.374055e-05


Polynomial fits with all the combined ridership data

In [188]:
#load the combined ridership table and parse its Date column in one chain
path = 'results/combined.csv'
combined_df = pd.read_csv(path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

#preview the first rows
combined_df.head()

Unnamed: 0,Date,Bus,Metro,Taxi,Uber
0,2011-01-01,,21082553.0,,
1,2011-02-01,,21228262.0,,
2,2011-03-01,,26170157.0,,
3,2011-04-01,,25656797.0,,
4,2011-05-01,,24342603.0,,


In [190]:
#create all the graphs and combined coeffs table for combined data
combined = calcPoly(combined_df,Mtick=True)
combined.to_csv('results/combined_poly_coefs.csv')

X: all KPIs, Y: Uber

In [278]:
#pair Uber ridership with the metro KPIs by Date; the outer join keeps
#unmatched rows, which calcPolyY drops per column pair before fitting
uber_kpi = combined_df[['Date','Uber']].merge(
    metro_kpi.drop(columns='Metro'), on='Date', how='outer')

#a bare `uber_kpi` line used to sit here; only the last expression of a cell
#is displayed, so it had no effect and was removed

#fit Uber against every KPI, save the plots and the coefficient table
kpi = calcPolyY(uber_kpi,'Uber',lw=0,Mtick=True)
kpi.to_csv('results/uber_kpi_polyY_coefs.csv')

X: all KPIs, Y: Taxi

In [280]:
#pair Taxi ridership with the metro KPIs by Date; the outer join keeps
#unmatched rows, which calcPolyY drops per column pair before fitting
taxi_kpi = combined_df[['Date','Taxi']].merge(
    metro_kpi.drop(columns='Metro'), on='Date', how='outer')

#a bare `taxi_kpi` line used to sit here; only the last expression of a cell
#is displayed, so it had no effect and was removed

#fit Taxi against every KPI, save the plots and the coefficient table
kpi = calcPolyY(taxi_kpi,'Taxi',lw=0,Mtick=True)
kpi.to_csv('results/taxi_kpi_polyY_coefs.csv')

X: all KPIs, Y: Bus

In [281]:
#pair Bus ridership with the metro KPIs by Date; the outer join keeps
#unmatched rows, which calcPolyY drops per column pair before fitting
bus_kpi = combined_df[['Date','Bus']].merge(
    metro_kpi.drop(columns='Metro'), on='Date', how='outer')

#a bare `bus_kpi` line used to sit here; only the last expression of a cell
#is displayed, so it had no effect and was removed

#fit Bus against every KPI, save the plots and the coefficient table
kpi = calcPolyY(bus_kpi,'Bus',lw=0,Mtick=True)
kpi.to_csv('results/bus_kpi_polyY_coefs.csv')