# Project: Taxi Data

# 1) Importing and cleaning the data

In [None]:
import pandas as pd
# Columns are read with 16-bit integer / 32-bit float dtypes rather than the
# 64-bit types pandas would otherwise infer.
int16_columns = ['pickup_weekday', 'pickup_hour', 'passenger_count', 'payment_type']
float32_columns = ['pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude',
                   'trip_distance', 'fare_amount', 'tip_amount', 'tolls_amount']

# Column name -> dtype string, as expected by the `dtype` argument of read_csv.
col_dtypes = {
    **dict.fromkeys(int16_columns, 'int16'),
    **dict.fromkeys(float32_columns, 'float32'),
}

df = pd.read_csv('2016_Yellow_Taxi_prepared.csv', dtype=col_dtypes)

In [None]:
import numpy as np

# A notebook cell only renders its LAST expression, so the missing-value
# counts must be printed explicitly or they are silently discarded.
print(df.isna().sum())

# Summary statistics of every numeric column (rendered as the cell output).
df.describe()

In [None]:
# Bounding box for pick-ups that count as airport (JFK) journeys:
# (minimum, maximum) pickup latitude, then (minimum, maximum) pickup longitude.
jfk_min_lat, jfk_max_lat = 40.62666, 40.66018
jfk_min_long, jfk_max_long = -73.80822, -73.76599

# Bounding box of New York City:
# (minimum, maximum) latitude, then (minimum, maximum) longitude.
nyc_min_lat, nyc_max_lat = 40.5774, 40.9176
nyc_min_long, nyc_max_long = -74.15, -73.7004

# 2) Selecting data

In [None]:
# Each mask tests ONE coordinate of the pick-up point against the JFK bounding box
# (bounds inclusive).
mask_jfk_longitude = (df.loc[:, 'pickup_longitude'] >= jfk_min_long) & (df.loc[:, 'pickup_longitude'] <= jfk_max_long)
mask_jfk_latitude = (df.loc[:, 'pickup_latitude'] >= jfk_min_lat) & (df.loc[:, 'pickup_latitude'] <= jfk_max_lat)

# A journey starts at the airport only if the pick-up lies inside the box on
# BOTH axes; either mask alone also matches a whole band across the city.
mask_jfk = mask_jfk_longitude & mask_jfk_latitude

# Per-coordinate selections (kept under their original names).
df_jfk_lang = df.loc[mask_jfk_longitude, 'pickup_longitude']
df_jfk_lat = df.loc[mask_jfk_latitude, 'pickup_latitude']

# Journeys that actually start inside the airport bounding box.
df_jfk = df.loc[mask_jfk]

# Rows matching the longitude band, rows matching the latitude band.
print(df_jfk_lang.shape[0], df_jfk_lat.shape[0])
print('journeys starting inside the JFK bounding box:', df_jfk.shape[0])

# 3) Proportion of taxis from the airport

In [None]:
# A journey starts at the airport only when the pick-up point lies inside the
# JFK bounding box, i.e. the longitude AND the latitude condition both hold.
# Counting the latitude mask alone would also include every pick-up elsewhere
# that merely shares the airport's latitude band, inflating the proportion.
mask_jfk = mask_jfk_longitude & mask_jfk_latitude

# Share of all journeys whose pick-up is inside the box (a fraction in [0, 1]).
proportion_jfk = int(mask_jfk.sum()) / df.shape[0]

print('proportion of all taxi journeys start at the airport is {} %'.format(proportion_jfk*100))

# 4) Visualizing the starting points

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots()

# One small, mostly transparent dot per pick-up; dense areas show up darker.
sns.scatterplot(data=df, x='pickup_longitude', y='pickup_latitude', alpha=0.1, s=2, ax=ax)

# Crop the view to the New York City bounding box and hide the axes.
ax.set_xlim(nyc_min_long, nyc_max_long)
ax.set_ylim(nyc_min_lat, nyc_max_lat)
ax.axis('off')

# The label is passed positionally: the keyword was renamed from `s` to `text`
# in matplotlib 3.3, so `annotate(s=...)` raises TypeError on current releases,
# while the positional form works on every version.
ann = ax.annotate('JFK airport', xy=[-73.81, 40.63], xytext=[-73.93, 40.6], arrowprops=dict(facecolor='black'))
ax.set(title='Taxi pick-ups in New York');

# 5) Proportion of airport taxis on each day

In [None]:
# Airport journeys must start inside the JFK bounding box, so the longitude and
# the latitude conditions both have to hold (the latitude mask alone also
# matches pick-ups elsewhere in the same latitude band).
mask_jfk = mask_jfk_longitude & mask_jfk_latitude

# With a single 'count' column, normalize='all' converts the counts into each
# weekday's share of the journeys in the table.
df_week_all = pd.crosstab(index=df.loc[:, 'pickup_weekday'], columns='count', normalize='all')

df_week_jfk_air_crosstab = pd.crosstab(index=df.loc[mask_jfk, 'pickup_weekday'], columns='count', normalize='all')

# Labels are assigned by position: this assumes all seven weekday codes occur
# in both tables and that the sorted codes run Monday..Sunday -- TODO confirm.
for weekday_table in [df_week_all, df_week_jfk_air_crosstab]:
    weekday_table.index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

print(df_week_all,'\n',df_week_jfk_air_crosstab)

# 6) Proportion of journeys on each day of the week from all locations and those starting from the airport

In [None]:
import seaborn as sns
# Two panels side by side with a shared y axis so the proportions are directly comparable.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=[20, 6], sharey=True)

# One fixed colour per bar, in bar order, reused for both panels.
color = ['blue', 'red', 'orange', 'green', 'gray', 'purple', 'navy']

# Each panel takes its x values from its OWN table (the airport panel
# previously borrowed the index of the all-journeys table).
sns.barplot(data=df_week_all, ax=ax[0], x=df_week_all.index, y='count', palette=color)
sns.barplot(data=df_week_jfk_air_crosstab, ax=ax[1], x=df_week_jfk_air_crosstab.index, y='count', palette=color)

ax[0].set(title='Proportion of journeys per week day', xlabel='Day of the week', ylabel='Proportion of journeys')
# Because of sharey=True this limit applies to both panels.
ax[0].set_ylim(0, 0.2)

ax[1].set(title='Proportion of airport journeys per week day', xlabel='Day of the week', ylabel='');

# 7) Proportion each hour for all journeys and journeys from the airport

In [None]:
# Airport journeys must start inside the JFK bounding box, so the longitude and
# the latitude conditions both have to hold (the latitude mask alone also
# matches pick-ups elsewhere in the same latitude band).
mask_jfk = mask_jfk_longitude & mask_jfk_latitude

# With a single 'count' column, normalize='all' converts the counts into each
# pick-up hour's share of the journeys in the table.
traffic_all = pd.crosstab(index=df.loc[:, 'pickup_hour'], columns='count', normalize='all')

traffic_jfk_air = pd.crosstab(index=df.loc[mask_jfk, 'pickup_hour'], columns='count', normalize='all')

In [None]:
# Switch matplotlib to the 'fivethirtyeight' style sheet before creating the figure.
plt.style.use('fivethirtyeight')

# Two panels side by side sharing the y axis, with almost no gap between them.
fig, ax = plt.subplots(1, 2, figsize=(20, 6), sharey=True)
fig.subplots_adjust(wspace=0.03, hspace=0)

# (axes, hourly proportion table, panel title): all journeys on the left,
# airport journeys on the right.
hourly_panels = (
    (ax[0], traffic_all, 'Proportion each hour for all journeys'),
    (ax[1], traffic_jfk_air, 'Proportion each hour for all airport journeys'),
)
for panel, hourly_share, panel_title in hourly_panels:
    hourly_share['count'].plot(ax=panel)
    panel.set(title=panel_title, xlabel='Hour')

# Only the left panel carries the y label.
ax[0].set_ylabel('Proportion of journeys');

# 8) Making a recommendation

In [None]:
# Recommendations based on the hourly proportions above:
# demand for taxis at the airport is highest at around 5-6 o'clock (for early flights);
# between 10 and 15 o'clock it is better to concentrate more taxis in the city;
# and towards 15-16 o'clock more cars should be sent to the airport, as the need there is higher than in NYC.