In [4]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

Import uber data:

In [15]:
uber_data = pd.read_csv('uberdata.csv')

Convert surge:

In [16]:
uber_data['surge'] = uber_data['surge']/10

In [85]:
uber_data.head()

Unnamed: 0,time,city,income,state,product,_id,temp,surge,weather
0,"Fri, 20 Nov 2015 22:19:52 GMT",Atlanta,High,GA,UberSUV,b605b0b06730c256bc90499b6c53afaf,62,1,Clear
1,"Wed, 18 Nov 2015 17:54:37 GMT",Atlanta,Low,GA,UberBLACK,06acbfc4f40ff6cb6ef15a8755945079,66,1,Rain
2,"Wed, 18 Nov 2015 19:27:14 GMT",Atlanta,High,GA,UberBLACK,8a45a65827191fa1ba23fdf767196104,69,1,Rain
3,"Wed, 18 Nov 2015 23:12:56 GMT",Boston,Middle,MA,UberBLACK,f19996613c10c3bf40289591e13b9734,40,1,Partly Cloudy
4,"Sat, 21 Nov 2015 02:21:21 GMT",Washington,Low,DC,uberX,f5fd3879435d240acb5bbbe808caf81b,48,1,Clear


General surge stats:

In [86]:
uber_surge_mean = uber_data['surge'].mean()
uber_surge_mean

1.0313176260292483

In [87]:
uber_surge_mode = uber_data['surge'].mode()
uber_surge_mode

0    1
dtype: float64

Weather surge:

In [88]:
weather_mean = uber_data.groupby('weather').mean().reset_index()
weather_mean['count'] = uber_data.groupby('weather').surge.count().tolist()
weather_mean = weather_mean.sort(['surge'], ascending = False)
weather_mean

Unnamed: 0,weather,temp,surge,count
5,Rain,63.912381,1.046984,1575
4,Partly Cloudy,54.514039,1.041639,16775
3,Overcast,57.967309,1.026751,17375
0,Clear,50.492704,1.025793,12953
2,Mostly Cloudy,56.081335,1.019682,3836
1,Light Rain,62.0,1.0,5
6,Scattered Clouds,44.323529,1.0,68


In [89]:
weather_mean_dict = dict(zip(weather_mean.weather, weather_mean.surge))
weather_mean_dict

{'Clear': 1.0257932525283717,
 'Light Rain': 1.0,
 'Mostly Cloudy': 1.0196819603753913,
 'Overcast': 1.0267510791366932,
 'Partly Cloudy': 1.0416393442622955,
 'Rain': 1.0469841269841269,
 'Scattered Clouds': 1.0}

Car type surge:

In [91]:
car_mean = uber_data.groupby('product').mean().reset_index()
car_mean = car_mean.drop('temp', 1).sort(['surge'], ascending = False) 
car_mean['count'] = uber_data.groupby('product').surge.count().tolist()
car_mean

Unnamed: 0,product,surge,count
14,uberX + Car Seat,1.140525,1587
12,uberWAV,1.07385,1599
0,ASSIST,1.07385,1599
13,uberX,1.069519,7968
4,UberSELECT,1.060189,1590
15,uberXL,1.033329,7968
2,SUV + Car Seat,1.020013,1061
1,BLACK CAR + Car Seat,1.013196,532
5,UberSUV,1.01314,1594
3,UberBLACK,1.011772,1587


In [92]:
car_mean_dict = car_mean.set_index('product').surge.to_dict()
car_mean_dict

{'ASSIST': 1.0738500315059865,
 'BLACK CAR + Car Seat': 1.0131957473420892,
 'SUV + Car Seat': 1.0200125078173858,
 'UberBLACK': 1.0117720883534136,
 'UberSELECT': 1.0601886792452833,
 'UberSUV': 1.0131400602409635,
 'WAV': 1.0,
 'Yellow WAV': 1.0,
 'uberFAMILY': 1.0107277289836889,
 'uberSELECT': 1.0023944549464399,
 'uberT': 1.0,
 'uberTAXI': 1.0,
 'uberWAV': 1.0738500315059871,
 'uberX': 1.0695193876270532,
 'uberX + Car Seat': 1.1405253283302079,
 'uberXL': 1.0333291504580244}

Socioeconomic surge:

In [93]:
income_mean = uber_data.groupby('income').mean().reset_index()
income_mean = income_mean.drop('temp', 1).sort(['surge'], ascending = False) 
income_mean

Unnamed: 0,income,surge
0,High,1.039482
2,Middle,1.034037
1,Low,1.020437


In [94]:
city_mean = uber_data.groupby('city').mean().reset_index()
city_mean = city_mean.drop('temp', 1).sort(['surge'], ascending = False)
city_mean

Unnamed: 0,city,surge
4,Washington,1.048253
1,Boston,1.046095
3,San Francisco,1.029553
0,Atlanta,1.028129
2,New York,1.005452


Time surge:

In [95]:
#split time by spaces and grab time element

In [19]:
uber_test = uber_data.copy()
uber_test['time'] = [i.split(' ') for i in uber_test['time']]

In [20]:
uber_test['time'] = [i[4] for i in uber_test['time']]

Add hour column:

In [96]:
uber_test['hour'] = [i[:2] for i in uber_test['time']]

Hour surge:

In [97]:
hour_mean = uber_test.groupby('hour').mean().reset_index()
hour_mean = hour_mean.drop('temp', 1).sort(['surge'], ascending = False)
hour_mean

Unnamed: 0,hour,surge
13,13,1.112458
23,23,1.079816
22,22,1.068246
16,16,1.057876
17,17,1.047113
12,12,1.043241
15,15,1.040671
14,14,1.037302
20,20,1.034733
3,3,1.030451


NYC surge hour (-5 GMT):

In [74]:
nyc = uber_test[uber_test['city'] == 'New York']

In [82]:
nyc['hour'] = [(int(i) - 5) if int(i) > 4 else (int(i) - 5 + 24) for i in nyc['hour']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [84]:
nyc_hour_mean = nyc.groupby('hour').mean().reset_index()
nyc_hour_mean = nyc_hour_mean.drop('temp', 1).sort(['surge'], ascending = False)
nyc_hour_mean

Unnamed: 0,hour,surge
7,7,1.028785
9,9,1.01369
8,8,1.012836
11,11,1.012072
12,12,1.010286
10,10,1.010235
13,13,1.010204
14,14,1.006667
2,2,1.005952
18,18,1.005952


ATL surge hour (-5 GMT):

In [67]:
atl = uber_test[uber_test['city'] == 'Atlanta']

In [98]:
atl['hour'] = [(int(i) - 5) if int(i) > 4 else (int(i) - 5 + 24) for i in atl['hour']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [99]:
atl_hour_mean = atl.groupby('hour').mean().reset_index()
atl_hour_mean = atl_hour_mean.drop('temp', 1).sort(['surge'], ascending = False)
atl_hour_mean

Unnamed: 0,hour,surge
13,13,1.23215
12,12,1.083836
19,19,1.045333
20,20,1.0425
18,18,1.04
2,2,1.026667
21,21,1.025957
3,3,1.022222
14,14,1.013663
23,23,1.012083


SF surge hour (-8 GMT)

In [75]:
sf = uber_test[uber_test['city'] == 'San Francisco']

In [100]:
sf['hour'] = [(int(i) - 8) if int(i) > 4 else (int(i) - 8 + 24) for i in sf['hour']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [101]:
sf_hour_mean = sf.groupby('hour').mean().reset_index()
sf_hour_mean = sf_hour_mean.drop('temp', 1).sort(['surge'], ascending = False)
sf_hour_mean

Unnamed: 0,hour,surge
11,8,1.225379
10,7,1.175
12,9,1.105072
7,4,1.04697
21,18,1.031013
20,17,1.02425
13,10,1.023438
8,5,1.016667
14,11,1.016544
22,19,1.016006


BOS surge time (-5 GMT)

In [76]:
bos = uber_test[uber_test['city'] == 'Boston']

In [102]:
bos['hour'] = [(int(i) - 5) if int(i) > 4 else (int(i) - 5 + 24) for i in bos['hour']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [103]:
bos_hour_mean = bos.groupby('hour').mean().reset_index()
bos_hour_mean = bos_hour_mean.drop('temp', 1).sort(['surge'], ascending = False)
bos_hour_mean

Unnamed: 0,hour,surge
8,8,1.2
17,17,1.126753
18,18,1.107048
15,15,1.094444
22,22,1.082857
16,16,1.060299
12,12,1.057391
9,9,1.056905
14,14,1.049275
19,19,1.044314


WASH surge time (-5 GMT)

In [104]:
wash = uber_test[uber_test['city'] == 'Washington']

In [105]:
wash['hour'] = [(int(i) - 5) if int(i) > 4 else (int(i) - 5 + 24) for i in wash['hour']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [106]:
wash_hour_mean = wash.groupby('hour').mean().reset_index()
wash_hour_mean = wash_hour_mean.drop('temp', 1).sort(['surge'], ascending = False)
wash_hour_mean

Unnamed: 0,hour,surge
8,8,1.308333
17,17,1.135065
18,18,1.108617
9,9,1.10506
7,7,1.088194
15,15,1.058152
22,22,1.055093
0,0,1.046875
21,21,1.038554
2,2,1.035677
