In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import glob 

## Data Loading and Preparation¶

### Loading Data¶

In [2]:
file = os.listdir('uber_data')[-7:]

In [3]:
file

['Uber-Jan-Feb-FOIL.csv.zip',
 'uber-raw-data-jul14.csv.zip',
 'uber-raw-data-sep14.csv.zip',
 'uber-raw-data-may14.csv.zip',
 'uber-raw-data-jun14.csv.zip',
 'uber-raw-data-aug14.csv.zip',
 'uber-raw-data-apr14.csv.zip']

In [4]:
file.remove('uber-raw-data-janjune-15.csv')

ValueError: list.remove(x): x not in list

In [None]:
file

In [None]:
path = ("uber_data")
final = pd.DataFrame()
for file in file:
    df = pd.read_csv(path+'/'+file, encoding= 'utf-8')
    final = pd.concat([final,df])

In [None]:
final.shape

In [None]:
final.head()

###  Data Preparation¶

##### Lat : The latitude of the Uber pickup

##### Lon : The longitude of the Uber pickup

##### Base : The TLC base company code affiliated with the Uber pickup

In [None]:
df=final.copy()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
# change datatype of data/time
df['Date/Time'] = pd.to_datetime(df['Date/Time'], format="%m/%d/%Y %H:%M:%S")

In [None]:
df.dtypes

In [None]:
df.head(1)

In [None]:

df['weekday']=df['Date/Time'].dt.day_name()
df['day']=df['Date/Time'].dt.day
df['minute']=df['Date/Time'].dt.minute
df['month']=df['Date/Time'].dt.month
df['hour']=df['Date/Time'].dt.hour

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
df['weekday'].value_counts()

In [None]:
df[df.weekday == 'Sunday']

In [None]:
df['Base'].unique()

In [None]:
df['day'].unique()

In [None]:
df['weekday'].unique()

#### Analysis of journey by Week-days

In [None]:
import plotly.express as px

In [None]:
px.bar(x=df['weekday'].value_counts().index,
      y=df['weekday'].value_counts()
      )

#### seems to have highest sales on wednesday

### Analysis by Hour

In [None]:
plt.hist(df['hour']) # histogram

#### It peaks during evening time when people are logging off from work

In [None]:
df['month'].unique()

In [None]:
for i,month in enumerate(df['month'].unique()):
    print(month)

In [None]:

plt.figure(figsize=(40,20))
for i,month in enumerate(df['month'].unique()):
    plt.subplot(3,2,i+1)
    df[df['month']==month]['hour'].hist()


#### Analysis of Rush of each hour in each month

In [None]:
for i in df['month'].unique():
    plt.figure(figsize=(5,3))
    df[df['month']==i]['hour'].hist()
    

#### analysis of which month has max rides

In [None]:
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
trace1 = go.Bar( 
        x = df.groupby('month')['hour'].sum().index,
        y = df.groupby('month')['hour'].sum(),
        name= 'Priority')
iplot([trace1])

#### Analysis of Journey of Each  Day

In [None]:
plt.figure(figsize=(10,6))
plt.hist(df['day'], bins=30, rwidth=.8, range=(0.5, 30.5))
plt.xlabel('date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by Month Day')

#### Analysis of Total rides month wise

In [None]:
plt.figure(figsize=(20,8))
for i,month in enumerate(df['month'].unique(),1):
    plt.subplot(3,2,i)
    df_out=df[df['month']==month]
    plt.hist(df_out['day'])
    plt.xlabel('days in month'.format(i))
    plt.ylabel('total rides')

### getting Rush in hour 

In [None]:
sns.set_style(style='whitegrid')
sns.pointplot(x="hour",y="Lat",data=lat_df)

##### adding hue params

In [None]:
ax=sns.pointplot(x="hour",y="Lat", hue="weekday",data=df)
ax.set_title('hoursoffday vs latiitide of passenger')

#### to analyse which base number gets popular by month name

In [None]:
df.head()

In [None]:
df['Base'].head()

In [None]:
df.groupby(['Base','month'])['Date/Time'].count()

In [None]:
base=df.groupby(['Base','month'])['Date/Time'].count().reset_index()
base

#### to analyse which base number gets popular by month name

In [None]:
plt.figure(figsize=(10,6))
sns.lineplot(x='month',y='Date/Time',hue='Base',data=base)

#### 2 Cross Analysis
#### Through our exploration we are going to visualize:


#### 1.Heatmap by Hour and Weekday.
#### 2.Heatmap by Hour and Day.
#### 3.Heatmap by Month and Day.
#### 4.Heatmap by Month and Weekday.

## Heatmap by Hour and Weekday.

#### create pivot_tables

##### simplest way of creating pivot tables,first of all call groupby on 2 columns so that we will get groups 
##### df.groupby(['weekday','hour']).apply(lambda x: len(x)), now "weekday" becomes rows and "hour" becomes cols
##### & then call unstack

In [None]:
def count_rows(rows):
    return len(rows)

In [None]:
by_cross = df.groupby(['weekday','hour']).apply(count_rows)
by_cross

In [None]:
pivot=by_cross.unstack()
pivot

##### creating heatmap so that it can be easily visualize

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot=False)

In [None]:
df.head()

In [None]:
def heatmap(col1,col2):
    by_cross = df.groupby([col1,col2]).apply(lambda x:len(x))
    pivot=by_cross.unstack()
    plt.figure(figsize=(10,6))
    return sns.heatmap(pivot,annot=False)

In [None]:
## validating above Analysis through Heatmap
heatmap('day','hour')

In [None]:
heatmap('day','month')

#### Analysing the results
#### We observe that the number of trips increases each month, we can say that from April to September 2014, Uber was in a continuous improvement process.

In [None]:
df[df['month']==4]

In [None]:
heatmap('weekday','month')

#### Analysis of Location data points¶

In [None]:
plt.figure(figsize=(10,6))

plt.plot(df['Lon'], df['Lat'],'r+', ms=0.5)
plt.xlim(-74.2, -73.7)
plt.ylim(40.6,41)

##### We can see a number of hot spots here. Midtown Manhattan is clearly a huge bright spot.
##### & these are made from Midtown to Lower Manhattan.
##### Followed by Upper Manhattan and the Heights of Brooklyn.


### perform Spatial Analysis using heatmap to get a clear cut of Rush on Sunday(Weekend)

In [None]:
df.head()

In [None]:
df_out=df[df['weekday']=='Sunday']
df_out.head()

In [None]:
df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()

In [None]:
from folium.plugins import HeatMap

In [None]:
import folium
from folium.plugins import HeatMap
basemap=folium.Map()

In [None]:
HeatMap(df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index(),zoom=20,radius=15).add_to(basemap)
basemap

##### Lets create a function for a specific day

In [None]:
def plot(df,day):
    df_out=df[df['weekday']==day]
    df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
    HeatMap(df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index(),zoom=20,radius=15).add_to(basemap)
    return basemap


In [None]:
plot(df,'Sunday')

## Analysis of Jan-June uber_15

In [None]:
uber_15 = pd.read_csv(r'F:\Data Science projects\by_other\uber-pickups-in-new-york-city/uber-raw-data-janjune-15.csv',encoding='utf-8')
uber_15.head()

In [None]:
uber_15.shape

In [None]:
#Checking the minimum date in the uber_15
uber_15['Pickup_date'].min()

In [None]:
#Checking the maximum date in the uber_15
uber_15['Pickup_date'].max()

In [None]:
uber_15['Pickup_date'] =  pd.to_datetime(uber_15['Pickup_date'], format='%Y-%m-%d %H:%M:%S')

In [None]:

uber_15['weekday']=uber_15['Pickup_date'].dt.day_name()
uber_15['day']=uber_15['Pickup_date'].dt.day
uber_15['minute']=uber_15['Pickup_date'].dt.minute
uber_15['month']=uber_15['Pickup_date'].dt.month
uber_15['hour']=uber_15['Pickup_date'].dt.hour

In [None]:
uber_15.head()

##### Uber pickups by the month in NYC

In [None]:
px.bar(x=uber_15['month'].value_counts().index,
           y=uber_15['month'].value_counts().values)

#### We can see that the number of Uber pickup has been steadily increasing throughout the first half of 2015 in NYC

#### Analysing Rush in New york City

In [None]:
ax=sns.countplot(uber_15['hour'])
ax.yaxis.set_major_formatter(tick.FormatStrFormatter('%.0f'))

##### Interestingly, after the morning rush, the number of Uber pickups doesn't dip much throughout the rest of the morning and early afternoon. There is significantly more demand in the evening than the daytime. Let's investigate to see if there's a difference in hourly pattern for different days of the week.

### Analysing In-Depth Analysis of Rush in New york City Day & hour wise

##### group the data by Weekday and hour

In [None]:
uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count()

In [None]:
uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count().reset_index()

In [None]:
summary=uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count().reset_index()

In [None]:
summary=summary.rename(columns = {'Pickup_date':'Counts'})
summary

In [None]:
plt.figure(figsize=(10,6))
sns.pointplot(x="hour", y="Counts", hue="weekday", data=summary)

##### Loading Uber-Jan-Feb-FOIL.csv

In [None]:
uber_foil=pd.read_csv(r'F:\Data Science projects\by_other\uber-pickups-in-new-york-city/Uber-Jan-Feb-FOIL.csv')

In [None]:
uber_foil.head()

In [None]:
uber_foil['dispatching_base_number'].unique()

In [None]:
sns.boxplot(x = 'dispatching_base_number', y = 'active_vehicles', data = uber_foil)

#### seems to have more number of Active Vehicles in B02764

In [None]:
sns.boxplot(x = 'dispatching_base_number', y = 'trips', data = uber_foil)

#### seems to have more number of trips in B02764

In [None]:
# Finding the ratio of trips/active_vehicles
uber_foil['trips/vehicle'] = uber_foil['trips']/uber_foil['active_vehicles']

In [None]:
uber_foil.head()

In [None]:
uber_foil.set_index('date')

##### how Average trips/vehicle inc/decreases with dates with each of base umber

In [None]:
plt.figure(figsize=(10,6))
uber_foil.set_index('date').groupby(['dispatching_base_number'])['trips/vehicle'].plot()
plt.ylabel('Average trips/vehicle')
plt.title('Demand vs Supply chart (Date-wise)')
plt.legend()