# Ride-Sharing apps Fair Prediction using sklearn Linear Regression approach
### Kaggle profile: https://www.kaggle.com/vaslnk

The dataset is San Francisco Taxi data from September 2012. The dataset consists of 50,000 taxi trips taken in the Bay Area during that time period. For each trip you are given the departure time, arrival time, passenger fare, departure and arrival coordinates as well as departure and arrival TAZs. 

## Installing Google Maps API

In [193]:
!pip install -U googlemaps

Requirement already up-to-date: googlemaps in /srv/app/venv/lib/python3.6/site-packages
Requirement already up-to-date: requests<3.0,>=2.11.1 in /srv/app/venv/lib/python3.6/site-packages (from googlemaps)
Requirement already up-to-date: idna<2.8,>=2.5 in /srv/app/venv/lib/python3.6/site-packages (from requests<3.0,>=2.11.1->googlemaps)
Requirement already up-to-date: chardet<3.1.0,>=3.0.2 in /srv/app/venv/lib/python3.6/site-packages (from requests<3.0,>=2.11.1->googlemaps)
Requirement already up-to-date: certifi>=2017.4.17 in /srv/app/venv/lib/python3.6/site-packages (from requests<3.0,>=2.11.1->googlemaps)
Requirement already up-to-date: urllib3<1.25,>=1.21.1 in /srv/app/venv/lib/python3.6/site-packages (from requests<3.0,>=2.11.1->googlemaps)


In [194]:
from datascience import *
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import googlemaps

%matplotlib inline

In [196]:
sf_taxi_trips = Table.read_table('Taxi_Train.csv')

## Useful Date functions

In [197]:
def get_hour(s): 
    return datetime.datetime.strptime(s, "%m/%d/%y %H:%M").hour

def get_date(s): 
    return datetime.datetime.strptime(s, "%m/%d/%y %H:%M").date()

def get_weekday(s): # 0 = Monday - 6= Sunday
    return datetime.datetime.strptime(s, "%m/%d/%y %H:%M").date().weekday()

def get_duration(start,finish): 
    start_time = datetime.datetime.strptime(start, "%m/%d/%y %H:%M")
    end_time = datetime.datetime.strptime(finish, "%m/%d/%y %H:%M")
    return (end_time-start_time).seconds/60.

## Adding Map Distance Data

In [198]:
def distance_on_sphere(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to spherical coordinates in radians.
    degrees_to_radians = np.pi/180.0
        
    # phi = 90 - latitude
    phi1 = (90.0 - lat1)*degrees_to_radians
    phi2 = (90.0 - lat2)*degrees_to_radians
        
    # theta = longitude
    theta1 = lon1*degrees_to_radians
    theta2 = lon2*degrees_to_radians
        
    # We can compute spherical distance from spherical coordinates.
    cos = (np.sin(phi1)*np.sin(phi2)*np.cos(theta1-theta2)+
           np.cos(phi1)*np.cos(phi2))
    arc = np.arccos(cos)

    # Multiply arc by the radius of the earth to get length.
    return 3960.*arc  # to get distance in miles

def rotate_table(table):
    '''Transforms a 2 x n table to be an n x 2 table'''
    return Table().with_columns(['Columns', list(table.labels),
                                 'Values', list(table.to_array()[0])])

## Computing trip distance and duration using Google Maps API

Will use this approach for trips with starting or ending taz of 0

In [230]:
# Get string of the input date a year later
# Used due to the fact Google Maps API only accepts dates in the future
def thisDatein2019(d_string):
    return d_string.split(' ')[0][:-1] + '9' + ' ' + d_string.split(' ')[1]

### How to setup Google Maps API:

- Link: https://console.cloud.google.com/google/maps-apis
- Note: Balance of $200/month **for students**
- To verify: https://cloud.google.com/maps-platform/pricing/sheet/

In [233]:
def getGoogleMapsTimeAndDuration(orig_lat, orig_lng, dest_lat, dest_lng, start_time):
    gmaps = googlemaps.Client(key='INSERT GOOGLE MAPS API KEY HERE')
    departure_time = datetime.datetime.strptime(thisDatein2019(start_time), "%m/%d/%y %H:%M")
    origin = str(orig_lat) + ', ' +  str(orig_lng)
    destination = str(dest_lat) + ', ' + str(dest_lng)
    directions_result = gmaps.directions(origin,
                                         destination,
                                         mode="driving",
                                         avoid="ferries",
                                         departure_time=departure_time
                                        )
    if directions_result and len(directions_result) > 0:
        distance = directions_result[0]['legs'][0]['distance']['text'].split(' ')[0] #miles
        duration = directions_result[0]['legs'][0]['duration']['text'].split(' ')[0] #minutes
        return duration, distance
    else:
        return None, None

## Computing trip distance and duration using TAZ travel distances and times 

In [200]:
times = pd.read_csv('times.csv') #TAZ Times
dists = pd.read_csv('distances.csv') #TAZ Distances

In [201]:
def getDrivingDuration(start_taz, end_taz, number_pax):
    origin = times.iloc[(times['origin']-start_taz).abs().argsort()[:1]].iloc[0]['origin']
    destination = times.iloc[(times['destination']-end_taz).abs().argsort()[:1]].iloc[0]['destination']
    df = times[(times['origin'] == origin) & (times['destination'] == destination)].iloc[0]
    if number_pax == 1:
        return df['shared ride (2 people)']
    else:
        return df['shared ride (3 people)']

In [202]:
def getDrivingDistance(start_taz, end_taz, number_pax):
    origin = times.iloc[(times['origin']-start_taz).abs().argsort()[:1]].iloc[0]['origin']
    destination = times.iloc[(times['destination']-end_taz).abs().argsort()[:1]].iloc[0]['destination']
    df = dists[(dists['origin'] == origin) & (dists['destination'] == destination)].iloc[0]
    if number_pax == 1:
        return df['shared ride (2 people)']
    else:
        return df['shared ride (3 people)']

## Combining multiple methods for preparing training data

In [203]:
def getDistanceAndDuration(row):
    if row['start_taz'] == 0 or row['end_taz'] == 0:
        return getGoogleMapsTimeAndDuration(row['start_lat'], row['start_lng'], row['end_lat'], row['end_lng'], row['start_time'])
    else:
        driving_duration = get_duration(row['start_time'], row['end_time'])
        driving_distance = getDrivingDistance(row['start_taz'], row['end_taz'], row['number_pax'])
        return driving_duration, driving_distance

## Combining multiple methods for preparing predicting data

In [204]:
def predictDistanceAndDuration(row, train_df):
    distance = row['distance']
    find = train_df[(train_df['distance'] < distance + 0.2) & (train_df['distance'] > distance - 0.2)]
    if (len(find) == 0) or (row['start_taz'] == 0 or row['end_taz'] == 0):
        return getGoogleMapsTimeAndDuration(row['start_lat'], row['start_lng'], row['end_lat'], row['end_lng'], row['start_time'])
    else:
        driving_duration = find['driving_duration'].mean()
        driving_distance = getDrivingDistance(row['start_taz'], row['end_taz'], row['number_pax'])
        return driving_duration, driving_distance

## Complete method for cleaning the data and adding all the required features

In [206]:
def prepareData(df, train_df=None):
    df = df.set_index('id')
    # Distance on sphere between start and end points
    df['distance'] = distance_on_sphere(df['start_lat'], df['start_lng'], df['end_lat'], df['end_lng'])
    # Hour of the day
    df['hour'] = df['start_time'].apply(get_hour)
    # Day of the week
    df['weekday'] = df['start_time'].apply(get_weekday)
    # Bool whether ride occurs on a weekend
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    # Bool whether ride occurs on at night
    df['is_atnight'] = ((df['hour'] >= 22) | (df['hour'] <= 6)).astype(int)
    # Bool whether ride is to SFO
    df['is_toSFO'] = (df['end_taz'] == 239).astype(int)
    # Bool whether ride is to OAK
    df['is_toOAK'] = (df['end_taz'] == 874).astype(int)
    if train_df is None:
        # Preparing complete training data
        df['driving_stuff'] = df.apply(lambda row: getDistanceAndDuration(row), axis=1)
    else:
        # Preparing incomplete testing data
        df['driving_stuff'] = df.apply(lambda row: predictDistanceAndDuration(row, train_df), axis=1)
    df[['driving_duration', 'driving_distance']] = df['driving_stuff'].apply(pd.Series)
    df = df.drop(['driving_stuff'], 1)
    data = df.drop(['fare', 'start_time', 'end_time', 'start_lng', 'start_lat', 'end_lng', 'end_lat', 'start_taz', 'end_taz'], 1)
    test = df[['fare']]
    return data, test

## Preparing the data

In [207]:
taxi_clean = sf_taxi_trips.to_df()
# taxi_clean = taxi_clean[(taxi_clean['start_taz'] != 0) & (taxi_clean['end_taz'] != 0)]
train_data, train_target = prepareData(taxi_clean)

#### Fixing failed Google Maps API results

In [217]:
train_data['driving_duration'] = train_data['driving_duration'].astype(float)
train_data['driving_duration'].fillna(train_data['driving_duration'].mean(), inplace=True)

In [218]:
train_data['driving_distance'] = train_data['driving_distance'].astype(float)
train_data['driving_distance'].fillna(train_data['driving_distance'].mean(), inplace=True)

## 80/20 split for model training & testing

In [219]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target, test_size=0.2, random_state=0)

# Linear regression


In [220]:
# Set up the model
model = LinearRegression()
# Use fit
model.fit(X_train, y_train)
# Check the score
model.score(X_test, y_test)

0.9242138640385439

## Predicting the Fare

In [221]:
test = pd.read_csv('Taxi_Query.csv')
to_predict, _ = prepareData(test, train_data)

#### Fixing failed Google Maps API results

In [224]:
to_predict['driving_duration'] = to_predict['driving_duration'].astype(float)
to_predict['driving_distance'] = to_predict['driving_distance'].astype(float)

In [225]:
to_predict['driving_duration'].fillna(to_predict['driving_duration'].mean(), inplace=True)
to_predict['driving_distance'].fillna(to_predict['driving_distance'].mean(), inplace=True)

# Final Result

In [227]:
result = model.predict(to_predict)

In [228]:
pd.DataFrame(data=result, index=to_predict.index, columns=['Fare']).reset_index().rename({'id':'Id'}, axis=1).to_csv('result.csv', index=False)