#                                           Importing libraries


In [None]:
# Data analysis and wrangling 
import pandas as pd
from geopy import Point, distance
from math import *
import numpy as np
# Data Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
# Machine learning 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [None]:
# Load the Uber fares CSV from a relative input path into the working frame DF.
DF = pd.read_csv('../input/uber-fares-dataset/uber.csv')

In [None]:
# Show the column labels of DF.
DF.columns

In [None]:
# Preview the first rows of DF.
DF.head()

In [None]:
# Print the column dtypes and non-null counts of DF.
DF.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
DF.describe().T

- We have 9 features (2 object, 2 int and 5 float) with 200,000 observations.
- The (key) column is not useful, so it can be dropped.
- The (Unnamed: 0) column can be renamed and used as Trip_ID for further analysis.
- There is 1 missing value in each of the (long, lat) columns, which is about 0.0005% of the DF.
- The datetime column has data type 'object', which needs to be converted.
- Possible outliers and false information in the (fare, passenger_count) columns.
- New features to consider: (Distance, Year, Month, Week, Day, Pickup_hr).

# Data Cleansing & Preparation

In [None]:
# Give the unnamed first column a meaningful label so it can serve as a trip id.
DF.rename({'Unnamed: 0': 'Trip_ID'}, axis='columns', inplace=True)

In [None]:
# Count the missing values in each column.
DF.isnull().sum()

In [None]:
# Remove every row that has at least one missing value.
DF.dropna(axis='index', how='any', inplace=True)
# The 'key' column is not used in the analysis.
DF.drop(columns=['key'], inplace=True)

# Dropping unwanted Data

In [None]:
# Parse the pickup timestamps; values that cannot be parsed become NaT.
DF['pickup_datetime'] = pd.to_datetime(DF['pickup_datetime'], errors='coerce')

# Converting to the right datetime data type

In [None]:
# Re-check the summary statistics after the cleaning steps above.
DF.describe().T

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps above.
DF.info()

# Checking Modified DF

# Haversine Formula to Calculate Distance


In [None]:
def distance_transform(longitude1, latitude1, longitude2, latitude2):
    """Return the great-circle (haversine) distance in kilometres for each coordinate pair.

    Parameters
    ----------
    longitude1, latitude1 : array-like of float
        Start-point longitudes and latitudes in decimal degrees.
    longitude2, latitude2 : array-like of float
        End-point longitudes and latitudes in decimal degrees, aligned
        position-by-position with the start points.

    Returns
    -------
    list of float
        One distance per pair, computed with an Earth radius of 6371 km.
        NaN coordinates yield NaN distances.
    """
    earth_radius_km = 6371

    # np.asarray makes the computation purely positional, so a pandas Series
    # with a non-default index works the same as a list or ndarray.
    long1, lati1, long2, lati2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (longitude1, latitude1, longitude2, latitude2)
    )

    dist_long = long2 - long1
    dist_lati = lati2 - lati1
    a = np.sin(dist_lati / 2) ** 2 + np.cos(lati1) * np.cos(lati2) * np.sin(dist_long / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make arcsin invalid; clip it first.
    a = np.clip(a, 0.0, 1.0)

    return (2 * earth_radius_km * np.arcsin(np.sqrt(a))).tolist()

# Haversine distance between the pickup and drop-off point of every trip.
DF['Distance_km'] = distance_transform(
    longitude1=DF['pickup_longitude'].to_numpy(),
    latitude1=DF['pickup_latitude'].to_numpy(),
    longitude2=DF['dropoff_longitude'].to_numpy(),
    latitude2=DF['dropoff_latitude'].to_numpy(),
)

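# Haversine distance formula: $d = 2R \arcsin\left(\sqrt{\sin^2\left(\frac{\theta_2 - \theta_1}{2}\right) + \cos\theta_1 \cos\theta_2 \sin^2\left(\frac{\varphi_2 - \varphi_1}{2}\right)}\right)$, where $\theta$ is latitude and $\varphi$ is longitude (both in radians) and $R = 6371$ km is the Earth's radius.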

In [None]:
# Break the pickup timestamp into calendar parts; every new column is
# created in a single assign() call, in the order listed.
DF = DF.assign(
    Pickup_hr=DF['pickup_datetime'].dt.hour,
    Day=DF['pickup_datetime'].dt.day,
    Month=DF['pickup_datetime'].dt.month,
    Year=DF['pickup_datetime'].dt.year,
    Day_of_week=DF['pickup_datetime'].dt.dayofweek,
    day_name=DF['pickup_datetime'].dt.day_name(),
)

# Extracting new columns from datetime columns to help us in further analysis

In [None]:
# Preview DF with the distance and datetime-derived columns added.
DF.head()

# Outliers Detection and Treatment

In [None]:
def find_outliers_IQR(DF, whisker=1.5):
    """Select the values lying outside the inter-quartile-range fences.

    Parameters
    ----------
    DF : pandas.Series or pandas.DataFrame
        Numeric data to inspect. (The parameter name shadows the notebook's
        global frame; only the object passed in is used.)
    whisker : float, default 1.5
        Multiple of the IQR added above the third quartile and subtracted
        below the first quartile to form the fences.

    Returns
    -------
    Same type as ``DF``
        ``DF`` indexed by the outlier mask: for a Series, only the values
        strictly below ``q1 - whisker * IQR`` or strictly above
        ``q3 + whisker * IQR``.
    """
    q1 = DF.quantile(0.25)
    q3 = DF.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr

    return DF[(DF < lower_fence) | (DF > upper_fence)]

In [None]:
# Flag the fares that fall outside the IQR fences and summarise them.
outliers = find_outliers_IQR(DF['fare_amount'])

print(f'number of outliers:{len(outliers)}')
print(f'max outlier value:{outliers.max()}')
print(f'min outlier value:{outliers.min()}')

outliers

In [None]:
# Flag the passenger counts that fall outside the IQR fences and summarise them.
outliers = find_outliers_IQR(DF['passenger_count'])

print(f'number of outliers:{len(outliers)}')
print(f'max outlier value:{outliers.max()}')
print(f'min outlier value:{outliers.min()}')

outliers

Although this method is useful for detecting outliers in many cases, it is not the optimal choice here: it flags 4 as an outlier for the number of passengers, which is clearly a valid value that needs to be kept. We therefore have to use our intuition and the internet to determine outliers in a better way.

We can see outliers in all 3 features (passenger_count, fare_amount, Distance_km)
- Uber's max passenger count is 6
- Uber introduced Uber Eats and Uber package delivery within the data frame's period, therefore trips with 0 passengers are allowed to stay
- Uber has no distance limitations, but common sense compels us to cap distance at 60 km; trips of 0 km are invalid
- Fare can't be negative and can't exceed 100 USD; even generous tips wouldn't be recorded as part of the trip price.

In [None]:
# Row counts and dtypes before the outlier filter is applied.
DF.info()

In [None]:
# Collect every implausible trip in one boolean mask, then drop them in a
# single in-place pass: zero-length or >60 km trips, fares above 100 or
# below 0, and more than 6 passengers.
invalid_trips = (
    (DF['Distance_km'] == 0)
    | (DF['Distance_km'] > 60)
    | (DF['fare_amount'] > 100)
    | (DF['fare_amount'] < 0)
    | (DF['passenger_count'] > 6)
)
DF.drop(index=DF.index[invalid_trips], inplace=True)

DF.info()

In [None]:
# Scatter of fare against trip distance for the filtered trips.
fig, ax = plt.subplots()
ax.scatter(DF['Distance_km'], DF['fare_amount'])
ax.set_xlabel("Distance_km")
ax.set_ylabel("fare_amount")
plt.show()

# Descriptive Analysis & Visualization

In [None]:
# Constant helper column: summing it over a filtered selection counts the selected rows
# (used by the yearly trip tally in the next cell).
DF['Count'] = 1

In [None]:
# Tally the number of trips recorded in each calendar year and plot the totals.
# The labels say "trips per year" because the values are yearly totals, not averages.
year = list(range(2009, 2016))

# One colour per bar.
colors = sns.color_palette('Set1', n_colors=len(year))

# Summing the constant `Count` column over a year filter counts that year's rows.
no_of_trips = [int(DF.loc[DF['Year'] == trip_year, 'Count'].sum()) for trip_year in year]

print("Trips per year: ")
print(year, no_of_trips)

plt.title("Yearly Trips")
plt.xlabel("Years")
plt.ylabel("Number of Trips")
plt.bar(year, no_of_trips, color=colors)
plt.show()

In [None]:
# Pairwise correlation of the numeric columns only. DF also holds non-numeric
# columns (the pickup timestamp and the `day_name` strings); recent pandas
# versions no longer drop those silently in DataFrame.corr() and raise instead,
# so select the numeric columns explicitly.
corr = DF.select_dtypes(include='number').corr()

corr

- Strong correlation between pickup and dropoff latitudes, and likewise between the longitudes, because they are close to each other in value and position
- A strong inverse correlation is also shown between latitude and longitude, this time because longitude and latitude are built on different directions
- What's most important here is the high positive correlation between fare and distance, which will be the basis for our linear regression ML model

# Machine Learning Models


In [None]:
# Model inputs: trip distance is the single predictor and fare is the target.
# Double-bracket selection keeps each as a 2-D column array of shape (n_samples, 1).
X = DF[['Distance_km']].to_numpy()
y = DF[['fare_amount']].to_numpy()

In [None]:
# Use one scaler per array. Re-fitting a single StandardScaler on X after y
# overwrites the statistics learned from y, so predictions could no longer be
# converted back to fare units with the scaler.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_std = x_scaler.fit_transform(X)
y_std = y_scaler.fit_transform(y)

# Backward-compatible alias: `std` previously ended up fitted on X.
std = x_scaler

# NOTE(review): both scalers are fitted on the full data before the split, so
# test-set statistics leak into training; consider fitting on the training split only.
X_train, X_test, y_train, y_test = train_test_split(x_std, y_std, test_size=0.2, random_state=0)

In [None]:
# Sanity check: shapes of the unscaled feature and target arrays.
print(X.shape,y.shape)

In [None]:
# Fit a linear regression on the standardized training split and report
# the model's score() on both splits.
l_reg = LinearRegression().fit(X_train, y_train)

print(f"Training set score: {l_reg.score(X_train, y_train):.2f}")
print(f"Test set score: {l_reg.score(X_test, y_test):.7f}")

#Our simple regression model

In [None]:
# Predict the (standardized) fares of the held-out trips and line them up
# against the actual values so the comparison is actually displayed.
y_pred = l_reg.predict(X_test)

# ravel(): a DataFrame needs 1-D columns, whereas y_test is an (n, 1) column array.
df = pd.DataFrame({'Actual': y_test.ravel(), 'Predicted': y_pred.ravel()})
df.head()


In [None]:
# Mean absolute error of the linear model on the test split (in the standardized units of y_test).
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))


In [None]:
# Create the figure with an explicit size up front. Setting
# plt.rcParams["figure.figsize"] after plotting never affects the figure being
# drawn and silently changes the size of every later figure instead.
fig = plt.figure(figsize=(32, 22))

# Left panel: training points with the fitted regression line.
ax_train = fig.add_subplot(2, 2, 1)
ax_train.scatter(X_train, y_train, color='green')
ax_train.plot(X_train, l_reg.predict(X_train), color="black")
ax_train.set_title("Fare vs Distance (Training Set)")
ax_train.set_ylabel("fare_amount")
ax_train.set_xlabel("Distance")

# Right panel: test points against the same fitted line (drawn over the
# training inputs, as the line itself does not depend on the split).
ax_test = fig.add_subplot(2, 2, 2)
ax_test.scatter(X_test, y_test, color='green')
ax_test.plot(X_train, l_reg.predict(X_train), color="black")
ax_test.set_ylabel("fare_amount")
ax_test.set_xlabel("Distance")
ax_test.set_title("Fare vs Distance (Test Set)")

fig.tight_layout()
plt.show()


In [None]:
# Random forest on the same standardized split.
# random_state makes the fitted forest (and the reported scores) reproducible.
rf_reg = RandomForestRegressor(random_state=0)
# ravel(): the regressor expects a 1-D target; passing the (n, 1) column
# vector triggers a DataConversionWarning.
rf_reg.fit(X_train, y_train.ravel())
print("Training set score: {:.2f}".format(rf_reg.score(X_train, y_train)))
print("Test set score: {:.7f}".format(rf_reg.score(X_test, y_test)))
y_pred = rf_reg.predict(X_test)
df = {'Actual': y_test, 'Predicted': y_pred}
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))

In [None]:
# NOTE(review): this cell repeats the steps of the preceding random-forest cell.
# random_state makes re-running it reproduce the same forest and scores.
rf_reg = RandomForestRegressor(random_state=0)
# ravel(): the regressor expects a 1-D target; passing the (n, 1) column
# vector triggers a DataConversionWarning.
rf_reg.fit(X_train, y_train.ravel())
print("Training set score: {:.2f}".format(rf_reg.score(X_train, y_train)))
print("Test set score: {:.7f}".format(rf_reg.score(X_test, y_test)))
y_pred = rf_reg.predict(X_test)
df = {'Actual': y_test, 'Predicted': y_pred}
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))

In [None]:
import pickle

# Persist the fitted linear model. The context manager closes the file
# handle even if pickling fails, instead of leaving it open.
with open('linear_reg_model.pkl', 'wb') as model_file:
    pickle.dump(l_reg, model_file)

In [None]:
# Reload the model saved above. pickle.load can execute arbitrary code, so
# only load files this notebook produced itself.
with open('linear_reg_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# The model was trained on standardized distance and standardized fare, so a
# raw 10 km distance must be scaled before predicting and the prediction
# scaled back to fare units afterwards. This assumes the scaling cell
# standardized X and y with their full-data mean and (population) standard
# deviation, which is what StandardScaler.fit_transform on X and y does.
distance_km = np.array([[10.0]])
distance_std = (distance_km - X.mean()) / X.std()
fare_std = loaded_model.predict(distance_std)
fare_std * y.std() + y.mean()

In [None]:
# `metrics` is already imported in the notebook's first cell; this re-import is redundant but harmless.
from sklearn import metrics
# NOTE(review): both values are hard-coded; presumably a hand-copied actual fare
# versus a model prediction — confirm where 7 and 6.02 come from.
print('Mean Absolute Error:', metrics.mean_absolute_error([7],[6.02]))