In [None]:
"""
Predict the price of an Uber ride from a given pickup point to the agreed drop-off location. Perform the following tasks:
1.  Pre-process the dataset.
2.  Identify outliers.
3.  Check the correlation.
4.  Implement linear regression and random forest regression models.
5.  Evaluate the models and compare their respective scores, such as R2 and RMSE.
"""


In [None]:
import warnings

import geopy
# Explicit submodule import: `import geopy` alone is not guaranteed to expose
# `geopy.distance`, which the distance computation below relies on.
import geopy.distance
import matplotlib.pyplot as plt
import numpy as np  # for np.sqrt
import pandas as pd  # loading csv
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate

warnings.filterwarnings("ignore")


In [None]:
# Load the raw ride records from the CSV file in the working directory.
df = pd.read_csv(
    "uber.csv",
)


In [None]:
# Discard, in place, every row that has at least one missing value.
df.dropna(axis=0, how="any", inplace=True)


In [None]:
# Keep only rows whose pickup and drop-off coordinates lie strictly inside the
# valid latitude (-90, 90) and longitude (-180, 180) ranges.
valid_coordinates = (
    (df.pickup_latitude < 90) & (df.dropoff_latitude < 90) &
    (df.pickup_latitude > -90) & (df.dropoff_latitude > -90) &
    (df.pickup_longitude < 180) & (df.dropoff_longitude < 180) &
    (df.pickup_longitude > -180) & (df.dropoff_longitude > -180)
)
# .copy() detaches the result from the unfiltered frame, so the column
# assignments below write to an independent DataFrame instead of a slice
# (avoids SettingWithCopy ambiguity).
df = df[valid_coordinates].copy()

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Trip length in kilometres between pickup and drop-off, rounded to 2 decimals.
# Iterating the four columns in lockstep avoids a per-row label lookup and
# stays correct even if the index contains duplicate labels.
df['Distance'] = [
    round(geopy.distance.distance((pickup_lat, pickup_lon),
                                  (dropoff_lat, dropoff_lon)).km, 2)
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude'])
]

df.head()


In [None]:
# Fare against trip distance, drawn on the current axes.
ax = plt.gca()
ax.scatter(df['Distance'], df['fare_amount'])
ax.set_xlabel("Distance")
ax.set_ylabel("fare_amount")


In [None]:
# Outliers and invalid rows.
#
# A row is kept only when 0 < Distance <= MAX_DISTANCE_KM and
# 0 < fare_amount <= MAX_FARE. This is the net effect of the previous chain of
# in-place drops: the extra "Distance > 100" filter and the two compound
# "impossible" filters could never match once the simpler range checks had run.
MAX_DISTANCE_KM = 60
MAX_FARE = 100

distance = df['Distance']
fare = df['fare_amount']

out_of_range = (
    (distance <= 0) | (distance > MAX_DISTANCE_KM) |
    (fare <= 0) | (fare > MAX_FARE)
)

# Single boolean-mask pass; .copy() keeps the result independent of the
# unfiltered frame.
df = df[~out_of_range].copy()


In [None]:
# Same fare-vs-distance view, re-drawn from the current contents of df.
trip_distance = df['Distance']
trip_fare = df['fare_amount']

axes = plt.gca()
axes.scatter(trip_distance, trip_fare)
axes.set_xlabel("Distance")
axes.set_ylabel("fare_amount")


In [None]:
# Pairwise correlation (pandas default method) between the numeric columns.
# Non-numeric columns are excluded explicitly: recent pandas versions raise
# instead of silently skipping them when corr() is called on a mixed frame.
corr = df.select_dtypes(include="number").corr()

# The styled table must be the last expression of the cell to be rendered;
# previously it was computed and discarded, and the plain frame was shown.
corr.style.background_gradient(cmap='BuGn')

In [None]:
# Column vectors of shape (n_rows, 1): the single predictor and the target.
distance_values = df['Distance'].values
fare_values = df['fare_amount'].values

X = np.reshape(distance_values, (-1, 1))  # Independent Variable
y = np.reshape(fare_values, (-1, 1))  # Dependent Variable


In [None]:

# Use one scaler per array. Re-fitting a single StandardScaler on X after
# fitting it on y overwrites the target's mean/scale, which makes it impossible
# to map standardized predictions back to the original fare values.
x_scaler = StandardScaler()
y_scaler = StandardScaler()

y_std = y_scaler.fit_transform(y)
x_std = x_scaler.fit_transform(X)

# Backward-compatible alias: `std` used to end up fitted on X.
std = x_scaler

# NOTE(review): both scalers are fitted on the full data set before the
# train/test split, so test-set statistics leak into the scaling. Consider
# fitting on the training portion only.
# NOTE(review): metrics computed on y_std are in standardized units; use
# y_scaler.inverse_transform(...) to report errors on the original fare scale.

# Show only the leading rows instead of dumping the full arrays.
print(y_std[:5])
print(x_std[:5])


In [None]:
# Hold out 20% of the standardized rows for evaluation; the fixed seed makes
# the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_std,
    y_std,
    test_size=0.2,
    random_state=0,
)


In [None]:
# LinearRegression
# Fit ordinary least squares on the training split and evaluate on the
# held-out split.
model = LinearRegression()
model.fit(X_train, y_train)

predict = model.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order.
rmse_error = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE error for the model is ", rmse_error)

# Coefficient of determination (R2) on the held-out split, as returned by the
# regressor's own score() method.
r2_value = model.score(X_test, y_test)
print("R2 score for the model is ", r2_value)


In [None]:
# RandomForest
# Fit a random forest on the training split and evaluate on the held-out split.
model = RandomForestRegressor(random_state=101)

# The forest expects a 1-D target; passing the (n, 1) column vector triggers a
# DataConversionWarning (currently hidden by the global warnings filter).
model.fit(X_train, y_train.ravel())

predict = model.predict(X_test)
y_test_1d = y_test.ravel()

# scikit-learn metrics expect (y_true, y_pred) in that order.
rmse_error = np.sqrt(mean_squared_error(y_test_1d, predict))
print("RMSE value for Random Forest is:", rmse_error)

# Coefficient of determination (R2) on the held-out split, as returned by the
# regressor's own score() method.
r2_value = model.score(X_test, y_test_1d)
print("R2 score for Random Forest is:", r2_value)
