### Read Data and display features

In [None]:
# Initial Python environment setup...
import os # reading the input files we have access to
from typing import List # type hints for annotated variables

import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)

# Directory that holds the taxi-fare data files (train.csv, test.csv, ...).
# NOTE: this is a machine-specific absolute path; point it at your own copy
# of the data before running the notebook.
data_directory_path: str = '/Users/bijanfazeli/Desktop/coding/DataScience/Data/new-york-city-taxi-fare-prediction'
# List the directory contents as a quick check that the expected files exist.
# (The annotation is List[str]; the former `[str]` was a list literal, not a
# type hint.)
data_directory: List[str] = os.listdir(data_directory_path)
print(data_directory)

# Build the file path with os.path.join instead of manual '/' concatenation.
# nrows caps how many rows of train.csv are read into memory.
train_df = pd.read_csv(os.path.join(data_directory_path, "train.csv"), nrows = 55_000_000)
# Last expression of the cell: displays the column dtypes in the notebook.
train_df.dtypes

['test.csv', 'GCP-Coupons-Instructions.rtf', 'train.csv', 'sample_submission.csv']


#### Create two new features in our training set 
- Together they form a "travel vector" between the start and end points of the taxi ride: one component for longitude and one for latitude.
- Take the absolute value since we're only interested in distance traveled.

In [None]:
def add_travel_vector_features(df):
    """Add the absolute pickup-to-dropoff coordinate differences to ``df``.

    Two columns are written onto ``df`` in place:

    * ``abs_diff_longitude`` -- |dropoff_longitude - pickup_longitude|
    * ``abs_diff_latitude``  -- |dropoff_latitude - pickup_latitude|

    Together they represent the "Manhattan vector" from the pickup location
    to the dropoff location. The function returns nothing; the caller's
    dataframe is modified directly.
    """
    longitude_delta = df.dropoff_longitude - df.pickup_longitude
    df['abs_diff_longitude'] = longitude_delta.abs()

    latitude_delta = df.dropoff_latitude - df.pickup_latitude
    df['abs_diff_latitude'] = latitude_delta.abs()

# Add the two absolute-difference columns to the training data. The helper
# assigns the columns onto the dataframe it is given, so train_df is not
# reassigned here.
add_travel_vector_features(train_df)

# Last expression of the cell: displays the first rows of train_df.
train_df.head()

#### Drop rows with missing values

In [None]:
# Report how many missing values each column contains.
missing_per_column = train_df.isnull().sum()
print(missing_per_column)

# Discard every row that has at least one missing value, printing the row
# count before and after so the effect of the drop is visible.
rows_before = len(train_df)
print('Old size: %d' % rows_before)
train_df = train_df.dropna(how = 'any', axis = 'rows')
rows_after = len(train_df)
print('New size: %d' % rows_after)

#### Plot distribution

In [None]:
# Scatter-plot abs_diff_longitude (x) against abs_diff_latitude (y) using only
# the first 2000 rows of train_df.
plot = train_df.iloc[:2000].plot.scatter('abs_diff_longitude', 'abs_diff_latitude')


#### Rows whose absolute longitude or latitude difference is 5.0 or more can safely be removed

In [None]:
# Keep only the rows where both absolute coordinate differences are below
# 5.0, printing the row count before and after the filter.
rows_before = len(train_df)
print('Old size: %d' % rows_before)

longitude_in_range = train_df.abs_diff_longitude < 5.0
latitude_in_range = train_df.abs_diff_latitude < 5.0
train_df = train_df[longitude_in_range & latitude_in_range]

print('New size: %d' % len(train_df))


#### Generate training feature and target

In [None]:
def get_input_matrix(df):
    """Build the N x 3 input matrix for the linear model from ``df``.

    The columns are, in order: ``abs_diff_longitude``, ``abs_diff_latitude``
    and a constant 1.0 that serves as the bias term. ``df`` must already
    contain the two ``abs_diff_*`` columns.
    """
    longitude_column = df.abs_diff_longitude
    latitude_column = df.abs_diff_latitude
    # One 1.0 per row so the model can learn a constant offset.
    bias_column = np.ones(len(df))
    return np.column_stack((longitude_column, latitude_column, bias_column))

# Input matrix for the training rows, built by get_input_matrix.
train_X = get_input_matrix(train_df)
# Regression target: the fare_amount column as a NumPy array.
train_y = np.array(train_df['fare_amount'])

# Print both shapes; the row counts should match.
print(train_X.shape)
print(train_y.shape)

#### Find the optimal weights using lstsq

In [None]:
# np.linalg.lstsq returns several values (solution, residuals, rank, singular
# values); only the solution -- the weight vector w -- is needed here.
(w, _, _, _) = np.linalg.lstsq(train_X, train_y, rcond = None)
print(w)

# The same least-squares weights can be obtained from the normal equations:
#     (X^T X) w = X^T y    i.e.    w = (X^T X)^-1 X^T y
# Solve that small square system with np.linalg.solve instead of forming the
# explicit inverse, and reduce X^T y to a vector first. The earlier
# ((X^T X)^-1 X^T) y ordering built a full features-by-N intermediate matrix
# before multiplying by y, which is wasteful when N is large.
w_OLS = np.linalg.solve(np.matmul(train_X.T, train_X), np.matmul(train_X.T, train_y))
print(w_OLS)

#### Load Test input

In [None]:
# Load the test inputs from the same data directory as the training file.
# The path is built with os.path.join instead of manual '/' concatenation.
test_df = pd.read_csv(os.path.join(data_directory_path, 'test.csv'))
# Last expression of the cell: displays the column dtypes in the notebook.
test_df.dtypes


In [None]:
# Give the test set the same travel-vector columns as the training data and
# turn it into an input matrix with the same helper.
add_travel_vector_features(test_df)
test_X = get_input_matrix(test_df)

# Apply the weights (w) fitted on the training set to obtain one fare_amount
# prediction per test row, rounded to two decimals.
raw_predictions = np.matmul(test_X, w)
test_y_predictions = raw_predictions.round(decimals = 2)

# Assemble the submission table (key + predicted fare) and save it as a CSV
# file which can be submitted to the competition.
submission_columns = ['key', 'fare_amount']
submission = pd.DataFrame(
    {'key': test_df.key, 'fare_amount': test_y_predictions},
    columns = submission_columns)
submission.to_csv('submission.csv', index = False)

# List the working directory, where submission.csv was just written.
print(os.listdir('.'))

# Cross-check: fit scikit-learn's LinearRegression on the same training
# matrix and targets that were used for the lstsq fit.
from sklearn.linear_model import LinearRegression

# NOTE(review): train_X already carries a constant column of ones as its bias
# term; LinearRegression presumably also fits its own intercept by default --
# confirm whether fit_intercept=False was intended.
linear_regression = LinearRegression()
linear_regression.fit(train_X, train_y)

from sklearn import metrics
import numpy as np
# Predictions of the scikit-learn model on the test inputs.
y_prediction = linear_regression.predict(test_X)

# Both arguments are model predictions for the test inputs (no true fares are
# involved), so this value measures how closely the two fits agree with each
# other, not how accurate either model is. test_y_predictions was rounded to
# two decimals when it was computed, so that rounding is included in the
# difference.
mse = metrics.mean_squared_error(test_y_predictions, y_prediction)
rmse = np.sqrt(mse)
print(rmse)