In [1]:
#!pip install geopy

In [2]:
#!pip install sagemaker-containers

### Importing libraries

In [3]:


import os
import io
import random
import boto3
import pandas as pd
import s3fs
import numpy as np
import sagemaker as sg
import seaborn as sns
import datetime as dt
import geopy.distance
import matplotlib.pyplot as plt
import warnings 

In [4]:
from statsmodels.formula import api
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [5]:
import argparse
import json
import logging
import pickle as pkl

from sagemaker_containers import entry_point
from sagemaker_xgboost_container.data_utils import get_dmatrix
from sagemaker_xgboost_container import distributed

import xgboost as xgb


AttributeError: module 'collections' has no attribute 'Mapping'

In [None]:
# Default size (in inches) for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = [10,6]
# Silences ALL Python warnings for the rest of the session, including
# deprecation and pandas warnings that may point at real problems.
warnings.filterwarnings('ignore')

# AWS execution role and region; `region` is read later by write_to_s3.
role = sg.get_execution_role()
region = boto3.Session().region_name
# Low-level S3 client. NOTE(review): not referenced by any other cell visible
# here -- confirm it is used further down before keeping it.
s3 = boto3.client("s3")

# NOTE(review): the printed role presumably embeds the AWS account id --
# consider clearing this output before sharing the notebook.
print(region)
print(role)

### Read File

In [None]:
# Source location of the raw CSV (bucket / prefix / file name).
bucket_og = "ml-data-sceince-bucket"
prefix_og_file = "uberfare"
filename = "uber.csv"

# Destination bucket used by the upload cells further down.
# NOTE(review): hard-coded bucket name -- confirm it matches the account and
# region this notebook is run in.
bucket = 'sagemaker-us-west-2-920733537674'

# Full S3 URL of the raw file; pandas reads it directly (s3fs is imported above).
data_s3_location = f"s3://{bucket_og}/{prefix_og_file}/{filename}"
df = pd.read_csv(data_s3_location)
df.head()

### Data Preprocessing

In [None]:
# Discard the 'Unnamed: 0' and 'key' columns, then preview the result.
df = df.drop(columns=['Unnamed: 0', 'key'])
display(df.head())

# Name of the column to predict, and every remaining column as a feature.
target = 'fare_amount'
features = [column for column in df.columns if column != target]

# Note: the "features" figure printed here is the total column count
# (df.shape[1]), so it includes the target column.
summary_template = '\n Descriptive: The Datset consists of {} features & {} samples.'
print(summary_template.format(df.shape[1], df.shape[0]))

In [None]:
# Look for missing values: df.info() prints the non-null count and dtype of
# every column, so a count below the total row count indicates nulls.
df.info()

In [None]:
# Clean the coordinates and derive the time and distance features.

# Keep only rows whose coordinates are strictly inside the valid
# latitude (-90, 90) and longitude (-180, 180) ranges. Comparisons against
# NaN are False, so rows with missing coordinates are dropped here as well.
# .copy() makes the result an independent frame: the column assignments below
# would otherwise write into a slice of the previous frame
# (SettingWithCopyWarning, hidden by the global warnings filter).
df = df[(df.pickup_latitude<90) & (df.dropoff_latitude<90) &
        (df.pickup_latitude>-90) & (df.dropoff_latitude>-90) &
        (df.pickup_longitude<180) & (df.dropoff_longitude<180) &
        (df.pickup_longitude>-180) & (df.dropoff_longitude>-180)].copy()

# Parse the pickup timestamp so the .dt accessors below are available.
df['pickup_datetime'] = pd.to_datetime(df.pickup_datetime)

df['year'] = df.pickup_datetime.dt.year
df['month'] = df.pickup_datetime.dt.month
df['weekday'] = df.pickup_datetime.dt.weekday
df['hour'] = df.pickup_datetime.dt.hour

# Pickup -> dropoff distance in metres (geopy's `.m`), rounded to 2 decimals.
# Iterating the four coordinate columns in lockstep avoids four label lookups
# per row and does not depend on the index labels being unique.
df['Distance'] = [
    round(geopy.distance.distance((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon)).m, 2)
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        df.pickup_latitude,
        df.pickup_longitude,
        df.dropoff_latitude,
        df.dropoff_longitude,
    )
]

# The raw timestamp and the month column are not kept as features.
df = df.drop(columns=['pickup_datetime', 'month'])

# Snapshot of the engineered frame (coordinates still present) taken before
# later cells remove more columns.
original_df = df.copy(deep=True)

df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column -- a quick way to spot implausible values such as extreme fares or
# distances.

display(df.describe())

In [None]:
# Remove every column whose name mentions latitude or longitude
# (case-insensitive match on the column name).
coordinate_columns = df.filter(regex=r'(?i)latitude|longitude').columns
df = df.drop(columns=coordinate_columns)
df.head()

### Data Upload and Write Functions

In [None]:

def write_to_s3(fobj, bucket, key):
    """Upload a file object to the S3 object ``key`` in ``bucket``.

    A new boto3 session is created for the notebook-level ``region`` on every
    call.

    Args:
        fobj: File-like object passed to ``upload_fileobj``.
        bucket: Name of the destination S3 bucket.
        key: Full object key inside the bucket.

    Returns:
        Whatever ``upload_fileobj`` returns.
    """
    s3_resource = boto3.Session(region_name=region).resource("s3")
    destination = s3_resource.Bucket(bucket).Object(key)
    return destination.upload_fileobj(fobj)


def upload_to_s3(bucket, channel, filename):
    """Upload a local file to ``s3://<bucket>/data/uberfare/<channel>``.

    Args:
        bucket: Name of the destination S3 bucket.
        channel: Key suffix appended to the fixed ``data/uberfare`` prefix,
            e.g. ``"train/train_0.csv"``.
        filename: Path of the local file to upload; it is opened in binary mode.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    prefix = "data/uberfare"
    key = prefix + "/" + channel

    # The object is stored at exactly `key`; the local file name is not part
    # of the destination, so it must not appear in the reported URL.
    url = "s3://{}/{}".format(bucket, key)
    print("Writing to {}".format(url))

    # Context manager guarantees the handle is closed even if the upload fails.
    with open(filename, "rb") as fobj:
        write_to_s3(fobj, bucket, key)

In [None]:
# Write the cleaned frame to a local CSV and upload it under data/uberfare/
# in `bucket`.
session = sg.Session()
file_name = "cleaned_dataset.csv"
# NOTE(review): to_csv with default arguments writes a header row and the
# DataFrame index as an unnamed first column. The same file is later split
# line-by-line by data_split, so the header line is treated as a record there.
# Confirm whether index=False / header=False is what training expects.
df.to_csv(file_name)
session.upload_data(path= file_name, bucket=bucket, key_prefix= "data/uberfare")

### Data Splitting

In [None]:

def data_split(
    FILE_DATA,
    DATA_DIR,
    FILE_TRAIN_BASE,
    FILE_TRAIN_1,
    FILE_VALIDATION,
    FILE_TEST,
    PERCENT_TRAIN_0,
    PERCENT_TRAIN_1,
    PERCENT_VALIDATION,
    PERCENT_TEST,
):
    """Randomly partition the lines of a text file into four output files.

    Every line of ``FILE_DATA`` (including a header line, if the file has one)
    is treated as one record. Records are drawn without replacement using the
    ``random`` module, so call ``random.seed`` beforehand for a reproducible
    split. Each output receives ``int(percent / 100 * total_lines)`` records;
    lines left over after truncation are not written anywhere.

    Args:
        FILE_DATA: Path of the input file.
        DATA_DIR: Existing directory the four output files are written to.
        FILE_TRAIN_BASE: File name of the first training split.
        FILE_TRAIN_1: File name of the second training split.
        FILE_VALIDATION: File name of the validation split.
        FILE_TEST: File name of the test split.
        PERCENT_TRAIN_0: Percentage of lines for the first training split.
        PERCENT_TRAIN_1: Percentage of lines for the second training split.
        PERCENT_VALIDATION: Percentage of lines for the validation split.
        PERCENT_TEST: Percentage of lines for the test split.

    Raises:
        ValueError: If the percentages ask for more lines than the file has.
        OSError: If the input or an output file cannot be opened.
    """
    with open(FILE_DATA, "r") as source:
        data = source.readlines()

    num_of_data = len(data)
    percentages = (PERCENT_TRAIN_0, PERCENT_TRAIN_1, PERCENT_VALIDATION, PERCENT_TEST)
    data_fractions = [int((percent / 100.0) * num_of_data) for percent in percentages]

    # Fail early with a clear message instead of a cryptic randint error once
    # the pool of remaining lines runs dry.
    if sum(data_fractions) > num_of_data:
        raise ValueError(
            "Split percentages {} request {} lines but only {} are available".format(
                percentages, sum(data_fractions), num_of_data
            )
        )

    # Use the FILE_TRAIN_BASE parameter for the first split; the function must
    # not depend on a notebook-level FILE_TRAIN_0 variable being defined.
    output_names = (FILE_TRAIN_BASE, FILE_TRAIN_1, FILE_VALIDATION, FILE_TEST)

    # Draw all four splits first (same draw order as before: split by split),
    # then write them, so no output file is left half-written by a sampling
    # error.
    split_data = []
    for fraction in data_fractions:
        chosen = []
        for _ in range(fraction):
            rand_data_ind = random.randint(0, len(data) - 1)
            chosen.append(data.pop(rand_data_ind))
        split_data.append(chosen)

    for output_name, lines in zip(output_names, split_data):
        with open(DATA_DIR + "/" + output_name, "w") as out_file:
            out_file.writelines(lines)


In [None]:
# Local CSV to split (written by the df.to_csv cell above).
FILE_DATA = "cleaned_dataset.csv"


# Output file names for the train/validation/test splits, and the percentage
# of input lines each one receives. The four percentages add up to 100.
FILE_TRAIN_0 = "train_0"
FILE_TRAIN_1 = "train_1"
FILE_VALIDATION = "validation"
FILE_TEST = "test"
PERCENT_TRAIN_0 = 35
PERCENT_TRAIN_1 = 35
PERCENT_VALIDATION = 15
PERCENT_TEST = 15

# Local directory the split files are written to.
DATA_DIR = "data/uberfare"

In [None]:

# DATA_DIR is a nested path ("data/uberfare"): os.mkdir would raise
# FileNotFoundError when the parent "data" directory is missing. makedirs
# creates all intermediate directories and exist_ok makes the cell re-runnable.
os.makedirs(DATA_DIR, exist_ok=True)

# Randomly split the cleaned CSV into two training files, a validation file
# and a test file inside DATA_DIR.
data_split(
    FILE_DATA,
    DATA_DIR,
    FILE_TRAIN_0,
    FILE_TRAIN_1,
    FILE_VALIDATION,
    FILE_TEST,
    PERCENT_TRAIN_0,
    PERCENT_TRAIN_1,
    PERCENT_VALIDATION,
    PERCENT_TEST,
)

In [None]:
# Push each local split file to its channel key under data/uberfare/ in the
# bucket, in the order train_0, train_1, validation, test.
for channel_key, local_name in (
    ("train/train_0.csv", FILE_TRAIN_0),
    ("train/train_1.csv", FILE_TRAIN_1),
    ("validation/validation.csv", FILE_VALIDATION),
    ("test/test.csv", FILE_TEST),
):
    upload_to_s3(bucket, channel_key, DATA_DIR + "/" + local_name)

In [None]:
# Upload the split files directly under data/uberfare/ (no channel sub-folder).
# upload_to_s3 takes (bucket, channel, filename); the local file path is a
# required third argument, so it is passed explicitly here.
# NOTE(review): this stores a second, flat copy of files that are also
# uploaded under train/, validation/ and test/ -- confirm both layouts are
# needed, otherwise this cell can be dropped.
upload_to_s3(bucket, "train_0.csv", DATA_DIR + "/" + FILE_TRAIN_0)
upload_to_s3(bucket, "train_1.csv", DATA_DIR + "/" + FILE_TRAIN_1)
upload_to_s3(bucket, "validation.csv", DATA_DIR + "/" + FILE_VALIDATION)
upload_to_s3(bucket, "test.csv", DATA_DIR + "/" + FILE_TEST)