In [1]:
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict

zipped_data_path = "../data/raw_data/ucsd-cse-151b-class-competition.zip"

# Read every CSV member of the competition archive into memory, keyed by its
# member name inside the archive.
dataframes = defaultdict(pd.DataFrame)
with zipfile.ZipFile(zipped_data_path, "r") as zipped:
    for filename in zipped.namelist():
        if not filename.endswith(".csv"):
            continue
        with zipped.open(filename) as f:
            dataframes[filename] = pd.read_csv(io.TextIOWrapper(f))

        # Lets take a look at the files
        print(f"FILE: {filename}")
        # If you want to see file info uncomment this:
        # display(dataframes[filename].info())
        # display(dataframes[filename].head())

# `dataframes` is a defaultdict, so indexing a key that was never loaded would
# quietly hand back an empty DataFrame. Fail loudly here instead, before the
# rest of the notebook runs on empty data.
missing_files = [name for name in ("train.csv", "test_public.csv") if name not in dataframes]
if missing_files:
    raise KeyError(f"Expected file(s) not found in {zipped_data_path}: {missing_files}")

train_data = dataframes["train.csv"]
test_data  = dataframes["test_public.csv"]

FILE: metaData_taxistandsID_name_GPSlocation.csv
FILE: sampleSubmission.csv
FILE: test_public.csv
FILE: train.csv


In [12]:
# Taxi-stand metadata table loaded from the competition archive; add_meta_taxi_data
# reads its "Latitude" and "Longitude" columns.
TAXI_METADATA = dataframes["metaData_taxistandsID_name_GPSlocation.csv"]

In [2]:
# Overview of the raw training frame: row count, column dtypes and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1710670 entries, 0 to 1710669
Data columns (total 9 columns):
 #   Column        Dtype  
---  ------        -----  
 0   TRIP_ID       int64  
 1   CALL_TYPE     object 
 2   ORIGIN_CALL   float64
 3   ORIGIN_STAND  float64
 4   TAXI_ID       int64  
 5   TIMESTAMP     int64  
 6   DAY_TYPE      object 
 7   MISSING_DATA  bool   
 8   POLYLINE      object 
dtypes: bool(1), float64(2), int64(3), object(3)
memory usage: 106.0+ MB


## Calculate New Features
Add travel time

In [3]:
import json

def calculate_travel_time(polyline: str) -> int:
    """
    Calculates the travel time of a trip. Is defined as
    (number of points - 1) * 15 seconds.

    A polyline with no points (``"[]"``) has no measurable duration, so it
    yields 0 rather than a negative time.

    :param polyline: The polyline of the trip, as a JSON-encoded list of points.
    :return: The travel time of the trip in seconds (never negative).
    """
    num_points = len(json.loads(polyline))
    # Clamp so an empty polyline does not produce (0 - 1) * 15 = -15 seconds.
    return max(num_points - 1, 0) * 15

# Derive each trip's duration (in seconds) from its POLYLINE string.
train_data["TRAVEL_TIME"] = train_data["POLYLINE"].apply(calculate_travel_time)

In [None]:
def add_meta_taxi_data(x):
    """Return the ``"<Latitude>,<Longitude>"`` string for a taxi-stand value.

    The stand value ``x`` is mapped to the TAXI_METADATA row labelled
    ``x - 1``. A NaN value (no stand recorded) yields ``None``.

    :param x: Stand value from the ORIGIN_STAND column; may be NaN.
    :return: Comma-joined latitude/longitude string, or None when ``x`` is NaN.
    """
    if math.isnan(x):
        return None

    row_label = x - 1
    latitude = TAXI_METADATA.at[row_label, "Latitude"]
    longitude = TAXI_METADATA.at[row_label, "Longitude"]
    return ",".join((str(latitude), str(longitude)))

# Look up the stand coordinates for every trip; trips without an ORIGIN_STAND
# (NaN) get no location because the helper returns None for them.
train_data["START_LOCATION"] = train_data['ORIGIN_STAND'].apply(add_meta_taxi_data)
train_data.head()

In [18]:
def encode(data, col, max_val):
    """Add sine and cosine encodings of a cyclical column to ``data``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``
    itself, computed from the angle ``2 * pi * data[col] / max_val``.

    :param data: DataFrame containing ``col``; it is modified in place.
    :param col: Name of the column to encode.
    :param max_val: Length of the cycle (e.g. 24 for hours).
    :return: The same ``data`` object, with the two new columns added.
    """
    for suffix, trig in (("_sin", np.sin), ("_cos", np.cos)):
        data[col + suffix] = trig(2 * np.pi * data[col] / max_val)
    return data

from datetime import datetime, timezone
def parse_time(x):
  """Split a Unix timestamp into (year, month, day, hour, weekday).

  The timestamp is interpreted in UTC. Without an explicit ``tz``,
  ``datetime.fromtimestamp`` converts to the local timezone of whatever
  machine runs the notebook, so the derived features would change from one
  machine to the next.

  :param x: Row-like object whose "TIMESTAMP" entry is a POSIX timestamp in seconds.
  :return: Tuple ``(year, month, day, hour, weekday)``, where weekday is 0 for Monday.
  """
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp

  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"], tz=timezone.utc)
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# parse_time returns a (year, month, day, hour, weekday) tuple per row. With
# result_type="expand" each tuple element becomes its own column, which lets us
# assign all five new columns in one go.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
time_parts = train_data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
train_data[["YR", "MON", "DAY", "HR", "WK"]] = time_parts

# Add sin/cos encodings for each cyclical time column, using its cycle length.
for column, period in (("MON", 12), ("DAY", 31), ("HR", 24), ("WK", 7)):
    train_data = encode(train_data, column, period)

In [20]:
# Boolean flag: True for trips whose parsed year (YR) is 2013.
train_data["YR_2013"] = train_data["YR"] == 2013

## BAD DATA
Some trips appear to have been recorded incorrectly. We use geopy to measure the distance between consecutive GPS points and filter out trips where the taxi would have to be driving at implausible speeds.

In [4]:
# Remove missing data: keep only the rows whose MISSING_DATA flag is not True.
train_data = train_data[train_data["MISSING_DATA"] != True]

In [5]:
# Remove empty polyline: drop rows whose POLYLINE is the literal string "[]",
# i.e. trips with no recorded points.
train_data = train_data[train_data["POLYLINE"] != "[]"]

In [6]:
import matplotlib.pyplot as plt

travel_times = train_data["TRAVEL_TIME"]
mean = travel_times.mean()
std = travel_times.std()
median = travel_times.median()

outlier_threshold = 3

# Keep only trips whose travel time is below mean + 3 standard deviations.
# Only the upper tail is cut; there is no lower bound. Without this, the few
# extremely long taxi trips in the dataset would squash any plot of the data.
is_not_outlier = travel_times < mean + outlier_threshold * std
train_data = train_data[is_not_outlier]

In [8]:
from geopy.distance import distance

speed_limit = 160  # Speed threshold in km/h; a trip is flagged when any segment exceeds it
# Consecutive polyline points are 15 seconds apart, and 15 / 3600 is that interval in hours.
# time_diff is the reciprocal of the interval, so speed (km/h) = distance (km) * time_diff
# uses a multiplication instead of a division.
time_diff = 1 / (15 / 3600)

def calculate_speed(latitude_1, longitude_1, latitude_2, longitude_2):
    """Speed between two consecutive GPS points.

    The distance between the points is taken in kilometres from geopy's
    ``distance`` and multiplied by the module-level ``time_diff`` factor.

    :param latitude_1: Latitude of the first point.
    :param longitude_1: Longitude of the first point.
    :param latitude_2: Latitude of the second point.
    :param longitude_2: Longitude of the second point.
    :return: Distance in km multiplied by ``time_diff``.
    """
    start_point = (latitude_1, longitude_1)
    end_point = (latitude_2, longitude_2)
    return distance(start_point, end_point).km * time_diff

def calculate_speeds_with_missing_data(row):
    """Flag a trip whose polyline implies an implausibly high speed.

    The row's POLYLINE is parsed into a list of points and the speed of each
    segment between consecutive points is computed with ``calculate_speed``.
    ``row['BAD_DATA']`` is set to True as soon as one segment exceeds
    ``speed_limit``, and to False when none does (including polylines with
    fewer than two points).

    :param row: Row with a 'POLYLINE' entry holding a JSON-encoded list of points.
    :return: The same row with a boolean 'BAD_DATA' entry added.
    """
    # Parse with json.loads rather than eval: POLYLINE is text read from a CSV,
    # and eval would execute whatever expression that text contains. This also
    # matches how calculate_travel_time parses the same column (the json module
    # is imported earlier in the notebook).
    coordinates = json.loads(row['POLYLINE'])

    # Index 1 of each point is passed as latitude and index 0 as longitude.
    # any() stops at the first segment over the limit.
    row['BAD_DATA'] = any(
        calculate_speed(
            coordinates[i][1], coordinates[i][0],
            coordinates[i + 1][1], coordinates[i + 1][0],
        ) > speed_limit
        for i in range(len(coordinates) - 1)
    )
    return row

# Run the speed check once per row (axis=1); each returned row carries a new BAD_DATA entry.
train_data = train_data.apply(calculate_speeds_with_missing_data, axis=1)

In [9]:
# Count how many trips were flagged (True) versus not flagged (False).
train_data["BAD_DATA"].value_counts()

BAD_DATA
False    1544329
True      142533
Name: count, dtype: int64

In [10]:
# Keep only the trips that were not flagged as BAD_DATA.
train_data = train_data[train_data["BAD_DATA"] != True]

## Clean up
remove unnecessary cols

In [25]:
# Raw and intermediate columns that are removed from the training frame here.
columns_to_drop = [
    'DAY_TYPE', 'MISSING_DATA',
    'YR', 'WK', 'MON', 'DAY', 'HR',
    'TIMESTAMP', 'BAD_DATA',
]
train_data.drop(columns=columns_to_drop, inplace=True)

In [None]:
# There are some START_LOCATIONS THAT ARE MESSED UP:
# overwrite START_LOCATION for every trip from ORIGIN_STAND 41 with a hard-coded
# "latitude,longitude" string.
# NOTE(review): presumably the metadata row for stand 41 is malformed — confirm against TAXI_METADATA.
train_data.loc[train_data["ORIGIN_STAND"] == 41, "START_LOCATION"] = "41.163066654,-8.67598304213"

## Test set
gotta prep the test set now

In [29]:
# Mirror the training-set feature engineering on the test set.
test_data["START_LOCATION"] = test_data['ORIGIN_STAND'].apply(add_meta_taxi_data)
# The training set overrides the START_LOCATION of ORIGIN_STAND 41 with a
# hard-coded value because the looked-up one is wrong. The test set goes through
# the same lookup, so it needs the same correction; otherwise that stand is
# encoded differently in train and test.
test_data.loc[test_data["ORIGIN_STAND"] == 41, "START_LOCATION"] = "41.163066654,-8.67598304213"

# Expand each timestamp into year / month / day / hour / weekday columns.
test_data[["YR", "MON", "DAY", "HR", "WK"]] = test_data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

# Sin/cos encodings of the cyclical time columns, with the same cycle lengths as for training.
test_data = encode(test_data, 'MON', 12)
test_data = encode(test_data, 'DAY', 31)
test_data = encode(test_data, 'HR', 24)
test_data = encode(test_data, 'WK', 7)

test_data["YR_2013"] = test_data["YR"] == 2013

# Drop the raw and intermediate columns, as is done for the training frame.
test_data.drop(['DAY_TYPE', 'MISSING_DATA', 'YR', 'WK', 'MON', 'DAY', 'HR', 'TIMESTAMP'], axis=1, inplace=True)

## Ready to split and save

In [27]:
# Partition the cleaned training data into one frame per CALL_TYPE value.
call_type = train_data["CALL_TYPE"]
train_data_call_type_A = train_data[call_type == "A"]
train_data_call_type_B = train_data[call_type == "B"]
train_data_call_type_C = train_data[call_type == "C"]

# For some reason call_type_B has rows with origin_stand == NaN
has_origin_stand = train_data_call_type_B["ORIGIN_STAND"].notna()
train_data_call_type_B = train_data_call_type_B[has_origin_stand]

# lens of diff call type datasets
print(len(train_data_call_type_A), len(train_data_call_type_B), len(train_data_call_type_C))

346466 731284 466579


In [30]:
modified_train_zip = '../data/clean_data/class-competition-cleaned.zip'

# Write each cleaned frame as a CSV member of a single archive.
# - The handle is not named `zip`: a `with ... as zip` at notebook top level
#   rebinds the builtin `zip` for every cell run afterwards.
# - ZIP_DEFLATED compresses the members; the ZipFile default (ZIP_STORED) would
#   store the CSV text uncompressed. Readers decompress transparently.
with zipfile.ZipFile(modified_train_zip, 'w', compression=zipfile.ZIP_DEFLATED) as cleaned_zip:
    for member_name, frame in (
        ("train_call_type_A.csv", train_data_call_type_A),
        ("train_call_type_B.csv", train_data_call_type_B),
        ("train_call_type_C.csv", train_data_call_type_C),
        ("test_public.csv", test_data),
    ):
        cleaned_zip.writestr(member_name, frame.to_csv(index=False))