In [5]:
import zipfile
import io
import pandas as pd
from IPython.display import display
from collections import defaultdict
from datetime import datetime


# zipped_data_path = "../data/raw_data/ucsd-cse-151b-class-competition.zip"

# dataframes = defaultdict(pd.DataFrame)
# with zipfile.ZipFile(zipped_data_path, "r") as zip:
#     for filename in zip.namelist():
#         if filename.endswith(".csv"):
#             with zip.open(filename) as f:
#                 dataframes.update({ filename : pd.read_csv(io.TextIOWrapper(f)) })

#                 # Lets take a look at the files
#                 print(f"FILE: {filename}")
#                 display(dataframes[filename].info())
#                 display(dataframes[filename].head())

In [None]:
TAXI_METADATA = dataframes["metaData_taxistandsID_name_GPSlocation.csv"]
SAMPLE_SUBMISSION_DF = dataframes["sample_submission.csv"]
TEST_PUBLIC_DF = dataframes["test_public.csv"]
TRAIN_DF = dataframes["train.csv"]

## **train.csv**

| Column Name | Description |
| --- | --- |
| TRIP_ID | (String) - An unique identifier for each trip |
| CALL_TYPE | (char) - Category of the ride. It may contain one of three possible values: 'A' if this trip was dispatched from the central, 'B' if this trip was demanded directly to a taxi driver on a specific stand, 'C' otherwise (i.e. a trip demanded on a random street) |
| ORIGIN_CALL | (integer) - An unique identifier for the phone number to call the taxi. It identifies the trip's customer if CALL_TYPE='A'. Otherwise, it assumes a NULL value |
| ORIGIN_STAND | (integer) - An unique identifier for the taxi stand. It identifies the starting point of the trip if CALL_TYPE='B'. Otherwise, it assumes a NULL value |
| TAXI_ID | (integer) - An unique identifier for the taxi that performed each trip |
| TIMESTAMP | (integer) - Unix Timestamp (in seconds). It identifies the trip's start |
| DAY_TYPE | (char) - It identifies the daytype of the trip's start. It assumes one of three possible values: 'A' if this trip started on a normal day or weekend, 'B' if this trip started on a holiday or other special day, 'C' if the trip started on a day before a type-B day |
| MISSING_DATA | (Boolean) - It is FALSE when the GPS data stream is complete and TRUE whenever one (or more) locations are missing |
| POLYLINE | (String) - A list of GPS coordinates (i.e. WGS84 format) mapped as a string. The beginning and the end of the string are identified with brackets (i.e. [ and ]). Each pair of coordinates is also identified by the same brackets as [LONGITUDE, LATITUDE]. The coordinates were recorded every 15 seconds during the trip. The first item represents the starting point and the last item corresponds to the destination |

In [None]:
TRAIN_DF.describe()

Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP
count,1710670.0,364770.0,806579.0,1710670.0,1710670.0
mean,1.388622e+18,24490.363018,30.272381,20000350.0,1388622000.0
std,9180944000000000.0,19624.290043,17.74784,211.2405,9180944.0
min,1.372637e+18,2001.0,1.0,20000000.0,1372637000.0
25%,1.380731e+18,6593.0,15.0,20000170.0,1380731000.0
50%,1.388493e+18,18755.0,27.0,20000340.0,1388493000.0
75%,1.39675e+18,40808.0,49.0,20000520.0,1396750000.0
max,1.404173e+18,63884.0,63.0,20000980.0,1404173000.0


In [None]:
TRAIN_DF.isnull().sum()

TRIP_ID               0
CALL_TYPE             0
ORIGIN_CALL     1345900
ORIGIN_STAND     904091
TAXI_ID               0
TIMESTAMP             0
DAY_TYPE              0
MISSING_DATA          0
POLYLINE              0
dtype: int64

In [None]:
# How many GPS data streams are complete vs incomplete
TRAIN_DF["MISSING_DATA"].value_counts()

MISSING_DATA
False    1710660
True          10
Name: count, dtype: int64

In [None]:
# Lets remove the missing data
TRAIN_DF = TRAIN_DF[TRAIN_DF["MISSING_DATA"] != True]
TRAIN_DF["MISSING_DATA"].value_counts()

MISSING_DATA
False    1710660
Name: count, dtype: int64

In [None]:
# Percentage of each call type
TRAIN_DF["CALL_TYPE"].value_counts(normalize=True)

CALL_TYPE
B    0.478107
C    0.308660
A    0.213233
Name: proportion, dtype: float64

In [None]:
# Percentage of each day type
TRAIN_DF["DAY_TYPE"].value_counts(normalize=True)

DAY_TYPE
A    1.0
Name: proportion, dtype: float64

In [None]:
import json

def calculate_travel_time(polyline: str) -> int:
    """
    Calculates the travel time of a trip. Is defined as
    (number of points - 1) * 15 seconds.
    
    :param polyline: The polyline of the trip.
    :return: The travel time of the trip.
    """
    return (len(json.loads(polyline)) - 1) * 15

TRAIN_DF["TRAVEL_TIME"] = TRAIN_DF["POLYLINE"].apply(calculate_travel_time)
TRAIN_DF.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TRAIN_DF["TRAVEL_TIME"] = TRAIN_DF["POLYLINE"].apply(calculate_travel_time)


Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,TRAVEL_TIME
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420


In [None]:
# Lets split DATA_TYPE and CALL_TYPE into three new binary features 
# i.e. if DATA_TYPE = 'A' -> DATA_TYPE_A = 1, DATA_TYPE_B = 0, DATA_TYPE_C = 0
#day_type_dummies = pd.get_dummies(TRAIN_DF['DAY_TYPE'], prefix='DAY_TYPE')
#missing_cols = set(['DAY_TYPE_A', 'DAY_TYPE_B', 'DAY_TYPE_C']) - set(day_type_dummies.columns)
#for col in missing_cols:
#    day_type_dummies[col] = False
#TRAIN_DF = pd.concat([TRAIN_DF, day_type_dummies], axis=1)

call_type_dummies = pd.get_dummies(TRAIN_DF['CALL_TYPE'], prefix='CALL_TYPE')
missing_cols = set(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']) - set(call_type_dummies.columns)
for col in missing_cols:
    call_type_dummies[col] = False
TRAIN_DF = pd.concat([TRAIN_DF, call_type_dummies], axis=1)

# delete the old columns now that we don't need them anymore
TRAIN_DF.drop('DAY_TYPE', axis=1, inplace=True)
TRAIN_DF.drop('CALL_TYPE', axis=1, inplace=True)

TRAIN_DF.head()

Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,MISSING_DATA,POLYLINE,TRAVEL_TIME,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C
0,1372636858620000589,,,20000589,1372636858,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,False,False,True
1,1372637303620000596,,7.0,20000596,1372637303,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,False,True,False
2,1372636951620000320,,,20000320,1372636951,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,False,False,True
3,1372636854620000520,,,20000520,1372636854,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,False,False,True
4,1372637091620000337,,,20000337,1372637091,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,False,False,True


In [None]:
# Lets try to get some additional features from the timestamp column
TRAIN_DF['TIMESTAMP'] = pd.to_datetime(TRAIN_DF['TIMESTAMP'], unit='s')

# Extract year, month, day of the week, hour, and minute
TRAIN_DF['YEAR'] = TRAIN_DF['TIMESTAMP'].dt.year
TRAIN_DF['MONTH'] = TRAIN_DF['TIMESTAMP'].dt.month
TRAIN_DF['DAY_OF_WEEK'] = TRAIN_DF['TIMESTAMP'].dt.dayofweek
TRAIN_DF['HOUR'] = TRAIN_DF['TIMESTAMP'].dt.hour

# One-hot encode year, month, day of the week, and hour
TRAIN_DF = pd.concat([TRAIN_DF, pd.get_dummies(TRAIN_DF['YEAR'], prefix='YEAR')], axis=1)
TRAIN_DF = pd.concat([TRAIN_DF, pd.get_dummies(TRAIN_DF['MONTH'], prefix='MONTH')], axis=1)
TRAIN_DF = pd.concat([TRAIN_DF, pd.get_dummies(TRAIN_DF['DAY_OF_WEEK'], prefix='DAY_OF_WEEK')], axis=1)
TRAIN_DF = pd.concat([TRAIN_DF, pd.get_dummies(TRAIN_DF['HOUR'], prefix='HOUR')], axis=1)


# Drop the original TIMESTAMP and DAY_OF_WEEK columns
TRAIN_DF.drop(['TIMESTAMP', 'DAY_OF_WEEK', 'YEAR', 'MONTH', 'HOUR'], axis=1, inplace=True)

# We could do referene encoding to speed up train time in the future 
# For day of the week, Sunday is 0,0,0,0,0,0 - Monday is 1,0,0,0,0,0 - Tuesday is 0,1,0,0,0,0 - etc.
# TRAIN_DF.drop(['DAY_OF_WEEK_0', 'MONTH_1', 'HOUR_0', 'YEAR_2013'], axis=1, inplace=True)

pd.set_option('display.max_columns', 100)
print(TRAIN_DF.columns)

Index(['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'MISSING_DATA',
       'POLYLINE', 'TRAVEL_TIME', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C',
       'YEAR_2013', 'YEAR_2014', 'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4',
       'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10',
       'MONTH_11', 'MONTH_12', 'DAY_OF_WEEK_0', 'DAY_OF_WEEK_1',
       'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5',
       'DAY_OF_WEEK_6', 'HOUR_0', 'HOUR_1', 'HOUR_2', 'HOUR_3', 'HOUR_4',
       'HOUR_5', 'HOUR_6', 'HOUR_7', 'HOUR_8', 'HOUR_9', 'HOUR_10', 'HOUR_11',
       'HOUR_12', 'HOUR_13', 'HOUR_14', 'HOUR_15', 'HOUR_16', 'HOUR_17',
       'HOUR_18', 'HOUR_19', 'HOUR_20', 'HOUR_21', 'HOUR_22', 'HOUR_23'],
      dtype='object')


In [None]:
# TEST_PUBLIC_CSV feature extraction
call_type_dummies = pd.get_dummies(TEST_PUBLIC_DF['CALL_TYPE'], prefix='CALL_TYPE')
missing_cols = set(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']) - set(call_type_dummies.columns)
for col in missing_cols:
    call_type_dummies[col] = False
TEST_PUBLIC_DF = pd.concat([TEST_PUBLIC_DF, call_type_dummies], axis=1)

# delete the old columns now that we don't need them anymore
TEST_PUBLIC_DF.drop('DAY_TYPE', axis=1, inplace=True)
TEST_PUBLIC_DF.drop('CALL_TYPE', axis=1, inplace=True)

TEST_PUBLIC_DF.head()

Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,MISSING_DATA,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C
0,T1,,15.0,20000542,1408039037,False,False,True,False
1,T2,,57.0,20000108,1408038611,False,False,True,False
2,T3,,15.0,20000370,1408038568,False,False,True,False
3,T4,,53.0,20000492,1408039090,False,False,True,False
4,T5,,18.0,20000621,1408039177,False,False,True,False


In [None]:
# Lets try to get some additional features from the timestamp column
hour_columns = ['HOUR_' + str(i) for i in range(24)]
day_columns = ['DAY_OF_WEEK_' + str(i) for i in range(7)]
month_columns = ['MONTH_' + str(i) for i in range(1,13)]
year_columns = ['YEAR_2013', 'YEAR_2014']

TEST_PUBLIC_DF['TIMESTAMP'] = pd.to_datetime(TEST_PUBLIC_DF['TIMESTAMP'], unit='s')

# Extract year, month, day of the week, hour, and minute
TEST_PUBLIC_DF['YEAR'] = TEST_PUBLIC_DF['TIMESTAMP'].dt.year
TEST_PUBLIC_DF['MONTH'] = TEST_PUBLIC_DF['TIMESTAMP'].dt.month
TEST_PUBLIC_DF['DAY_OF_WEEK'] = TEST_PUBLIC_DF['TIMESTAMP'].dt.dayofweek
TEST_PUBLIC_DF['HOUR'] = TEST_PUBLIC_DF['TIMESTAMP'].dt.hour

# One-hot encode year, month, day of the week, and hour
TEST_PUBLIC_DF = pd.concat([TEST_PUBLIC_DF, pd.get_dummies(TEST_PUBLIC_DF['YEAR'], prefix='YEAR')], axis=1)
TEST_PUBLIC_DF = pd.concat([TEST_PUBLIC_DF, pd.get_dummies(TEST_PUBLIC_DF['MONTH'], prefix='MONTH')], axis=1)
TEST_PUBLIC_DF = pd.concat([TEST_PUBLIC_DF, pd.get_dummies(TEST_PUBLIC_DF['DAY_OF_WEEK'], prefix='DAY_OF_WEEK')], axis=1)
TEST_PUBLIC_DF = pd.concat([TEST_PUBLIC_DF, pd.get_dummies(TEST_PUBLIC_DF['HOUR'], prefix='HOUR')], axis=1)

# Fill missing dummy columns with False
for column in hour_columns:
    if column not in TEST_PUBLIC_DF.columns:
        TEST_PUBLIC_DF[column] = False

for column in day_columns:
    if column not in TEST_PUBLIC_DF.columns:
        TEST_PUBLIC_DF[column] = False

for column in month_columns:
    if column not in TEST_PUBLIC_DF.columns:
        TEST_PUBLIC_DF[column] = False

for column in year_columns:
    if column not in TEST_PUBLIC_DF.columns:
        TEST_PUBLIC_DF[column] = False

# Drop the original TIMESTAMP and DAY_OF_WEEK columns
TEST_PUBLIC_DF.drop(['TIMESTAMP', 'DAY_OF_WEEK', 'YEAR', 'MONTH', 'HOUR'], axis=1, inplace=True)
TEST_PUBLIC_DF = TEST_PUBLIC_DF.reindex(columns=TRAIN_DF.columns)
TEST_PUBLIC_DF = TEST_PUBLIC_DF.drop("TRAVEL_TIME", axis=1)

In [None]:
modified_train_zip = '../data/clean_data/class-competition.zip'

with zipfile.ZipFile(modified_train_zip, 'w') as zip:
    zip.writestr("train.csv", TRAIN_DF.to_csv(index=False))
    zip.writestr("test_public.csv", TEST_PUBLIC_DF.to_csv(index=False))

In [None]:
def hour_similarity_score(hour1, hour2):
    return (12 - min(abs(hour1-hour2), 24 - abs(hour1 - hour2)))/12
def day_of_week_similarity_score(day1, day2):
    cat1 = 0
    cat2 = 0
    type_of_day = [[5,6], [1,2,3], [0], [4]]
    for i, tod in enumerate(type_of_day):
        if day1 in tod:
            cat1 = i
        if day2 in tod:
            cat2 = i
    if cat1 == cat2:
        return ((3.5 - min(abs(day1-day2), 7 - abs(day1 - day2)))/3.5)
    elif cat1 == 0 or cat2 == 0:
        return ((3.5 - min(abs(day1-day2), 7 - abs(day1 - day2)))/3.5) * 0.2
    else:
        return ((3.5 - min(abs(day1-day2), 7 - abs(day1 - day2)))/3.5) * 0.4
def month_similarity_score(month1, month2):
    return (6 - min(abs(month1-month2), 12 - abs(month1 - month2)))/6
def day_of_month_similarity_score(day1, day2):
    return (15 - min(abs(day1-day2), 30 - abs(day1 - day2)))/15
def year_similarity_score(year1, year2):
    return year1 == year2
def call_type_similarity_score(type1, type2):
    return type1 == type2

#Takes in 2-tuple: (datatime object, call_type)
def get_similarity_score(dt1, dt2):
    hour1 = dt1[0].hour
    hour2 = dt2[0].hour
    week_day1 = dt1[0].weekday()
    week_day2 = dt2[0].weekday()
    month1 = dt1[0].month
    month2 = dt2[0].month    
    
    score =  hour_similarity_score(hour1, hour2) * 0.35
    score += day_of_week_similarity_score(week_day1, week_day2) * 0.3
    score += month_similarity_score(month1, month2) * 0.05
    score += call_type_similarity_score(dt1[1], dt2[1]) * 0.3
    return score

train_preprocess_data = []
test_preprocess_data = []

#preprocess and create datetime objects for all datapoints
for i in range(len(TRAIN_DF["TIMESTAMP"])):
    val = datetime.fromtimestamp(TRAIN_DF.iloc[i]["TIMESTAMP"])
    val2 = TRAIN_DF.iloc[i]["CALL_TYPE"]
    train_preprocess_data.append([val, val2])
for i in range(len(TEST_PUBLIC_DF["TIMESTAMP"])):
    val = datetime.fromtimestamp(TEST_PUBLIC_DF.iloc[i]["TIMESTAMP"])
    val2 = TEST_PUBLIC_DF.iloc[i]["CALL_TYPE"]
    test_preprocess_data.append([val, val2])

#iterate through all test data and train data to find similar datapoints
similar_train_data_vals = set()
for i in range(len(test_preprocess_data)):
    print(i)
    similarity_scores = []
    for j in range(len(train_preprocess_data)):
        if j not in similar_train_data_vals:
            similarity_scores.append([get_similarity_score(test_preprocess_data[i], train_preprocess_data[j]), j])
    similarity_scores.sort(reverse = True)
    similar_train_data_vals.update(similarity_scores[k][1] for k in range(50))

similar_train_data_vals_list = list(similar_train_data_vals)
similar_train_data_vals_list.sort()

new_df = TRAIN_DF.iloc[0:0]
for val in similar_train_data_vals_list:
    new_df = new_df.append(TRAIN_DF.iloc[val])

#save dataset
# new_df.to_csv('representative_dataset.csv')