In [2]:
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict

zipped_data_path = "../data/raw_data/ucsd-cse-151b-class-competition.zip"

dataframes = defaultdict(pd.DataFrame)
with zipfile.ZipFile(zipped_data_path, "r") as zipped:
    for filename in zipped.namelist():
        if filename.endswith(".csv"):
            with zipped.open(filename) as f:
                dataframes.update({ filename : pd.read_csv(io.TextIOWrapper(f)) })

                # Lets take a look at the files
                print(f"FILE: {filename}")
                # If you want to see file info uncomment this:
                # display(dataframes[filename].info())
                # display(dataframes[filename].head())

train_data = dataframes["train.csv"]
test_data  = dataframes["test_public.csv"]

FILE: metaData_taxistandsID_name_GPSlocation.csv
FILE: sampleSubmission.csv
FILE: test_public.csv
FILE: train.csv


In [16]:
len(train_data)

1710670

In [2]:
import geohash

def extract_geohash(polyline):
    coordinates = eval(polyline)
    latitudes = [coord[1] for coord in coordinates]
    longitudes = [coord[0] for coord in coordinates]
    geohashes = [geohash.encode(lat, lon) for lat, lon in zip(latitudes, longitudes)]
    return geohashes

train_data['GEOHASHES'] = train_data['POLYLINE'].apply(extract_geohash)

KeyboardInterrupt: 

In [None]:
index_dict = defaultdict(list)
for i, row in train_data.iterrows():
    for geohash_val in row['GEOHASHES']:
        index_dict[geohash_val].append(i)

In [17]:
from geopy.distance import distance

speed_limit = 160  # Set the speed limit threshold in km/h

def calculate_speed(latitude_1, longitude_1, latitude_2, longitude_2):
    dist = distance((latitude_1, longitude_1), (latitude_2, longitude_2)).km
    time_diff = 15 / 3600  # Convert timestamp difference from seconds to hours
    speed = dist / time_diff
    return speed

def calculate_speeds_with_missing_data(row):
    coordinates = eval(row['POLYLINE'])
    missing_data = False
    for i in range(len(coordinates) - 1):
        lat_1, lon_1 = coordinates[i][1], coordinates[i][0]
        lat_2, lon_2 = coordinates[i + 1][1], coordinates[i + 1][0]
        speed = calculate_speed(lat_1, lon_1, lat_2, lon_2)
        if speed > speed_limit:
            missing_data = True
            break
    row['MISSING_DATA2'] = missing_data
    return row

TRAIN_DF = train_data[train_data["MISSING_DATA"] != True]

train_data = train_data.apply(calculate_speeds_with_missing_data, axis=1)

In [18]:
train_data["MISSING_DATA2"].value_counts()

MISSING_DATA2
False    1512671
True      197999
Name: count, dtype: int64

In [32]:
test_data["TAXI_ID"].value_counts()


TAXI_ID
20000434    4
20000436    3
20000081    3
20000612    3
20000503    3
           ..
20000597    1
20000230    1
20000126    1
20000296    1
20000667    1
Name: count, Length: 244, dtype: int64

In [40]:
test_data["ORIGIN_STAND"].value_counts().sum()

123

In [4]:
polylineA = train_data["POLYLINE"].sample(1)
polylineA.values

array(['[[-8.609661,41.151294],[-8.60967,41.151312],[-8.609643,41.151321],[-8.609652,41.151357],[-8.609616,41.151402],[-8.610012,41.152239],[-8.610102,41.153202],[-8.609562,41.15439],[-8.60931,41.155479],[-8.609265,41.156721],[-8.60931,41.157],[-8.60931,41.157027],[-8.609301,41.157099],[-8.608905,41.158035],[-8.60832,41.159286],[-8.607744,41.160447],[-8.606565,41.163471],[-8.606232,41.164425],[-8.606295,41.164884],[-8.606457,41.166396],[-8.606457,41.168196],[-8.605764,41.169879],[-8.605071,41.170734],[-8.605071,41.170734],[-8.604999,41.170806],[-8.604189,41.172066],[-8.606268,41.172867],[-8.60832,41.173677],[-8.611119,41.174604],[-8.613252,41.174856],[-8.613792,41.175324],[-8.614098,41.176395],[-8.614134,41.177412],[-8.614107,41.177439],[-8.614107,41.177466],[-8.614089,41.177592],[-8.614404,41.177745],[-8.616042,41.178123],[-8.617896,41.178474],[-8.618841,41.178627],[-8.619381,41.178726],[-8.619381,41.178744],[-8.619399,41.178735],[-8.61939,41.178744],[-8.619561,41.178807]]'],
      dt

In [5]:
polylineB = train_data["POLYLINE"].sample(1)
polylineB

1301294    [[-8.610129,41.160789],[-8.610012,41.16069],[-...
Name: POLYLINE, dtype: object

In [9]:
import folium

def prepare_polyline_coords(polyline):
	polyline_str = polyline

	# Convert the polyline string to a list of tuples
	polyline_list = eval(polyline_str)

	# Create a list of coordinates as tuples with corrected order
	coords = [(coord[1], coord[0]) for coord in polyline_list]
	return coords


coords1 = prepare_polyline_coords(polylineA.values[0])
coords2 = prepare_polyline_coords(polylineB.values[0])

# Create a Folium map centered around the first coordinate
map_route = folium.Map(location=coords1[0], zoom_start=13, width="70%", height="70%")

folium.TileLayer('http://{s}.tile.stamen.com/toner-lite/{z}/{x}/{y}.png', attr='Map data &copy; <a href="http://openstreetmap.org">OpenStreetMap</a> contributors, <a href="http://creativecommons.org/licenses/by-sa/2.0/">CC-BY-SA</a>, Imagery © <a href="http://stamen.com">Stamen</a>').add_to(map_route)


# Add a polyline layer to represent the route
folium.PolyLine(coords1, color='blue', weight=2.5, opacity=1).add_to(map_route)
folium.PolyLine(coords2, color='red', weight=2.5, opacity=1).add_to(map_route)

# Display the map
map_route


In [11]:
import json

def calculate_travel_time(polyline: str) -> int:
    """
    Calculates the travel time of a trip. Is defined as
    (number of points - 1) * 15 seconds.
    
    :param polyline: The polyline of the trip.
    :return: The travel time of the trip.
    """
    return (len(json.loads(polyline)) - 1) * 15

train_data["TRAVEL_TIME"] = train_data["POLYLINE"].apply(calculate_travel_time)
train_data.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,TRAVEL_TIME
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420


In [42]:
mean, std = train_data["TRAVEL_TIME"].mean(), train_data["TRAVEL_TIME"].std()
median = train_data["TRAVEL_TIME"].median()

# First n samples to analyze. Set to -1 to use all data
end = -1

outlier_threshold = 3

# "Choose all data, where the trip length is less than 3 standard deviations away from the mean"
# This is to remove outliers. Otherwise, our plots would look very squished (since there are some
# VERRRRRY long taxi trips in the dataset)
df_trimmed = train_data[train_data["TRAVEL_TIME"] < mean + outlier_threshold * std]
df_trimmed = df_trimmed[df_trimmed["TRAVEL_TIME"] > 30]
len(train_data), len(df_trimmed)

(1710670, 1644245)

In [43]:
df_trimmed["ORIGIN_CALL"].value_counts().iloc[:12170]

ORIGIN_CALL
2002.0     56521
63882.0     6297
2001.0      2375
13168.0     1288
6728.0      1088
           ...  
56315.0        4
53778.0        4
45759.0        4
11727.0        4
54733.0        4
Name: count, Length: 12170, dtype: int64

In [59]:
test_data["ORIGIN_CALL"].value_counts(), test_data["ORIGIN_CALL"].unique()[1:] # [1:] eliminates NaN as a unique value

(ORIGIN_CALL
 2002.0     12
 4785.0      2
 42612.0     1
 72911.0     1
 49729.0     1
 38347.0     1
 47514.0     1
 66996.0     1
 72185.0     1
 14123.0     1
 13706.0     1
 15093.0     1
 61692.0     1
 10640.0     1
 75460.0     1
 73870.0     1
 34651.0     1
 41052.0     1
 67507.0     1
 59708.0     1
 73071.0     1
 36542.0     1
 75037.0     1
 56743.0     1
 37332.0     1
 48578.0     1
 80148.0     1
 66812.0     1
 70885.0     1
 76232.0     1
 65127.0     1
 41519.0     1
 31780.0     1
 15427.0     1
 85698.0     1
 37007.0     1
 32563.0     1
 62371.0     1
 45085.0     1
 36752.0     1
 44696.0     1
 34007.0     1
 85224.0     1
 53493.0     1
 3048.0      1
 34944.0     1
 2024.0      1
 42046.0     1
 86098.0     1
 27031.0     1
 35304.0     1
 86436.0     1
 4391.0      1
 13297.0     1
 5549.0      1
 19845.0     1
 10363.0     1
 74478.0     1
 81753.0     1
 31208.0     1
 Name: count, dtype: int64,
 array([42612., 31780., 85698., 37007.,  2002., 32563., 623

In [63]:
df_trimmed[df_trimmed["ORIGIN_CALL"].isin(test_data["ORIGIN_CALL"].unique()[1:]) ]

analyze_orcall_2002 = df_trimmed[df_trimmed["ORIGIN_CALL"] == 2002 ]

In [67]:
from folium import plugins

# Convert the polyline strings to lists of coordinates
analyze_orcall_2002['COORDINATES'] = analyze_orcall_2002['POLYLINE'].apply(prepare_polyline_coords)

# Extract the start coordinates from the polyline lists
analyze_orcall_2002['START_COORDINATES'] = analyze_orcall_2002['COORDINATES'].apply(lambda x: x[0])

# Extract the start coordinates from the polyline lists
analyze_orcall_2002['END_COORDINATES'] = analyze_orcall_2002['COORDINATES'].apply(lambda x: x[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analyze_orcall_2002['COORDINATES'] = analyze_orcall_2002['POLYLINE'].apply(prepare_polyline_coords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analyze_orcall_2002['START_COORDINATES'] = analyze_orcall_2002['COORDINATES'].apply(lambda x: x[0])


In [68]:
# Create a Folium map centered around Porto, Portugal
map_heatmap = folium.Map(location=[41.14961, -8.61099], zoom_start=13)

# Create a list of start coordinates
start_coords = analyze_orcall_2002['START_COORDINATES'].tolist()

# Create a heatmap layer with the start coordinates
heatmap = plugins.HeatMap(start_coords, min_opacity=0.1)

# Add the heatmap layer to the map
map_heatmap.add_child(heatmap)

# Display the map
map_heatmap

In [None]:
# Create a Folium map centered around Porto, Portugal
map_heatmap = folium.Map(location=[41.14961, -8.61099], zoom_start=13)

# Create a list of start coordinates
start_coords = analyze_orcall_2002['END_COORDINATES'].tolist()

# Create a heatmap layer with the start coordinates
heatmap = plugins.HeatMap(start_coords, min_opacity=0.1)

# Add the heatmap layer to the map
map_heatmap.add_child(heatmap)

# Display the map
map_heatmap