
# **Running Pyspark in Colab**

Now that you installed Spark and Java in Colab, it is time to set the environment path which enables you to run Pyspark in your Colab environment. Set the location of Java and Spark by running the following code:

Run a local spark session to test your installation:

In [68]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar -xvf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop3.2"
import findspark
findspark.init()



In [69]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [70]:
import urllib
import requests
urllib.request.urlretrieve("https://download.microsoft.com/download/F/4/8/F4894AA5-FDBC-481E-9285-D5F8C4C4F039/Geolife%20Trajectories%201.3.zip", "/tmp/geo_raw.zip")

('/tmp/geo_raw.zip', <http.client.HTTPMessage at 0x7f7b96e7f278>)

In [71]:
%%sh
unzip /tmp/geo_raw.zip

Archive:  /tmp/geo_raw.zip


replace Geolife Trajectories 1.3/Data/000/Trajectory/20081023025304.plt? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


# **Directory and file preprocessing**

In [72]:
os.listdir(path="./Geolife Trajectories 1.3/Data/010")

from pathlib import Path
import os

mypath = "./Geolife Trajectories 1.3/Data/"

dir_with_labels = []

for directory in os.listdir(mypath):
  label_file = Path("./Geolife Trajectories 1.3/Data/"+directory+"/labels.txt")
  if label_file.is_file():
    dir_with_labels.append(directory)

dir_with_labels.sort()
print (dir_with_labels)
len(dir_with_labels)

['010', '020', '021', '052', '053', '056', '058', '059', '060', '062', '064', '065', '067', '068', '069', '073', '075', '076', '078', '080', '081', '082', '084', '085', '086', '087', '088', '089', '091', '092', '096', '097', '098', '100', '101', '102', '104', '105', '106', '107', '108', '110', '111', '112', '114', '115', '116', '117', '118', '124', '125', '126', '128', '129', '136', '138', '139', '141', '144', '147', '153', '154', '161', '163', '167', '170', '174', '175', '179']


69

In [73]:
# Pick a folder with User labled data and raw GPS data
user_labled_num = 10

path = "./Geolife Trajectories 1.3/Data/"
first_dir = dir_with_labels[user_labled_num]

dir_path = path + first_dir
traject_path = dir_path+"/Trajectory/"
os.listdir(path = traject_path)[0]

#first_file = traject_path + os.listdir(path = traject_path)[0]
first_file = traject_path + '*.plt'
first_file

'./Geolife Trajectories 1.3/Data/064/Trajectory/*.plt'

In [74]:
file_location = first_file
file_type = "csv"

from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, DateType, DoubleType, TimestampType

customSchema = StructType([
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("zero", IntegerType(), True),
    StructField("altitude", DoubleType(), True),
    StructField("datetype", StringType(), True),
    StructField("date", StringType(), True),
    StructField("time", StringType(), True),
])


infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .schema(customSchema) \
  .load(file_location)


df.printSchema
df.show()
df.count()

+---------+----------+----+--------+----------------+----------+--------+
| latitude| longitude|zero|altitude|        datetype|      date|    time|
+---------+----------+----+--------+----------------+----------+--------+
|     null|      null|null|    null|            null|      null|    null|
|     null|      null|null|    null|            null|      null|    null|
|     null|      null|null|    null|            null|      null|    null|
|     null|      null|null|    null|            null|      null|    null|
|      0.0|       2.0| 255|    null|               0|         0|       2|
|      0.0|      null|null|    null|            null|      null|    null|
| 39.97572|116.331006|   0|   492.0|39689.3979861111|2008-08-29|09:33:06|
|39.975729| 116.33103|   0|   492.0|39689.3980092593|2008-08-29|09:33:08|
|39.975677|116.331086|   0|   492.0|39689.3980324074|2008-08-29|09:33:10|
|39.975655|116.331083|   0|   492.0|39689.3980555556|2008-08-29|09:33:12|
|39.975651|116.331097|   0|   491.0|39

58892

In [75]:
dff = df.na.drop(how='any')
dff.show()
dff.count()

+---------+----------+----+--------+----------------+----------+--------+
| latitude| longitude|zero|altitude|        datetype|      date|    time|
+---------+----------+----+--------+----------------+----------+--------+
| 39.97572|116.331006|   0|   492.0|39689.3979861111|2008-08-29|09:33:06|
|39.975729| 116.33103|   0|   492.0|39689.3980092593|2008-08-29|09:33:08|
|39.975677|116.331086|   0|   492.0|39689.3980324074|2008-08-29|09:33:10|
|39.975655|116.331083|   0|   492.0|39689.3980555556|2008-08-29|09:33:12|
|39.975651|116.331097|   0|   491.0|39689.3980787037|2008-08-29|09:33:14|
|39.975646|116.331101|   0|   492.0|39689.3981018519|2008-08-29|09:33:16|
|39.975646|116.331093|   0|   492.0|    39689.398125|2008-08-29|09:33:18|
|39.975647|116.331083|   0|   491.0|39689.3981481481|2008-08-29|09:33:20|
|39.975662|116.331091|   0|   491.0|39689.3981712963|2008-08-29|09:33:22|
|39.975347|116.330993|   0|    87.0|39689.3981944444|2008-08-29|09:33:24|
|39.975346|116.331017|   0|   103.0|39

58754

# **Data cleaning and data managment**

In [76]:
dff=dff.drop('zero')
dff.show()

+---------+----------+--------+----------------+----------+--------+
| latitude| longitude|altitude|        datetype|      date|    time|
+---------+----------+--------+----------------+----------+--------+
| 39.97572|116.331006|   492.0|39689.3979861111|2008-08-29|09:33:06|
|39.975729| 116.33103|   492.0|39689.3980092593|2008-08-29|09:33:08|
|39.975677|116.331086|   492.0|39689.3980324074|2008-08-29|09:33:10|
|39.975655|116.331083|   492.0|39689.3980555556|2008-08-29|09:33:12|
|39.975651|116.331097|   491.0|39689.3980787037|2008-08-29|09:33:14|
|39.975646|116.331101|   492.0|39689.3981018519|2008-08-29|09:33:16|
|39.975646|116.331093|   492.0|    39689.398125|2008-08-29|09:33:18|
|39.975647|116.331083|   491.0|39689.3981481481|2008-08-29|09:33:20|
|39.975662|116.331091|   491.0|39689.3981712963|2008-08-29|09:33:22|
|39.975347|116.330993|    87.0|39689.3981944444|2008-08-29|09:33:24|
|39.975346|116.331017|   103.0|39689.3982175926|2008-08-29|09:33:26|
|39.975352|116.330991|   105.0|396

In [77]:
from pyspark.sql.functions import concat, col, lit

dfft = dff.withColumn('timedate', concat(dff.date.cast("string"), lit(" "), dff.time.cast("string")))
dfft.show()

+---------+----------+--------+----------------+----------+--------+-------------------+
| latitude| longitude|altitude|        datetype|      date|    time|           timedate|
+---------+----------+--------+----------------+----------+--------+-------------------+
| 39.97572|116.331006|   492.0|39689.3979861111|2008-08-29|09:33:06|2008-08-29 09:33:06|
|39.975729| 116.33103|   492.0|39689.3980092593|2008-08-29|09:33:08|2008-08-29 09:33:08|
|39.975677|116.331086|   492.0|39689.3980324074|2008-08-29|09:33:10|2008-08-29 09:33:10|
|39.975655|116.331083|   492.0|39689.3980555556|2008-08-29|09:33:12|2008-08-29 09:33:12|
|39.975651|116.331097|   491.0|39689.3980787037|2008-08-29|09:33:14|2008-08-29 09:33:14|
|39.975646|116.331101|   492.0|39689.3981018519|2008-08-29|09:33:16|2008-08-29 09:33:16|
|39.975646|116.331093|   492.0|    39689.398125|2008-08-29|09:33:18|2008-08-29 09:33:18|
|39.975647|116.331083|   491.0|39689.3981481481|2008-08-29|09:33:20|2008-08-29 09:33:20|
|39.975662|116.331091

In [78]:
dfft = dfft.drop('datetype', 'date', 'time')
dfft.show()

+---------+----------+--------+-------------------+
| latitude| longitude|altitude|           timedate|
+---------+----------+--------+-------------------+
| 39.97572|116.331006|   492.0|2008-08-29 09:33:06|
|39.975729| 116.33103|   492.0|2008-08-29 09:33:08|
|39.975677|116.331086|   492.0|2008-08-29 09:33:10|
|39.975655|116.331083|   492.0|2008-08-29 09:33:12|
|39.975651|116.331097|   491.0|2008-08-29 09:33:14|
|39.975646|116.331101|   492.0|2008-08-29 09:33:16|
|39.975646|116.331093|   492.0|2008-08-29 09:33:18|
|39.975647|116.331083|   491.0|2008-08-29 09:33:20|
|39.975662|116.331091|   491.0|2008-08-29 09:33:22|
|39.975347|116.330993|    87.0|2008-08-29 09:33:24|
|39.975346|116.331017|   103.0|2008-08-29 09:33:26|
|39.975352|116.330991|   105.0|2008-08-29 09:33:28|
|39.975367|116.330962|   108.0|2008-08-29 09:33:30|
|39.975379|116.330934|   109.0|2008-08-29 09:33:32|
|39.975375|  116.3309|   111.0|2008-08-29 09:33:34|
|39.975373|116.330857|   112.0|2008-08-29 09:33:36|
|39.975374|1

In [79]:
dfft = dfft.withColumn("user", lit(first_dir))
dfft.show()

+---------+----------+--------+-------------------+----+
| latitude| longitude|altitude|           timedate|user|
+---------+----------+--------+-------------------+----+
| 39.97572|116.331006|   492.0|2008-08-29 09:33:06| 064|
|39.975729| 116.33103|   492.0|2008-08-29 09:33:08| 064|
|39.975677|116.331086|   492.0|2008-08-29 09:33:10| 064|
|39.975655|116.331083|   492.0|2008-08-29 09:33:12| 064|
|39.975651|116.331097|   491.0|2008-08-29 09:33:14| 064|
|39.975646|116.331101|   492.0|2008-08-29 09:33:16| 064|
|39.975646|116.331093|   492.0|2008-08-29 09:33:18| 064|
|39.975647|116.331083|   491.0|2008-08-29 09:33:20| 064|
|39.975662|116.331091|   491.0|2008-08-29 09:33:22| 064|
|39.975347|116.330993|    87.0|2008-08-29 09:33:24| 064|
|39.975346|116.331017|   103.0|2008-08-29 09:33:26| 064|
|39.975352|116.330991|   105.0|2008-08-29 09:33:28| 064|
|39.975367|116.330962|   108.0|2008-08-29 09:33:30| 064|
|39.975379|116.330934|   109.0|2008-08-29 09:33:32| 064|
|39.975375|  116.3309|   111.0|

In [80]:
first_labels = dir_path +'/labels.txt'
file_type = "csv"

customSchema = StructType([
    StructField("start_time", StringType(), True),
    StructField("end_time", StringType(), True),
    StructField("mode", StringType(), True),
])

infer_schema = "false"
first_row_is_header = "true"
delimiter = "	"

dfl = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .schema(customSchema) \
  .load(first_labels)

dfl.show()
dfl.count()

+-------------------+-------------------+----+
|         start_time|           end_time|mode|
+-------------------+-------------------+----+
|2008/08/15 07:47:00|2008/08/15 08:02:22|walk|
|2008/08/15 08:02:23|2008/08/15 08:18:23| bus|
|2008/08/15 08:18:24|2008/08/15 08:24:01|walk|
|2008/08/16 00:17:00|2008/08/16 00:27:20| bus|
|2008/08/16 00:27:21|2008/08/16 00:59:10|walk|
|2008/08/16 06:18:59|2008/08/16 06:20:57| bus|
|2008/08/16 06:20:58|2008/08/16 06:54:18|walk|
|2008/08/16 06:54:19|2008/08/16 07:09:44| bus|
|2008/08/16 07:09:45|2008/08/16 09:33:04|walk|
|2008/08/16 09:33:05|2008/08/16 09:56:54| bus|
|2008/08/16 11:57:36|2008/08/16 12:19:42| bus|
|2008/08/16 12:19:43|2008/08/16 12:31:18|bike|
|2008/08/16 12:31:19|2008/08/16 12:52:08|walk|
|2008/08/16 23:53:35|2008/08/17 00:00:33|walk|
|2008/08/17 00:00:34|2008/08/17 00:21:26| bus|
|2008/08/17 00:21:27|2008/08/17 00:58:27|walk|
|2008/08/17 06:20:06|2008/08/17 06:30:16| bus|
|2008/08/17 06:30:17|2008/08/17 07:57:00|walk|
|2008/08/17 0

68

In [81]:
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window

w = Window().orderBy(lit('A'))

dfl = dfl.withColumn("traject_id", row_number().over(w))

In [82]:
dfl.show()

+-------------------+-------------------+----+----------+
|         start_time|           end_time|mode|traject_id|
+-------------------+-------------------+----+----------+
|2008/08/15 07:47:00|2008/08/15 08:02:22|walk|         1|
|2008/08/15 08:02:23|2008/08/15 08:18:23| bus|         2|
|2008/08/15 08:18:24|2008/08/15 08:24:01|walk|         3|
|2008/08/16 00:17:00|2008/08/16 00:27:20| bus|         4|
|2008/08/16 00:27:21|2008/08/16 00:59:10|walk|         5|
|2008/08/16 06:18:59|2008/08/16 06:20:57| bus|         6|
|2008/08/16 06:20:58|2008/08/16 06:54:18|walk|         7|
|2008/08/16 06:54:19|2008/08/16 07:09:44| bus|         8|
|2008/08/16 07:09:45|2008/08/16 09:33:04|walk|         9|
|2008/08/16 09:33:05|2008/08/16 09:56:54| bus|        10|
|2008/08/16 11:57:36|2008/08/16 12:19:42| bus|        11|
|2008/08/16 12:19:43|2008/08/16 12:31:18|bike|        12|
|2008/08/16 12:31:19|2008/08/16 12:52:08|walk|        13|
|2008/08/16 23:53:35|2008/08/17 00:00:33|walk|        14|
|2008/08/17 00

In [83]:
from datetime import datetime
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

def timetotimestamp (string1) : 
  s = datetime.strptime(string1, "%Y-%m-%d %H:%M:%S")
  return datetime.timestamp(s)

udftimetotimestamp = udf(lambda x:timetotimestamp(x), DoubleType())

In [84]:
dfft = dfft.withColumn("date_timestamp", udftimetotimestamp('timedate'))
dfft.show()

+---------+----------+--------+-------------------+----+--------------+
| latitude| longitude|altitude|           timedate|user|date_timestamp|
+---------+----------+--------+-------------------+----+--------------+
| 39.97572|116.331006|   492.0|2008-08-29 09:33:06| 064| 1.220002386E9|
|39.975729| 116.33103|   492.0|2008-08-29 09:33:08| 064| 1.220002388E9|
|39.975677|116.331086|   492.0|2008-08-29 09:33:10| 064|  1.22000239E9|
|39.975655|116.331083|   492.0|2008-08-29 09:33:12| 064| 1.220002392E9|
|39.975651|116.331097|   491.0|2008-08-29 09:33:14| 064| 1.220002394E9|
|39.975646|116.331101|   492.0|2008-08-29 09:33:16| 064| 1.220002396E9|
|39.975646|116.331093|   492.0|2008-08-29 09:33:18| 064| 1.220002398E9|
|39.975647|116.331083|   491.0|2008-08-29 09:33:20| 064|   1.2200024E9|
|39.975662|116.331091|   491.0|2008-08-29 09:33:22| 064| 1.220002402E9|
|39.975347|116.330993|    87.0|2008-08-29 09:33:24| 064| 1.220002404E9|
|39.975346|116.331017|   103.0|2008-08-29 09:33:26| 064| 1.22000

In [85]:
def timetotimestampl (string1) : 
  s = datetime.strptime(string1, "%Y/%m/%d %H:%M:%S")
  return datetime.timestamp(s)

udftimetotimestampl = udf(lambda x:timetotimestampl(x), DoubleType())


In [86]:
dfl = dfl.withColumn("start_time_conv", udftimetotimestampl('start_time'))
dfl = dfl.withColumn("end_time_conv", udftimetotimestampl('end_time'))
dfl.show()

+-------------------+-------------------+----+----------+---------------+-------------+
|         start_time|           end_time|mode|traject_id|start_time_conv|end_time_conv|
+-------------------+-------------------+----+----------+---------------+-------------+
|2008/08/15 07:47:00|2008/08/15 08:02:22|walk|         1|   1.21878642E9|1.218787342E9|
|2008/08/15 08:02:23|2008/08/15 08:18:23| bus|         2|  1.218787343E9|1.218788303E9|
|2008/08/15 08:18:24|2008/08/15 08:24:01|walk|         3|  1.218788304E9|1.218788641E9|
|2008/08/16 00:17:00|2008/08/16 00:27:20| bus|         4|   1.21884582E9| 1.21884644E9|
|2008/08/16 00:27:21|2008/08/16 00:59:10|walk|         5|  1.218846441E9| 1.21884835E9|
|2008/08/16 06:18:59|2008/08/16 06:20:57| bus|         6|  1.218867539E9|1.218867657E9|
|2008/08/16 06:20:58|2008/08/16 06:54:18|walk|         7|  1.218867658E9|1.218869658E9|
|2008/08/16 06:54:19|2008/08/16 07:09:44| bus|         8|  1.218869659E9|1.218870584E9|
|2008/08/16 07:09:45|2008/08/16 

files=glob.glob(path +'*.csv')

for idx,f in enumerate(files):
    if idx == 0:
        df = spark.read.csv(f,header=True,inferSchema=True)
        dff = df
    else:
        df = spark.read.csv(f,header=True,inferSchema=True)
        dff=dff.unionAll(df)

In [87]:
cond = [(dfft.date_timestamp >= dfl.start_time_conv) & (dfft.date_timestamp <= dfl.end_time_conv)]
joi = dfft.join(dfl, cond, 'inner').select(dfft.latitude, dfft.longitude,
                                           dfft.altitude,
                                           dfft.timedate,
                                           dfft.date_timestamp,  
                                           #dfl.start_time, 
                                           #dfl.end_time, 
                                           dfl.mode,
                                           dfft.user,
                                           dfl.traject_id, 
                                           )
joi.show()
joi.count()

+---------+----------+--------+-------------------+--------------+----+----+----------+
| latitude| longitude|altitude|           timedate|date_timestamp|mode|user|traject_id|
+---------+----------+--------+-------------------+--------------+----+----+----------+
| 39.97572|116.331006|   492.0|2008-08-29 09:33:06| 1.220002386E9|bike| 064|        63|
|39.975729| 116.33103|   492.0|2008-08-29 09:33:08| 1.220002388E9|bike| 064|        63|
|39.975677|116.331086|   492.0|2008-08-29 09:33:10|  1.22000239E9|bike| 064|        63|
|39.975655|116.331083|   492.0|2008-08-29 09:33:12| 1.220002392E9|bike| 064|        63|
|39.975651|116.331097|   491.0|2008-08-29 09:33:14| 1.220002394E9|bike| 064|        63|
|39.975646|116.331101|   492.0|2008-08-29 09:33:16| 1.220002396E9|bike| 064|        63|
|39.975646|116.331093|   492.0|2008-08-29 09:33:18| 1.220002398E9|bike| 064|        63|
|39.975647|116.331083|   491.0|2008-08-29 09:33:20|   1.2200024E9|bike| 064|        63|
|39.975662|116.331091|   491.0|2

37682

In [88]:
#joi.coalesce(1).write.csv("joi_new.csv", header = True)

In [89]:
#data = joi.take(10)
#columns = joi.columns
#test10 = spark.createDataFrame(data = data, schema = columns)
#test10.show(truncate=False)


In [90]:
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql.functions import lit
from pyspark.sql.functions import lag   

#test10 = test10.withColumn('prev_period_lat', lag('latitude', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))) \
#                .withColumn('prev_period_long', lag('longitude', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))) \
#                .withColumn('prev_period_alt', lag('altitude', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))) \
#                .withColumn('prev_period_td', lag('date_timestamp', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))).na.drop() 


#test10.show()

# **Feature ingeneering**

In [91]:
joi = joi.withColumn('prev_period_lat', lag('latitude', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))) \
                .withColumn('prev_period_long', lag('longitude', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))) \
                .withColumn('prev_period_alt', lag('altitude', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))) \
                .withColumn('prev_period_td', lag('date_timestamp', 1).over(Window.orderBy("traject_id").partitionBy("traject_id"))).na.drop() 

joi.show()

+---------+----------+--------+-------------------+--------------+----+----+----------+---------------+----------------+---------------+--------------+
| latitude| longitude|altitude|           timedate|date_timestamp|mode|user|traject_id|prev_period_lat|prev_period_long|prev_period_alt|prev_period_td|
+---------+----------+--------+-------------------+--------------+----+----+----------+---------------+----------------+---------------+--------------+
| 40.04533|116.290971|   182.0|2008-08-20 07:51:12| 1.219218672E9| bus| 064|        31|      40.045226|      116.291025|          183.0|  1.21921867E9|
|40.045447|116.290935|   182.0|2008-08-20 07:51:14| 1.219218674E9| bus| 064|        31|       40.04533|      116.290971|          182.0| 1.219218672E9|
|40.045603|116.290924|   182.0|2008-08-20 07:51:16| 1.219218676E9| bus| 064|        31|      40.045447|      116.290935|          182.0| 1.219218674E9|
|40.045742|116.290905|   181.0|2008-08-20 07:51:18| 1.219218678E9| bus| 064|        31| 

In [92]:
! pip install haversine
from haversine import haversine, Unit

def distance (la1, lo1, la2, lo2) : 
  point1 = (la1, lo1)
  point2 = (la2, lo2)
  return haversine(point1, point2)

udfdistance = udf(lambda x1, y1, x2, y2:distance(x1, y1, x2, y2), DoubleType())



In [93]:
joi = joi.withColumn('dist', udfdistance('prev_period_lat', 'prev_period_long', 'latitude', 'longitude')) \
      .withColumn('delta_t', col('date_timestamp') - col('prev_period_td')) \
      .withColumn('velocity', col('dist') / col('delta_t')) \
      .drop('latitude', 'longitude', 'timedate', 'date_timestamp', 'prev_period_lat', 'prev_period_long', 'prev_period_alt', 'prev_period_td', 'delta_t')



In [94]:
joi.show()

+--------+----+----+----------+--------------------+--------------------+
|altitude|mode|user|traject_id|                dist|            velocity|
+--------+----+----+----------+--------------------+--------------------+
|   182.0| bus| 064|        31|0.012444368662029982|0.006222184331014991|
|   182.0| bus| 064|        31| 0.01336586733725195|0.006682933668625975|
|   182.0| bus| 064|        31|0.017371686430085225|0.008685843215042613|
|   181.0| bus| 064|        31|0.015540506049884157|0.007770253024942078|
|   181.0| bus| 064|        31|  0.0148316738747519| 0.00741583693737595|
|   181.0| bus| 064|        31| 0.01271963051462853|0.006359815257314265|
|   181.0| bus| 064|        31|0.011212797709876782|0.005606398854938391|
|   182.0| bus| 064|        31|0.012092830008644582|0.006046415004322291|
|   181.0| bus| 064|        31|0.013930279979849516|0.006965139989924758|
|   182.0| bus| 064|        31| 0.01414332840102037|0.007071664200510185|
|   184.0| bus| 064|        31|0.01429

# **Data split and normalization**

In [95]:
trainDF, testDF = joi.randomSplit([0.8, 0.2], seed=42)
print(trainDF.cache().count()) # Cache because accessing training data multiple times
print(testDF.count())

30072
7542


In [96]:
from pyspark.ml.feature import StringIndexer

labelToIndex = StringIndexer(inputCol="mode", outputCol="label")


In [97]:
from pyspark.ml.feature import VectorAssembler

# This includes both the numeric columns and the one-hot encoded binary vector columns in our dataset.
numericCols = ["dist", "velocity", "altitude"]
assemblerInputs =  numericCols
vecAssembler = VectorAssembler(outputCol="features")
vecAssembler.setInputCols(numericCols)

VectorAssembler_bb320c320191

In [98]:
vecAssembler.transform(trainDF).head().features

DenseVector([0.0358, 0.0179, 87.0])

#**Running Logistic Model as Multiclass Classification Tool**


In [99]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label", regParam=1.0)

In [100]:
from pyspark.ml import Pipeline

# Define the pipeline based on the stages created in previous steps.
pipeline = Pipeline(stages=[labelToIndex, vecAssembler.setParams(handleInvalid="skip"), lr])

# Define the pipeline model.
pipelineModel = pipeline.fit(trainDF)

# Apply the pipeline model to the test dataset.
predDF = pipelineModel.transform(testDF)

In [101]:
predDF.select("features", "label", "prediction", "probability").show()

+--------------------+-----+----------+--------------------+
|            features|label|prediction|         probability|
+--------------------+-----+----------+--------------------+
|[0.03528404321834...|  0.0|       0.0|[0.57935227321187...|
|[0.03748638555547...|  0.0|       0.0|[0.59485349109513...|
|[0.03769135536379...|  0.0|       0.0|[0.59628946901424...|
|[0.03304485955598...|  0.0|       0.0|[0.56351007867173...|
|[0.03448348778162...|  0.0|       0.0|[0.57374096459849...|
|[0.01755555834966...|  0.0|       0.0|[0.45140029767151...|
|[0.02233139132480...|  0.0|       0.0|[0.48604858303953...|
|[0.03891646186345...|  0.0|       0.0|[0.60482882783627...|
|[0.01606323814569...|  0.0|       0.0|[0.44064678919543...|
|[0.01644624621099...|  0.0|       0.0|[0.44340927933177...|
|[0.00685321432775...|  0.0|       0.0|[0.37571566438903...|
|     [0.0,0.0,104.0]|  0.0|       1.0|[0.32988313633215...|
|[4.20114484625496...|  0.0|       1.0|[0.33261527944487...|
|[0.01941322017564...|  

In [102]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(predDF)}")

mcEvaluator = MulticlassClassificationEvaluator(metricName="f1")
print(f"F1-score: {mcEvaluator.evaluate(predDF)}")


Accuracy: 0.6100503845133917
F1-score: 0.5227435722133764


#**Running Random Forest as Multiclass Classification Tool**

In [103]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

In [104]:
# Define the pipeline based on the stages created in previous steps.
pipeline_rf = Pipeline(stages=[labelToIndex, vecAssembler.setParams(handleInvalid="skip"), rf])

# Define the pipeline model.
pipelineModel_rf = pipeline_rf.fit(trainDF)

# Apply the pipeline model to the test dataset.
predDF_rf = pipelineModel_rf.transform(testDF)

predDF_rf.select("features", "label", "prediction", "probability").show()

+--------------------+-----+----------+--------------------+
|            features|label|prediction|         probability|
+--------------------+-----+----------+--------------------+
|[0.03528404321834...|  0.0|       0.0|[0.91765971693057...|
|[0.03748638555547...|  0.0|       0.0|[0.91765971693057...|
|[0.03769135536379...|  0.0|       0.0|[0.91765971693057...|
|[0.03304485955598...|  0.0|       0.0|[0.91765971693057...|
|[0.03448348778162...|  0.0|       0.0|[0.91765971693057...|
|[0.01755555834966...|  0.0|       0.0|[0.91488234106959...|
|[0.02233139132480...|  0.0|       0.0|[0.91695716788965...|
|[0.03891646186345...|  0.0|       0.0|[0.91977356917974...|
|[0.01606323814569...|  0.0|       0.0|[0.91488234106959...|
|[0.01644624621099...|  0.0|       0.0|[0.91488234106959...|
|[0.00685321432775...|  0.0|       2.0|[0.14550197961853...|
|     [0.0,0.0,104.0]|  0.0|       1.0|[0.19784992487864...|
|[4.20114484625496...|  0.0|       1.0|[0.19784992487864...|
|[0.01941322017564...|  

In [105]:
mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(predDF_rf)}")

mcEvaluator = MulticlassClassificationEvaluator(metricName="f1")
print(f"F1-score: {mcEvaluator.evaluate(predDF_rf)}")


Accuracy: 0.7291169451073986
F1-score: 0.7303179792697163


# **Model hyperparameters tuning**

In [106]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())

In [None]:
# Create a 3-fold CrossValidator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=bcEvaluator, numFolds=3, parallelism = 4)

# Run cross validations. This step takes a few minutes and returns the best model found from the cross validation.
cvModel = cv.fit(trainDF)

In [None]:
# Use the model identified by the cross-validation to make predictions on the test dataset
cvPredDF = cvModel.transform(testDF)

mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(cvPredDF)}")

mcEvaluator = MulticlassClassificationEvaluator(metricName="f1")
print(f"F1-score: {mcEvaluator.evaluate(cvPredDF)}")
