## CS4830 Project

* Aniruddha (ME18B181)
* Vasudev Gupta (ME18B182)
* Shubham (ME18B183)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import (
    StringIndexer,
    IndexToString,
    VectorAssembler,
    OneHotEncoder,
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline, PipelineModel

In [2]:
# Create (or reuse) the Spark session used by the whole notebook. The session
# is asked to pull the Spark NLP package through `spark.jars.packages`.
# NOTE(review): no visible cell uses Spark NLP and the artifact name points at
# a Scala 2.11 build -- confirm this package is still required.
session_builder = SparkSession.builder.appName("CS4830_project")
session_builder = session_builder.config(
    "spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5"
)
spark = session_builder.getOrCreate()

:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-31cd26bd-1ffa-4876-93c3-2643a5fa1ab0;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.11;2.4.5 in central
	found com.typesafe#config;1.3.0 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-core;1.11.603 in central
	found commons-logging#commons-logging;1.1.3 in central
	found org.apache.httpcomponents#httpclient;4.5.9 in central
	found org.apache.httpcomponents#httpcore;4.4.11 in central
	found commons-codec#commons-codec;1.11 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found com.fasterxml.jackson.dataformat#jackson-dataformat-cbor;2.6.7 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#aws-java-sdk-s3;1.11.603 in c

In [3]:
# Glob matching every `*.csv` file under the `trainingdatanyc.csv` path in the
# project's GCS bucket; used as the training data source.
DATA_PATH = "gs://big-data-cs4830/project/trainingdatanyc.csv/*.csv"

## Data Exploration Steps

In [None]:
# Load the raw CSV files, treating the first line of each file as the header.
csv_reader = spark.read.option("header", "true")
df = csv_reader.csv(DATA_PATH)

In [None]:
# Drop rows without a target value, then count how many distinct precincts remain.
has_precinct = col("Violation Precinct").isNotNull()
df = df.where(has_precinct)
df.select("Violation Precinct").distinct().count()

In [None]:
# List the distinct precinct values in sorted order.
distinct_precincts = df.select("Violation Precinct").distinct()
distinct_precincts.orderBy("Violation Precinct").show()

In [None]:
# Inspect column names and types of the loaded data.
df.printSchema()

In [None]:
# Per-column count of missing entries (NaN or null), displayed as one row.
missing_counts = [
    count(when(isnan(name) | isnull(name), name)).alias(name)
    for name in df.columns
]
df.select(missing_counts).show()

In [None]:
# Columns excluded from further analysis because they contain many null values.
cols_to_drop = [
    "Time First Observed",
    "Intersecting Street",
    "Law Section",
    "Violation Legal Code",
    "To Hours In Effect",
    "Unregistered Vehicle?",
    "Meter Number",
    "Violation Description",
    "No Standing or Stopping Violation",
    "Hydrant Violation",
    "Double Parking Violation",
    "Latitude",
    "Longitude",
    "Community Board",
    "Community Council",
    "Census Tract",
    "BIN",
    "BBL",
    "NTA",
]

# Keep the remaining columns in their original order.
kept_columns = []
for name in df.columns:
    if name not in cols_to_drop:
        kept_columns.append(name)
df = df.select(kept_columns)

In [None]:
# Handle null values by replacing them with the literal placeholder "NULL".
df = df.fillna("NULL")

In [None]:
# Cardinality check: print the number of distinct values in every column.
#
# The loop variable is deliberately NOT called `col`. A top-level `for` rebinds
# its variable in the notebook namespace, which would replace the `col`
# function star-imported from pyspark.sql.functions with a plain string and
# make every later `col(...)` call fail with "'str' object is not callable".
for column_name in df.columns:
    print(column_name, df.select(column_name).distinct().count())

In [None]:
# Feature columns retained for modelling.
input_columns = [
    "Feet From Curb",
    "Violation In Front Of Or Opposite",
    "Issuing Agency",
    "Violation County",
    "Plate Type",
    "Violation Code",
    "Registration State",
    "Issuer Squad",
]

# Narrow the frame to the features followed by the target column.
selected_columns = list(input_columns)
selected_columns.append("Violation Precinct")
df = df.select(selected_columns)

In [None]:
# Confirm the schema after narrowing to the feature and target columns.
df.printSchema()

## Exporting code for production

In [4]:
# Feature columns fed to the model; each one is string-indexed and one-hot
# encoded by the pipeline stages defined later in the notebook.
INPUT_COLUMNS = [
    "Feet From Curb",
    "Violation In Front Of Or Opposite",
    "Issuing Agency",
    "Violation County",
    "Plate Type",
    "Violation Code",
    "Registration State",
    "Issuer Squad",
]
# Column the model is trained to predict.
TARGET_COLUMN = "Violation Precinct"

In [5]:
def read_and_prepare_data(path):
    """Load headered CSV data and reduce it to the modelling columns.

    Args:
        path: Location (path or glob) of the CSV files to read with Spark.

    Returns:
        A DataFrame containing ``INPUT_COLUMNS`` followed by ``TARGET_COLUMN``,
        restricted to rows whose target is not null, with the remaining nulls
        replaced by the string ``"NULL"``.
    """
    df = spark.read.option("header", "true").csv(path)

    # Address the column through the DataFrame instead of the star-imported
    # `col` helper, so the function keeps working even if a notebook cell has
    # rebound the name `col` (e.g. by using it as a loop variable).
    df = df.filter(df[TARGET_COLUMN].isNotNull())

    # Project first so the null fill only touches the columns that are kept.
    df = df.select(INPUT_COLUMNS + [TARGET_COLUMN])
    df = df.na.fill("NULL")

    return df

## Preparing data for training & inference

In [6]:
# Build the modelling DataFrame from the training data and check its schema.
data = read_and_prepare_data(DATA_PATH)
data.printSchema()

                                                                                

root
 |-- Feet From Curb: string (nullable = false)
 |-- Violation In Front Of Or Opposite: string (nullable = false)
 |-- Issuing Agency: string (nullable = false)
 |-- Violation County: string (nullable = false)
 |-- Plate Type: string (nullable = false)
 |-- Violation Code: string (nullable = false)
 |-- Registration State: string (nullable = false)
 |-- Issuer Squad: string (nullable = false)
 |-- Violation Precinct: string (nullable = false)



In [9]:
# Work on a small random sample: ~1% for training and ~0.1% for validation;
# the remaining ~98.9% is discarded. The fixed seed keeps the split repeatable.
tr_data, val_data, _ = data.randomSplit([0.01, 0.001, 0.989], seed=42)

# Persist both splits. They feed many separate actions (the counts below, each
# indexer fit, model training, evaluation) and would otherwise be re-read from
# the CSV source and re-sampled for every one of them.
tr_data = tr_data.cache()
val_data = val_data.cache()
print({"train_size": tr_data.count(), "val_size": val_data.count()})



{'train_size': 224367, 'val_size': 22468}


                                                                                

In [None]:
# # just for testing purposes
# _, tr_data = data.randomSplit([0.999, 0.001], seed=42)
# val_data = tr_data
# tr_data.count()

[Stage 2:>                                                          (0 + 1) / 1]

## Setting up model pipeline

In [8]:
# Fit the target -> numeric "label" mapping on the training split. Rows whose
# target value was not seen during fitting are skipped at transform time.
label_indexer_stage = StringIndexer(
    inputCol=TARGET_COLUMN, outputCol="label", handleInvalid="skip"
)
labelIndexer = label_indexer_stage.fit(tr_data)
print("labels:", labelIndexer.labels)

                                                                                

labels: ['0', '19', '18', '14', '1', '114', '13', '109', '17', '20', '84', '70', '115', '61', '112', '6', '66', '10', '52', '103', '108', '9', '5', '90', '24', '68', '110', '62', '104', '46', '78', '102', '43', '49', '107', '34', '94', '23', '7', '47', '77', '33', '44', '67', '72', '48', '40', '45', '88', '106', '105', '79', '28', '50', '83', '63', '75', '60', '71', '30', '25', '26', '32', '41', '111', '76', '73', '42', '69', '120', '113', '122', '81', '100', '101', '121', '123', '27', '22', '2', '11', '15', '12', '3', '65', '87', '127', '21', '4', '8', '119', '16', '29', '31', '39', '56', '74', '98', '116', '118', '133', '170', '276', '35', '36', '367', '37', '408', '428', '501', '54', '57', '58', '64', '668', '80', '806', '85', '89', '91', '923', '96', '99']


In [9]:
# One StringIndexer per feature column, writing to "<column>_index". Values not
# seen during fitting are kept (handleInvalid="keep") rather than rejected.
feature_indexers = []
for name in INPUT_COLUMNS:
    indexer = StringIndexer(
        inputCol=name, outputCol=name + "_index", handleInvalid="keep"
    )
    feature_indexers.append(indexer)

# Fit all feature indexers together on the training split.
feature_pipeline = Pipeline(stages=feature_indexers).fit(tr_data)

                                                                                

In [10]:
# One-hot encode every "<column>_index" column into "<column>_onehot".
indexed_columns = [name + "_index" for name in INPUT_COLUMNS]
onehot_columns = [name + "_onehot" for name in INPUT_COLUMNS]
OHE = OneHotEncoder(inputCols=indexed_columns, outputCols=onehot_columns)

In [11]:
# Concatenate all one-hot encoded columns into a single "features" vector.
assembler_inputs = [name + "_onehot" for name in INPUT_COLUMNS]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [12]:
# Classifier: reads "features"/"label" and writes the predicted label index to "class".
model = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="class",
)

# Map the predicted index in "class" back to the original precinct string,
# using the label ordering learned by `labelIndexer`.
precinct_labels = labelIndexer.labels
index_to_string = IndexToString(
    inputCol="class",
    outputCol="prediction",
    labels=precinct_labels,
)

In [13]:
# Stage order: label indexing -> feature indexing -> one-hot encoding ->
# feature-vector assembly -> classifier -> predicted index back to label string.
stages = [labelIndexer, feature_pipeline, OHE, assembler, model, index_to_string]
pipeline = Pipeline().setStages(stages)

In [14]:
# Fit all pipeline stages on the training split.
# NOTE: this rebinds `pipeline` from the unfitted Pipeline to the fitted model,
# so re-running this cell requires re-running the cell that builds the Pipeline first.
pipeline = pipeline.fit(tr_data)

22/05/15 11:59:36 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/05/15 11:59:36 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [15]:
# GCS location where the fitted pipeline is saved and later loaded from.
MODEL_PATH = "gs://big-data-cs4830/project/final_model"

In [17]:
# Persist the fitted pipeline. A plain `save` raises when the target path
# already exists, which makes the notebook fail on every re-run; overwrite the
# previous artifact explicitly instead.
pipeline.write().overwrite().save(MODEL_PATH)

                                                                                

## Using model for inference

In [18]:
# running the model directly for testing
# Reload the saved pipeline from MODEL_PATH so that the evaluation below
# exercises the persisted artifact rather than the in-memory model.
pipeline = PipelineModel.load(MODEL_PATH)

                                                                                

In [19]:
# Both evaluators compare the true index ("label") with the predicted index ("class").
evaluator_columns = {"labelCol": "label", "predictionCol": "class"}
accuracy_metric = MulticlassClassificationEvaluator(
    metricName="accuracy", **evaluator_columns
)
f1_metric = MulticlassClassificationEvaluator(metricName="f1", **evaluator_columns)

In [22]:
# Score at most 20000 training rows and keep only the predicted/true label indices.
train_subset = tr_data.limit(20000)
train_scored = pipeline.transform(train_subset)
tr_pred = train_scored.select("class", "label")
tr_pred.show()



+-----+-----+
|class|label|
+-----+-----+
| 39.0| 29.0|
| 39.0| 39.0|
| 29.0| 18.0|
| 32.0| 42.0|
| 29.0| 46.0|
| 29.0| 63.0|
| 29.0| 32.0|
| 39.0| 63.0|
| 39.0| 18.0|
| 46.0| 39.0|
| 63.0| 63.0|
| 63.0| 32.0|
| 29.0| 39.0|
| 29.0| 67.0|
| 29.0| 29.0|
| 29.0| 45.0|
| 29.0| 33.0|
| 29.0| 18.0|
| 29.0| 42.0|
| 29.0| 45.0|
+-----+-----+
only showing top 20 rows



                                                                                

In [24]:
# (accuracy, F1) on the scored training subset.
tuple(metric.evaluate(tr_pred) for metric in (accuracy_metric, f1_metric))

                                                                                

(0.3599, 0.3346105074712877)

In [20]:
# Score the validation split and keep only the predicted/true label indices.
val_scored = pipeline.transform(val_data)
val_pred = val_scored.select("class", "label")
val_pred.show()

[Stage 380:>                                                        (0 + 1) / 1]

+-----+-----+
|class|label|
+-----+-----+
| 29.0| 42.0|
| 32.0| 29.0|
| 39.0| 33.0|
| 39.0| 53.0|
| 10.0| 10.0|
| 58.0| 16.0|
| 58.0| 72.0|
| 43.0| 54.0|
| 43.0| 65.0|
|  4.0| 35.0|
|  1.0| 15.0|
| 24.0| 24.0|
|  1.0| 60.0|
|  3.0|  8.0|
| 35.0| 61.0|
| 70.0| 28.0|
| 12.0|  5.0|
| 12.0|  7.0|
| 44.0| 58.0|
| 27.0| 11.0|
+-----+-----+
only showing top 20 rows



                                                                                

In [21]:
# (accuracy, F1) on the validation split.
tuple(metric.evaluate(val_pred) for metric in (accuracy_metric, f1_metric))

                                                                                

(0.36024927665257067, 0.33193978290893916)

### Kafka Producer

In [25]:
# TODO: change following for demo
# Source of the rows replayed to Kafka (currently the training data itself).
REAL_TIME_DATA_PATH = DATA_PATH
# Kafka bootstrap server, as host:port.
BROKER = "10.128.0.40:9092"
# Kafka topic the rows are published to.
TOPIC = "CS4830-project"
# Maximum number of rows to publish.
LIMIT = 150

In [26]:
!pip3 install -U -q kafka-python

import os

os.environ[
    "PYSPARK_SUBMIT_ARGS"
] = "--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1"

[0m

In [29]:
import json
import time
from kafka import KafkaProducer
from tqdm.auto import tqdm

In [None]:
# Prepare the rows to stream with the same preprocessing used for training.
real_time_df = read_and_prepare_data(REAL_TIME_DATA_PATH)

                                                                                

In [30]:
# Keep the streamed data small (at most LIMIT rows); a larger data set would
# require allocating a bigger cluster.
real_time_df = real_time_df.limit(LIMIT)

In [31]:
def _to_json_bytes(message):
    """Serialize a message as UTF-8 encoded JSON."""
    return json.dumps(message).encode("utf-8")


# Kafka producer that JSON-encodes every value before sending it to BROKER.
producer = KafkaProducer(
    bootstrap_servers=[BROKER],
    value_serializer=_to_json_bytes,
)

In [32]:
# Collect the (already limited) rows to the driver as a pandas DataFrame so
# they can be iterated in plain Python.
pandas_df = real_time_df.toPandas()

In [34]:
# Publish the rows to Kafka one at a time, pausing 0.1 s between messages.
# Each message is the row's values joined by commas and wrapped in a leading
# and a trailing comma; the producer's serializer then JSON-encodes the string.
#
# `itertuples` yields the values directly in column order (no per-row Series or
# dict round trip), and `total` gives the progress bar a known length.
for row_values in tqdm(
    pandas_df.itertuples(index=False, name=None), total=len(pandas_df)
):
    payload = ",".join(str(value) for value in row_values)
    producer.send(TOPIC, value="," + payload + ",")
    # Flush after every message so it is actually delivered before the pause.
    producer.flush()
    time.sleep(0.1)

0it [00:00, ?it/s]