In [None]:
import seaborn as sns
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.ml.feature import PCA, RFormula
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

### Connect to Spark

In [None]:
# Create (or reuse) the SparkSession used by every cell below, connecting to
# the Spark master URL given here. `SparkSession.builder` is the documented
# entry point for obtaining a Builder, so use it instead of instantiating
# the Builder class by hand.
ss = (
    SparkSession.builder
    .appName("DashBoard")
    .master("spark://post-batch-processing-spark-master:7077")
    .getOrCreate()
)

### Read data from the Parquet file "trips.parquet" in HDFS

In [None]:
# Load the trips dataset from HDFS into a Spark DataFrame.
trips_path = "hdfs://namenode:9000/trips/trips.parquet"
df = ss.read.parquet(trips_path)

In [None]:
# Report how many trips were loaded, then order the frame by arrival time
# for the cells that follow.
trip_count = df.count()
print(f"Number of records: {trip_count}")
df = df.sort('ArrivalTime')

# Data Mining

### Distribution of trips over time

In [None]:
# Bring only the ArrivalTime column to the driver as pandas, cast it to
# int64, and plot its distribution.
arrival_pdf = df.select('ArrivalTime').toPandas()
arrivalTime = arrival_pdf['ArrivalTime'].astype('int64')
sns.distplot(arrivalTime)

In [None]:
# Bring only the DepartureTime column to the driver as pandas, cast it to
# int64, and plot its distribution. The series is named after the column it
# actually holds (it was previously stored under `arrivalTime`).
departureTime = df.select('DepartureTime').toPandas()['DepartureTime'].astype('int64')
sns.distplot(departureTime)

### Top 10 most visited destinations in the last year

In [None]:
# Rank destinations by the number of trips that departed within the last
# "year" of data, so that `s.head(10)` yields the ten most visited ones.
# NOTE(review): assumes DepartureTime is a numeric epoch in seconds, as the
# integer comparison below implies -- confirm against the parquet schema.
SECONDS_PER_DAY = 86400
year_window = SECONDS_PER_DAY * 30 * 12  # 360 days: twelve 30-day months

# Ask Spark for the latest departure instead of collecting the whole column
# to the driver. This also avoids narrowing the values to int32, and does
# not depend on how `df` happens to be sorted.
lastRecordTime = df.agg({'DepartureTime': 'max'}).first()[0]
pivot = int(lastRecordTime - year_window)

s = (
    df.filter(df.DepartureTime > pivot)
    .groupBy('Destination')
    .count()
    .orderBy('count', ascending=False)
)

In [None]:
# Show the first 10 rows of `s`, the DataFrame built in the previous cell.
s.head(10)

# Anomaly detection

### PCA

In [None]:
# Replace `df` with the pre-processed trips dataset read from HDFS; every
# cell from here on works on this frame rather than the raw trips.
processed_trips_path = "hdfs://namenode:9000/trips/processed_trips.parquet"
df = ss.read.parquet(processed_trips_path)

In [None]:
# Report the size of the processed dataset, then order it by arrival time.
processed_count = df.count()
print(f"Number of records: {processed_count}")
df = df.sort('ArrivalTime')

In [None]:
# Keep only the columns that are fed into the RFormula / PCA step below.
pca_input_columns = [
    'ArrivalTime',
    'BusinessLeisure',
    'CabinCategory',
    'CreationDate',
    'CurrencyCode',
    'DepartureTime',
    'Destination',
    'OfficeIdCountry',
    'Origin',
    'TotalAmount',
    'nPAX',
]
df = df.select(*pca_input_columns)

In [None]:
# Assemble every selected column into a single "features" vector using a
# label-free R formula (" ~ col1 + col2 + ...").
formula = " ~ " + " + ".join(df.columns)
data = RFormula(formula=formula).fit(df).transform(df)

# Project the feature vectors onto two principal components.
pca = PCA(k=2, inputCol="features")
s = pca.fit(data).transform(data)

# The PCA output is the last column of the transformed frame; pull just
# that column to the driver as a pandas Series of vectors.
projected_col = s.columns[-1]
r = s.select(projected_col).toPandas()[projected_col]

# Split each 2-component vector into separate x / y coordinate lists.
X = [point[0] for point in r]
Y = [point[1] for point in r]

In [None]:
# Plot the two PCA components against each other. X and Y hold the first and
# second component of every record; points far from the bulk are presumably
# the anomaly candidates this section is looking for.
fig, ax = plt.subplots()
ax.scatter(X, Y)
ax.set(
    xlabel="Principal component 1",
    ylabel="Principal component 2",
    title="Trips projected onto the first two principal components",
)
plt.show()