*Problem Statement*: Predict the number of faults and the revenue from data that arrives as a stream from the Producer, and store the predictions in a Hive table. The predictions are made with Spark ML linear regression.
1. Launch the API Producer -> 2. Read data from the API as a stream -> 3. Build the model -> 4. Apply the linear regression algorithm -> 5. Save the results in a Hive table.

In [0]:
import datetime
import json
import pprint
import re
import time
from functools import reduce  # For Python 3.x

import numpy as np
import requests
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.stat import Statistics
from pyspark.sql import DataFrame
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext, Row
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col
from pyspark.sql.functions import udf
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
from pyspark.sql.window import Window as W
from pyspark.streaming import StreamingContext
  


In [0]:
def convertCharges(charges):
  """Return the amount part of a charge string such as "£120.50".

  Parameters:
    charges: charge text, optionally containing a "£" sign; may be None
      or empty.

  Returns:
    The text after the first "£" when one is present, the input unchanged
    when it contains no "£", or "0" for None/empty input.
  """
  # A null column value reaches a UDF as None, which has no len().
  if not charges:
    return "0"
  _, pound, amount = charges.partition("£")
  # Without a "£" partition() leaves the tail empty; fall back to the raw
  # text so an unprefixed amount is not silently turned into "".
  return amount if pound else charges
  
# Wrap convertCharges as a Spark UDF whose declared return type is a string.
udfConvertCharges = udf(convertCharges, StringType())

In [0]:
# Build datasets 
def create_category_vars( dataset, field_name ):
  """Index a categorical column and one-hot encode the resulting index.

  Appends two columns to `dataset`: `<field_name>Index`, produced by a
  fitted StringIndexer, and `<field_name>Vec`, produced by OneHotEncoder
  with dropLast=True.

  Parameters:
    dataset: DataFrame that contains the column `field_name`.
    field_name: name of the categorical column to encode.

  Returns:
    The DataFrame with the index and vector columns appended.
  """
  index_col = field_name + "Index"
  vector_col = field_name + "Vec"

  indexer = StringIndexer(inputCol=field_name, outputCol=index_col)
  indexed = indexer.fit(dataset).transform(dataset)

  encoder = OneHotEncoder(dropLast=True,
                          inputCol=index_col,
                          outputCol=vector_col)
  # Spark 3.x turned OneHotEncoder into an Estimator that must be fitted
  # before it can transform; the Spark 2.x class is a plain Transformer
  # without fit(). Fitting only when fit() exists supports both.
  if hasattr(encoder, "fit"):
    encoder = encoder.fit(indexed)
  return encoder.transform(indexed)

In [0]:
def build_model(tableName):
  """Train a linear regression on maintenance charges and store predictions.

  Reads `tableName`, strips the currency symbol from `Charges_incurred`,
  fits a LinearRegression on a 70/30 split, prints the training summary
  and the predicted charges summed per month, appends the test-set
  predictions to the `apt_maintenace_prediction` table and evaluates the
  model on the test set.

  Parameters:
    tableName: name of the table or temporary view that holds the
      maintenance records.

  Returns:
    The root-mean-squared error of the predictions on the test split.
  """
  # Charges_incurred is the regression target, so it must not also be a
  # feature: a model that sees its own label learns nothing useful.
  continuous_features = ['Apartment_number', 'Contractor_id']
  categorical_features = ['Status']
  label_col = 'Charges_incurred'

  df_maintenance = spark.table(tableName).withColumn(
      label_col, udfConvertCharges(label_col))
  # The numeric columns are cast explicitly because rows loaded through
  # processD are built from split text and arrive as strings.
  df = df_maintenance.select(
      col('Maintenance_id'),
      col('Apartment_number').cast('int').alias('Apartment_number'),
      col('Mdate'),
      col('Issue_reported'),
      col('Contractor_id').cast('int').alias('Contractor_id'),
      col('Resolution'),
      col('Status'),
      col(label_col).cast('float').alias(label_col))
  # Rows whose model inputs or label failed to parse cannot be assembled
  # into feature vectors or used as training targets.
  df = df.na.drop(subset=continuous_features + categorical_features + [label_col])

  df.show()
  print("corr(Apartment_number, Charges_incurred): %f"
        % df.stat.corr('Apartment_number', label_col))

  # One-hot encode every categorical column.
  for columnName in categorical_features:
    df = create_category_vars( df, columnName )

  df.cache()
  featureCols = continuous_features + [name + 'Vec' for name in categorical_features]
  assembler = VectorAssembler( inputCols = featureCols, outputCol = "features")
  maintenance_train_df = assembler.transform( df ).withColumn(
      "label", F.round(label_col, 4))

  train_df, test_df = maintenance_train_df.randomSplit( [0.7, 0.3], seed = 30 )

  linreg = LinearRegression(maxIter=500, regParam=0.0)
  lm = linreg.fit( train_df )
  trainingSummary = lm.summary
  print("numIterations: %d" % trainingSummary.totalIterations)
  print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
  trainingSummary.residuals.show()
  print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
  print("r2: %f" % trainingSummary.r2)
  print("coefficients: %s" % str(lm.coefficients))

  # Predict the held-out rows and total the predicted charges per month.
  y_pred = lm.transform( test_df )
  (y_pred
    .groupBy(F.month("Mdate").alias("Month"))
    .agg(F.sum("prediction").alias("Expected Revenue"))
    .show())

  # Store the predictions, creating the table on first use.
  if (len(spark.sql("SHOW TABLES LIKE '" + "apt_maintenace_prediction"+ "'").collect()) == 1):
    y_pred.write.insertInto("apt_maintenace_prediction")
  else:
    y_pred.write.saveAsTable("apt_maintenace_prediction")

  # The label is the raw charge (no log transform), so the prediction is
  # compared with it directly rather than through exp().
  rmse_evaluator = RegressionEvaluator(labelCol=label_col,
                                       predictionCol="prediction",
                                       metricName="rmse")
  lm_rmse = rmse_evaluator.evaluate( y_pred )

  # This function runs once per micro-batch; release the cached frame so
  # cached copies do not pile up.
  df.unpersist()
  return lm_rmse


In [0]:
def processD(apt_maintenance):
  """Store one batch of parsed maintenance records and rebuild the model.

  Parameters:
    apt_maintenance: RDD of field sequences; only elements with exactly
      8 fields are used.

  The 8-field records are turned into Rows and counted. When there is at
  least one, they are written to the `apt_maintenace_data` table (inserted
  when the table exists, otherwise saved as a new table) and build_model
  is run on that table. Otherwise "RDD is empty" is printed.
  """
  def to_row(fields):
    return Row(Maintenance_id=fields[0], Apartment_number=fields[1],
               Mdate=fields[2], Issue_reported=fields[3],
               Contractor_id=fields[4], Resolution=fields[5],
               Status=fields[6], Charges_incurred=fields[7])

  records = apt_maintenance.filter(lambda fields: len(fields) == 8).map(to_row)

  total = records.count()
  if not (total > 0):
    print("RDD is empty")
    return

  print("count of records:" + str(total))
  table = "apt_maintenace_data"
  frame = sqlContext.createDataFrame(records)

  matches = spark.sql("SHOW TABLES LIKE '" + table + "'").collect()
  if len(matches) == 1:
    frame.write.insertInto(table)
  else:
    frame.write.saveAsTable(table)
  build_model(table)

In [0]:
# Schema of an apartment-maintenance record: (column name, Spark type)
# pairs, every column nullable.
apt_maintenance_schema = StructType([
  StructField(column_name, column_type, True)
  for column_name, column_type in (
    ("Maintenance_id", IntegerType()),
    ("Apartment_number", IntegerType()),
    ("Mdate", StringType()),
    ("Issue_reported", StringType()),
    ("Contractor_id", IntegerType()),
    ("Resolution", StringType()),
    ("Status", StringType()),
    ("Charges_incurred", StringType()),
    ("event_time", StringType()),
  )
])

# Directory the streaming readers watch for incoming files.
apt_maintenance_path = "/FileStore/users/apt_maintenance/inprogress"


In [0]:
# Read the maintenance CSV files as a stream, one file per micro-batch.
apt_maint_df = (
  spark
    .readStream
    .schema(apt_maintenance_schema)
    .option("maxFilesPerTrigger", "1")
    .option("header", "true")
    .csv(apt_maintenance_path)
)
# registerTempTable is deprecated; createOrReplaceTempView is its replacement.
apt_maint_df.createOrReplaceTempView("apt_maintenance")


def _store_batch_and_build_model(batch_df, batch_id):
  """Append one micro-batch to apt_maintenace_data and rebuild the model.

  Parameters:
    batch_df: the rows of the current micro-batch as a static DataFrame.
    batch_id: micro-batch id supplied by Spark; not used.
  """
  batch_df.show()
  if not batch_df.head(1):
    return
  # NOTE(review): processD writes 8-field rows to this same table while the
  # stream schema has 9 columns; insertInto matches by position - confirm
  # that both writers agree on the table layout.
  if (len(spark.sql("SHOW TABLES LIKE '" + "apt_maintenace_data"+ "'").collect()) == 1):
    batch_df.write.insertInto("apt_maintenace_data")
  else:
    batch_df.write.saveAsTable("apt_maintenace_data")
  build_model("apt_maintenace_data")


# show(), .write and batch queries are rejected on a streaming DataFrame,
# so the per-batch work runs inside foreachBatch, where every micro-batch
# is an ordinary DataFrame.
query = (
  apt_maint_df.writeStream
    .outputMode("append")
    .foreachBatch(_store_batch_and_build_model)
    .start()
)
query.awaitTermination()

In [0]:
# Streaming reader over the maintenance directory: one file per trigger,
# header row expected, multiLine CSV parsing enabled.
apt_stream_df = (
  spark.readStream
    .schema(apt_maintenance_schema)
    .options(maxFilesPerTrigger="1", header="true", multiLine="true")
    .csv(apt_maintenance_path)
)
  


In [0]:
import schedule
import time


def _show_batch(batch_df, batch_id):
  """Display one micro-batch and print its row count.

  Parameters:
    batch_df: the rows of the micro-batch as a static DataFrame.
    batch_id: micro-batch id supplied by Spark; not used.
  """
  batch_df.show()
  print(batch_df.count())


def job():
  """Process the input that is currently available, then stop.

  Starts a run-once streaming query over apt_stream_df that shows and
  counts the rows it reads, and waits for that query to finish so that
  repeated scheduled runs do not leave queries running in the background.
  """
  # NOTE(review): no checkpointLocation is set, so each run presumably
  # starts from scratch and re-reads the files already seen - confirm
  # that this is the intended behaviour.
  query = (
    apt_stream_df.writeStream
      # "complete" output mode is only accepted for aggregating queries;
      # this query has no aggregation, so it must use "append".
      .outputMode("append")
      .trigger(once=True)
      # show()/count() are not allowed on the streaming DataFrame itself;
      # inside foreachBatch each batch is an ordinary DataFrame.
      .foreachBatch(_show_batch)
      .start()
  )
  query.awaitTermination()


schedule.every(10).seconds.do(job)

# Poll the scheduler once per second; this loop runs until interrupted.
while True:
  schedule.run_pending()
  time.sleep(1)

In [0]:
from io import StringIO
import csv

batchIntervalSeconds = 10
fileprefix = apt_maintenance_path

ssc = StreamingContext(sc, batchIntervalSeconds)

# Stream of text lines read from the maintenance directory.
userDStream = ssc.textFileStream(fileprefix)

# Parse each line with the csv module instead of str.split(","), so that a
# quoted field containing a comma stays one field and the quotes are
# removed. An empty line parses to an empty list.
split_users = userDStream.map(lambda l: next(csv.reader([l]), []))

# processD stores every batch and rebuilds the model.
split_users.foreachRDD(processD)

ssc.start()
# Return control after at most 200 seconds.
ssc.awaitTerminationOrTimeout(200)



In [0]:
# Calculate R-squared between the actual charges (`Charges_incurred`) and
# the values in the `y_pred` column.
# NOTE(review): no module-level `y_pred` DataFrame is defined in this file
# (it is only a local variable inside functions); this cell presumably
# relies on state left over from an interactive session - confirm.
r2_evaluator = RegressionEvaluator(labelCol="Charges_incurred",
                              predictionCol="y_pred",
                              metricName="r2" )
lm_r2 = r2_evaluator.evaluate( y_pred )
lm_r2

In [0]:
def get_r2_rmse( model, test_df ):
  """Score a fitted model on a test set.

  Parameters:
    model: fitted model whose transform() adds a `prediction` column.
    test_df: DataFrame holding the model inputs and the actual values in
      `Charges_incurred`.

  Returns:
    A two-element list `[r2, rmse]`, so that callers can concatenate it
    to a model name to form a `name, rsquared, rmse` row.
  """
  y_pred = model.transform( test_df )
  # The only label built in this file is the raw charge (no log
  # transform), so predictions are scored as they are, without exp().
  return [
    RegressionEvaluator(labelCol="Charges_incurred",
                        predictionCol="prediction",
                        metricName=metric).evaluate(y_pred)
    for metric in ("r2", "rmse")
  ]


In [0]:
# Score the linear model on the test split.
# NOTE(review): `lm` and `test_df` are only assigned inside build_model in
# this file; this cell presumably relies on state left over from an
# interactive session - confirm.
perf_params = get_r2_rmse( lm, test_df )

# create dataframe to store all the model performances

import pandas as pd

# Empty table with one row per model to be added: name, R-squared, RMSE.
model_perf = pd.DataFrame( columns = ['name', 'rsquared', 'rmse'] )

In [0]:
# R-squared evaluator that reads the actual values from the `label` column;
# no predictionCol is given, so the evaluator's default is used.
evaluator = RegressionEvaluator(
  metricName="r2",
  labelCol="label",
)

In [0]:
# 2-fold cross-validation over `paramGrid`, scored with `evaluator`.
# NOTE(review): neither `lrModel` nor `paramGrid` is defined before this
# cell in this file (`paramGrid` is first assigned in the later lasso
# cell) - confirm where the ridge estimator and grid come from.
crossval = CrossValidator(estimator=lrModel,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=2)  # use 3+ folds in practice

In [0]:
# Run the cross-validation on the training split.
# NOTE(review): `train_df` is only assigned inside build_model in this
# file - confirm where the module-level value comes from.
cvModel = crossval.fit( train_df )


In [0]:
# Score the best cross-validated model on the test split.
# NOTE(review): `test_df` is only assigned inside build_model in this
# file - confirm where the module-level value comes from.
ridge_perf = get_r2_rmse( cvModel.bestModel, test_df )


In [0]:
# Add the ridge scores as the next row. DataFrame.append was removed in
# pandas 2.0; assigning to the next integer label works on old and new
# pandas alike and numbers the rows exactly as ignore_index=True did.
model_perf.loc[len(model_perf)] = ["Ridge Regression"] + ridge_perf

model_perf

In [0]:
# Using Lasso regression - another method to avoid overfitting.
# With elasticNetParam = 1.0 the regParam value weights an L1 (lasso)
# penalty; an L2 (ridge) penalty corresponds to elasticNetParam = 0.0.
# NOTE(review): `lrModel` is not defined anywhere in this file - confirm
# which estimator the grid and the cross-validator are meant to use.
paramGrid = ParamGridBuilder()                          \
  .addGrid(lrModel.regParam, [0.1, 0.01, 0.001])      \
  .addGrid(lrModel.elasticNetParam, [1.0])            \
  .build()

# R-squared evaluator that reads the actual values from the `label` column.
evaluator = RegressionEvaluator(
  metricName="r2",
  labelCol="label",
)

# 2-fold cross-validation over the lasso grid.
crossval = CrossValidator(estimator=lrModel,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=2)  # use 3+ folds in practice

In [0]:
# Cross-validate on the training split, then score the best model on the
# test split.
# NOTE(review): `train_df` and `test_df` are only assigned inside
# build_model in this file - confirm where the module-level values come from.
cvModel = crossval.fit( train_df )
lasso_perf = get_r2_rmse( cvModel.bestModel, test_df )


In [0]:
# Add the lasso scores as the next row. DataFrame.append was removed in
# pandas 2.0; assigning to the next integer label works on old and new
# pandas alike and numbers the rows exactly as ignore_index=True did.
model_perf.loc[len(model_perf)] = ["Lasso Regression"] + lasso_perf

model_perf