In [1]:
import os
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

from pyspark.sql.functions import explode
from pyspark.sql.functions import countDistinct, avg
from pyspark.sql.functions import dayofmonth,dayofyear,year,month,hour,weekofyear,date_format
from pyspark.sql.functions import col as func_col
from pyspark.sql.functions import lit
from pyspark.sql.functions import *
from pyspark.ml import Pipeline


###Description of the Data

##datetime - hourly date + timestamp

##season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 

##holiday - whether the day is considered a holiday

##workingday - whether the day is neither a weekend nor holiday

##weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy

##2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist

##3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds

##4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 

#temp - temperature in Celsius

#atemp - "feels like" temperature in Celsius

#humidity - relative humidity

#windspeed - wind speed

#casual - number of non-registered user rentals initiated
#registered - number of registered user rentals initiated
#count - number of total rentals



####-----------Created a ML Model using the Season Categorical Variable and other Continuous Variables of temp , humidity , windspeed -----------------
####-----------Reason in the problem statement exploding of season column was asked , but in same manner one hot encoding can be performed of weather categorical variable also 

In [2]:
cwd = os.getcwd()
for part in cwd.split('/'):
    if part.lower().startswith('edureka'):
        user_id = part.title()

In [3]:
app_name = '{0} : Spark SQL'.format(user_id)
app_name

'Edureka_960126 : Spark SQL'

In [4]:
#Configuration of the Spark Session
conf = SparkConf()  # create the configuration
conf.set('spark.driver.extraClassPath', "/usr/share/cmf/common_jars/mysql-connector-java-5.1.15.jar")  # set the spark.jars
conf.set('spark.executor.extraClassPath', "/usr/share/cmf/common_jars/mysql-connector-java-5.1.15.jar")

#Spark Session object
spark = SparkSession.builder.config(conf=conf).appName(app_name).getOrCreate()



In [5]:
#Function to get the HDFS file path 
def get_hdfs_filepath(file_name):
    my_hdfs = '/user/{0}'.format(user_id.lower())
    return os.path.join(my_hdfs, file_name)

In [6]:
##loading of the training Data and then analysing the structure of Data using various commands 

#Training csv from the hdfs path
TRAIN_CSV = get_hdfs_filepath('train.csv')

# Reading of the Data Set
train_df=spark.read.csv(TRAIN_CSV,inferSchema=True,header=True)

#Sample showing of the Data
train_df.show()

#Get summary of data and variable types and other summaries
train_df.printSchema()

train_df.describe()

#summaries of Data
train_df.describe().show()

+--------------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
|            datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|
+--------------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
|2011-01-01 00:00:...|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|        13|   16|
|2011-01-01 01:00:...|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     8|        32|   40|
|2011-01-01 02:00:...|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     5|        27|   32|
|2011-01-01 03:00:...|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     3|        10|   13|
|2011-01-01 04:00:...|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     0|         1|    1|
|2011-01-01 05:00:...|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     0| 

In [None]:
#Decide which columns should be categorical and then convert them accordingly 
#Check for any missing value in dataset and treat it

In [7]:
# convert the timestamp into various attributes - as per the problem statement

train_df=train_df.withColumn('day',dayofmonth(train_df["datetime"]))
train_df=train_df.withColumn('month',month(train_df["datetime"]))
train_df=train_df.withColumn('year',year(train_df["datetime"]))
train_df=train_df.withColumn('hour',hour(train_df["datetime"]))
train_df=train_df.drop("datetime")

#showing the data after changing the datetime column
train_df.show()


+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|        13|   16|  1|    1|2011|   0|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     8|        32|   40|  1|    1|2011|   1|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     5|        27|   32|  1|    1|2011|   2|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     3|        10|   13|  1|    1|2011|   3|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     0|         1|    1|  1|    1|2011|   4|
|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     0|         1|    1|  1|    1|201

In [8]:
#Explore how count varies with different features such as hour, month, etc
train_df.groupby("hour").count().orderBy("hour").show()
train_df.groupby("month").count().orderBy("month").show()
train_df.groupby("year").count().orderBy("year").show()
train_df.groupby("day").count().orderBy("day").show()
train_df.count()

+----+-----+
|hour|count|
+----+-----+
|   0|  455|
|   1|  454|
|   2|  448|
|   3|  433|
|   4|  442|
|   5|  452|
|   6|  455|
|   7|  455|
|   8|  455|
|   9|  455|
|  10|  455|
|  11|  455|
|  12|  456|
|  13|  456|
|  14|  456|
|  15|  456|
|  16|  456|
|  17|  456|
|  18|  456|
|  19|  456|
+----+-----+
only showing top 20 rows

+-----+-----+
|month|count|
+-----+-----+
|    1|  884|
|    2|  901|
|    3|  901|
|    4|  909|
|    5|  912|
|    6|  912|
|    7|  912|
|    8|  912|
|    9|  909|
|   10|  911|
|   11|  911|
|   12|  912|
+-----+-----+

+----+-----+
|year|count|
+----+-----+
|2011| 5422|
|2012| 5464|
+----+-----+

+---+-----+
|day|count|
+---+-----+
|  1|  575|
|  2|  573|
|  3|  573|
|  4|  574|
|  5|  575|
|  6|  572|
|  7|  574|
|  8|  574|
|  9|  575|
| 10|  572|
| 11|  568|
| 12|  573|
| 13|  574|
| 14|  574|
| 15|  574|
| 16|  574|
| 17|  575|
| 18|  563|
| 19|  574|
+---+-----+



10886

In [9]:
#Explode season column into separate columns such as season_<val> and drop season , the reason it is being done is
#Called a technique " Dummy encoding " to be used in the Linear regression model
import pyspark.sql.functions as F

######Note that the below is working code to convert the season into season_val but I have not used it because
####### I have used one hot encoding for getting the codes out of the categorical variables 
####### Dummy encoding and dropping one column so season with 1 = 0 0 0 and with 2 = 0 0 1 and so on

#So here we selected the season column and then collect the distinct values out of the season column 

#season_categ = train_df.select('season').distinct().rdd.flatMap(lambda x:x).collect()

#Now we are using the SQL functions 

#exprs = [F.when(F.col('season') == cat,1).otherwise(0).alias('season'+str(cat)) for cat in season_categ]

#train_df = train_df.select(exprs+train_df.columns)

#train_df =train_df.drop("seasoncat")

In [10]:
###Converted the season column into the string Data type for using the string indexer
####String indexer is very useful for encoding the categorical columns with string values as various category labels

train_df = train_df.withColumn("season", train_df["season"].cast(StringType()))
train_df.printSchema()
train_df.show()

###Decription of Data after converting to pandas and transposing the description 
train_df.toPandas().transpose().describe()

root
 |-- season: string (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- hour: integer (nullable = true)

+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10876,10877,10878,10879,10880,10881,10882,10883,10884,10885
count,15,15,15,15,15,15,15,15,15,15,...,15,15,15,15,15,15,15,15,15,15
unique,10,10,11,10,8,10,9,10,9,11,...,14,14,14,14,14,13,14,14,13,14
top,0,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,19,1,1,12,1
freq,4,4,3,3,5,4,4,4,4,3,...,2,2,2,2,2,2,2,2,2,2


In [13]:
#### Training of the various ML Models with Data 

import pandas as pd
from pylab import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder , VectorAssembler

#String Indexer to create the integer 0,1,2,.... like indexes of the categorical variable 
stringIndexer = StringIndexer(inputCol = "season", outputCol = 'season' + 'Index')

#Then One hot encoding --- is called one hot means that if 4 labels then our code contain three feature columns  
OHencoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol="season_cat")

#Used the pipelines to stremline the whole process
stages = []
stages += [stringIndexer, OHencoder]

## This assembler inputs are the columns which will be converted into the feature vector
assemblerInputs = ["season_cat"] + ['temp','atemp','humidity','windspeed']

### Vector Creator with column features which will be used in the training of the model
Vectassembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')

### Vector assembler is added to the pipeline
stages += [Vectassembler]

from pyspark.ml import Pipeline

cols = train_df.columns

##Pipleline is created 
pipeline = Pipeline(stages = stages)

### Pipeline fitting with train_df data 
pipelineModel = pipeline.fit(train_df)

## then the whole pipeline model is used to transform the data into the desired way what we want
train_df = pipelineModel.transform(train_df)

selectedCols = ['features']+cols

train_df = train_df.select(selectedCols)


### way to look into the Data in the pandas way
pd.DataFrame(train_df.take(100), columns=train_df.columns)

#train_df.printSchema()
#train_df.filter(train_df["season"]== '4').show(1000)

Unnamed: 0,features,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,day,month,year,hour
0,"(0.0, 0.0, 0.0, 9.84, 14.395, 81.0, 0.0)",1,0,0,1,9.84,14.395,81,0.0000,3,13,16,1,1,2011,0
1,"(0.0, 0.0, 0.0, 9.02, 13.635, 80.0, 0.0)",1,0,0,1,9.02,13.635,80,0.0000,8,32,40,1,1,2011,1
2,"(0.0, 0.0, 0.0, 9.02, 13.635, 80.0, 0.0)",1,0,0,1,9.02,13.635,80,0.0000,5,27,32,1,1,2011,2
3,"(0.0, 0.0, 0.0, 9.84, 14.395, 75.0, 0.0)",1,0,0,1,9.84,14.395,75,0.0000,3,10,13,1,1,2011,3
4,"(0.0, 0.0, 0.0, 9.84, 14.395, 75.0, 0.0)",1,0,0,1,9.84,14.395,75,0.0000,0,1,1,1,1,2011,4
5,"[0.0, 0.0, 0.0, 9.84, 12.88, 75.0, 6.0032]",1,0,0,2,9.84,12.880,75,6.0032,0,1,1,1,1,2011,5
6,"(0.0, 0.0, 0.0, 9.02, 13.635, 80.0, 0.0)",1,0,0,1,9.02,13.635,80,0.0000,2,0,2,1,1,2011,6
7,"(0.0, 0.0, 0.0, 8.2, 12.88, 86.0, 0.0)",1,0,0,1,8.20,12.880,86,0.0000,1,2,3,1,1,2011,7
8,"(0.0, 0.0, 0.0, 9.84, 14.395, 75.0, 0.0)",1,0,0,1,9.84,14.395,75,0.0000,1,7,8,1,1,2011,8
9,"(0.0, 0.0, 0.0, 13.12, 17.425, 76.0, 0.0)",1,0,0,1,13.12,17.425,76,0.0000,8,6,14,1,1,2011,9


In [14]:
# Validation Set - set aside training data for validation
vtrain_df = train_df.select('features', 'count')
splits = vtrain_df.randomSplit([0.7, 0.3])
f_train_df = splits[0]
f_test_df = splits[1]


In [15]:
#Here we have imported the linear regression Libraries
from pyspark.ml.regression import LinearRegression

####### Working on the ML model using Linear regression 

#Generating the Linear regression model based on the features and other parameters passed
lr = LinearRegression(featuresCol = 'features', labelCol='count')

#Then we fit the training data into the model generated
lr_model = lr.fit(f_train_df)

pred = lr_model.evaluate(f_test_df)

#Show the predicted Grade values along side actual Grade values
pred.predictions.show(1000)

#Training summary
trainingSummary = lr_model.summary

print("RMSE: %f" % trainingSummary.rootMeanSquaredError)

print("r2: %f" % trainingSummary.r2)

#Then we print the coefficients and the intercept of the linear model we have got after fitting our training data

print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))



+--------------------+-----+--------------------+
|            features|count|          prediction|
+--------------------+-----+--------------------+
|(7,[3,4,5],[3.28,...|   74|   -33.1913230352572|
|(7,[3,4,5],[4.1,9...|   17|  28.578987643727828|
|(7,[3,4,5],[5.74,...|    5|   47.43183868893995|
|(7,[3,4,5],[5.74,...|   72|   88.05476726835361|
|(7,[3,4,5],[5.74,...|  108|   32.33189535067342|
|(7,[3,4,5],[5.74,...|   15| -26.177120162890787|
|(7,[3,4,5],[6.56,...|    5|   41.18193867985022|
|(7,[3,4,5],[6.56,...|    7|   41.18193867985022|
|(7,[3,4,5],[6.56,...|    2|  13.320502721010115|
|(7,[3,4,5],[6.56,...|    2|  13.320502721010115|
|(7,[3,4,5],[6.56,...|   47|  -53.54694358020612|
|(7,[3,4,5],[7.38,...|   68|  50.015521037039065|
|(7,[3,4,5],[8.2,1...|  187|  120.16072347566407|
|(7,[3,4,5],[8.2,1...|   13|    100.657718304476|
|(7,[3,4,5],[8.2,1...|   90|    100.657718304476|
|(7,[3,4,5],[8.2,1...|  110|   92.29928751682397|
|(7,[3,4,5],[8.2,1...|   45|  58.865564366215864|


In [16]:
##### Predictions out of the model over the test data

lr_predictions = lr_model.transform(f_test_df)
lr_predictions.select("prediction","count","features").show(5)

#so here we are doing the evaluations over the predictions made on the test data and the corresponding to the actual output
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="count",metricName="r2")

print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
| -33.1913230352572|   74|(7,[3,4,5],[3.28,...|
|28.578987643727828|   17|(7,[3,4,5],[4.1,9...|
| 47.43183868893995|    5|(7,[3,4,5],[5.74,...|
| 88.05476726835361|   72|(7,[3,4,5],[5.74,...|
| 32.33189535067342|  108|(7,[3,4,5],[5.74,...|
+------------------+-----+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.278803


In [17]:
# Let us work through the Random Forest Regression

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.

featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(train_df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestRegressor(labelCol="count",featuresCol="indexedFeatures")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer,rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "count","indexedFeatures").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[0]
print(rfModel)  # summary only



+-----------------+-----+--------------------+
|       prediction|count|     indexedFeatures|
+-----------------+-----+--------------------+
| 78.6344429173995|   43|(7,[3,4,5],[2.46,...|
|73.33274214892094|   17|(7,[3,4,5],[4.1,9...|
|74.31252885595272|    3|(7,[3,4,5],[4.92,...|
|73.33274214892094|  210|(7,[3,4,5],[4.92,...|
|65.83382249905421|    5|(7,[3,4,5],[5.74,...|
+-----------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 154.533
VectorIndexer_4ea19019c16f47619134


In [18]:
# Let us work through the Decision Tree Regression

from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(train_df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3])


# Train a DecisionTree model.
dt = DecisionTreeRegressor(labelCol="count",featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "count" , "features").show(100)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = model.stages[1]
# summary only
print(treeModel)






+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
| 63.89525368248773|   64|(7,[3,4,5],[2.46,...|
| 63.89525368248773|   17|(7,[3,4,5],[4.1,9...|
|103.32673267326733|   51|(7,[3,4,5],[4.92,...|
| 63.89525368248773|  210|(7,[3,4,5],[4.92,...|
| 63.89525368248773|  108|(7,[3,4,5],[5.74,...|
| 63.89525368248773|    5|(7,[3,4,5],[6.56,...|
| 63.89525368248773|  219|(7,[3,4,5],[6.56,...|
| 63.89525368248773|   95|(7,[3,4,5],[6.56,...|
| 63.89525368248773|    2|(7,[3,4,5],[6.56,...|
| 63.89525368248773|    2|(7,[3,4,5],[6.56,...|
| 63.89525368248773|   51|(7,[3,4,5],[6.56,...|
| 63.89525368248773|   16|(7,[3,4,5],[7.38,...|
| 63.89525368248773|   53|(7,[3,4,5],[7.38,...|
| 63.89525368248773|   51|(7,[3,4,5],[7.38,...|
| 63.89525368248773|   30|(7,[3,4,5],[7.38,...|
| 63.89525368248773|   19|(7,[3,4,5],[8.2,1...|
| 63.89525368248773|    1|(7,[3,4,5],[8.2,1...|
| 63.89525368248773|    3|(7,[3,4,5],[9.

In [20]:
# Let us work through the Gradient Boosted Regression

from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import PipelineModel


# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(train_df)


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTRegressor(labelCol="count",featuresCol="indexedFeatures", maxIter=10)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

######Saving the model to HDFS for persistence and then using it in our prediction application

model.save("/user/edureka_960126/models_gbr")



# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "count" , "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="count" , predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only

+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
|47.610280181106006|   64|(7,[3,4,5],[2.46,...|
| 42.42890753194144|    5|(7,[3,4,5],[5.74,...|
| 65.20404048822024|   59|(7,[3,4,5],[5.74,...|
| 42.42890753194144|   87|(7,[3,4,5],[5.74,...|
| 68.32981043282204|    3|(7,[3,4,5],[5.74,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 155.045
GBTRegressionModel (uid=GBTRegressor_484a91f619a6a418b661) with 10 trees


In [21]:
#GTB Rgeressional Model is looking fine for me as it is working fine 
# Let us save this model so that it can be pulled from the HDFS when required

reloaded_model = PipelineModel.load("/user/edureka_960126/models_gbr")

# Make predictions.
predictions = reloaded_model.transform(testData)
#predictions = reloaded_model.transform(testData.select('features'))

#predictions.select("prediction", "features").show(5)

# Select example rows to display.

predictions.select("prediction", "count" , "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="count" , predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = reloaded_model.stages[1]
print(gbtModel)  # summary only


+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
|47.431993362757424|   64|(7,[3,4,5],[2.46,...|
|47.431993362757424|    5|(7,[3,4,5],[5.74,...|
|   83.253126976595|   59|(7,[3,4,5],[5.74,...|
| 48.62723597983599|   87|(7,[3,4,5],[5.74,...|
| 51.81819961311954|    3|(7,[3,4,5],[5.74,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 147.918
GBTRegressionModel (uid=GBTRegressor_42eb8b371d3f9f535e90) with 10 trees


In [22]:
########----------- Commands were used to check the RDBMS connectivity and read/write operations--------------------

#Connectivity to the RDBMS for making the application working

#Practising the mysql command 
#dataframe_mysql = spark.read.format("jdbc").option("url", 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database').option("driver", "com.mysql.jdbc.Driver").option("dbtable", "emp").option("user", 'edu_labuser').option("password", "edureka").load() 

#dataframe_mysql.show()

#temp_df=train_df.drop("features")

#temp_df.write.format('jdbc').options(url='jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database',driver='com.mysql.jdbc.Driver',dbtable='r_table',user='edu_labuser', password='edureka').save()

#dataframe_mysql = spark.read.format("jdbc").option("url", 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database').option("driver", "com.mysql.jdbc.Driver").option("dbtable", "r_table").option("user", 'edu_labuser').option("password", "edureka").load() 

#dataframe_mysql.show()

### Sample reading of the data from the sample table created in the Database 
dataframe_mysql = spark.read.format("jdbc").option("url", 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database').option("driver", "com.mysql.jdbc.Driver").option("dbtable", "r_table").option("user", 'edu_labuser').option("password", "edureka").load() 
dataframe_mysql.show()

+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|        13|   16|  1|    1|2011|   0|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     8|        32|   40|  1|    1|2011|   1|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     5|        27|   32|  1|    1|2011|   2|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     3|        10|   13|  1|    1|2011|   3|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     0|         1|    1|  1|    1|2011|   4|
|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     0|         1|    1|  1|    1|201

In [23]:
#Part 2 of making an application for reading the file path of the test.csv from the hdfs and then putting the result in the RDBMS

#We may get the input from the user for the file location in the hdfs so as to load the CSV but here we are using the test file provided 
#in the Dataset

#Step 1 loading of the file from the hdfs 


#getting the hdfs file path
TEST_CSV = get_hdfs_filepath('test.csv')

#loading the csv file from the hdfs file path into the data frame
test_df=spark.read.csv(TEST_CSV,inferSchema=True,header=True)


#Getting the features vector from the test.csv for inputting into our stored loaded model

#This is the string indexer used for encoding the categorical variable column with string categories but
#with our schema inference this may not be the case but we are still using this 
stringIndexer = StringIndexer(inputCol = "season", outputCol = 'season' + 'Index')

#One hot encoding of the season category 
OHencoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol="season_cat")

#Pipelining of stages
stages = []
stages += [stringIndexer, OHencoder]

#The columns that need to be assembled for getting the features
assemblerInputs = ["season_cat"] + ['temp','atemp','humidity','windspeed']

#Vector Assembler for converting the assembler inputs into features column with vector of all features for putting inside the ML Model
Vectassembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')

#Now Vector assembler has been added
stages += [Vectassembler]


###Note the above stages were already created in the training of the model phase but still we have created the above steps to show them again while working with the 
#Testing Data

cols = test_df.columns

#Creating the whole pipeline
pipeline = Pipeline(stages = stages)

#Fitting the testing data to the pipeline of features extraction from the testing data
pipelineModel = pipeline.fit(test_df)

test_df = pipelineModel.transform(test_df)

selectedCols = ['features']+cols

test_df = test_df.select(selectedCols)

###Now we work on the predictions part using the stored model

#reloading the model from hdfs

reloaded_model = PipelineModel.load("/user/edureka_960126/models_gbr")

# Make predictions.
predictions = reloaded_model.transform(test_df)


##### Some sample results of error and the sample columns are shown

# Select example rows to display.
predictions.select("prediction","features").show(5)

predictions=predictions.drop("features","indexedFeatures")

predictions.show(5)
##### Storing the resulting predictions based results into the RDBMS

##### Here I am appending the results into the same table but we can take user input and create table according to that
predictions.write.mode('append').format('jdbc').options(url='jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database',driver='com.mysql.jdbc.Driver',dbtable='test_predictions',user='edu_labuser', password='edureka').save()


#reading of the Data again from the table and show the sample of results
dataframe_mysql = spark.read.format("jdbc").option("url", 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database').option("driver", "com.mysql.jdbc.Driver").option("dbtable", "test_predictions").option("user", 'edu_labuser').option("password", "edureka").load() 
dataframe_mysql.show(5)




+------------------+--------------------+
|        prediction|            features|
+------------------+--------------------+
|119.34586314905356|[0.0,0.0,1.0,10.6...|
| 95.81666833649959|[0.0,0.0,1.0,10.6...|
| 95.81666833649959|[0.0,0.0,1.0,10.6...|
|104.79592682312574|[0.0,0.0,1.0,10.6...|
|104.79592682312574|[0.0,0.0,1.0,10.6...|
+------------------+--------------------+
only showing top 5 rows

+--------------------+------+-------+----------+-------+-----+------+--------+---------+------------------+
|            datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|        prediction|
+--------------------+------+-------+----------+-------+-----+------+--------+---------+------------------+
|2011-01-20 00:00:...|     1|      0|         1|      1|10.66|11.365|      56|  26.0027|119.34586314905356|
|2011-01-20 01:00:...|     1|      0|         1|      1|10.66|13.635|      56|      0.0| 95.81666833649959|
|2011-01-20 02:00:...|     1|      0|         1|      1|1

In [None]:
#Part 3 of the Integration of ML Model for prediction with the Flume 

#1. Setup flume to push data into spark flume sink.  
#2. Configure spark streaming to pull data from spark flume sink using receivers 
#and predict the demand using model and persist the result to RDBMS. 
#3. Push messages from flume to test the application. Here application should process and persist the result to RDBMS 

#Architecture to be followed here

from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import PipelineModel

from pyspark import SparkContext

from pyspark.streaming import StreamingContext

from pyspark.streaming.flume import FlumeUtils

import json

from pyspark.sql import Row

#These are the variables set for the machine learning model to be applied over the microbatches
stages = []

assemblerInputs = ["season_1","season_2","season_3"] + ['temp','atemp','humidity','windspeed']
Vectassembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [Vectassembler]

pipeline = Pipeline(stages = stages)


#Model stored in the hdfs to be used for the machine learning application
reloaded_model = PipelineModel.load("/user/edureka_960126/models_gbr")


#Seasons code for converting each season into code
seasons_code={1 : [0.0,0.0,0.0] , 2 : [0.0,1.0,0.0], 3:[0.0,0.0,1.0], 4:[1.0,0.0,0.0]}

#Schema is being set for the dataframe creation out of the records
cSchema = StructType([StructField("season_1", DoubleType(),nullable=True),StructField("season_2", DoubleType(),nullable=True),StructField("season_3", DoubleType(),nullable=True),StructField("weather", IntegerType(),nullable=True),
                    
        StructField("temp", DoubleType(),nullable=True),StructField("atemp", DoubleType(),nullable=True),StructField("humidity", DoubleType(),nullable=True)
                     
                     ,StructField("windspeed", DoubleType(),nullable=True)])




#Process for working out the prediction
def process(rdd):
    
        # Get the singleton instance of SparkSession
        #spark = getSparkSessionInstance(rdd.context.getConf())

        #Conversion to the Data Frame to work on the dataframe for working out the predictions
        df = spark.createDataFrame(rdd,schema=cSchema)

        #Now we work on the dataframe for producing the predictions
        cols = df.columns

        #Getting the output after passing the data frame through vector assembler to produce the dataframe with vectorised features
        pipelineModel = pipeline.fit(df)
        df = pipelineModel.transform(df)

        #dataframe features selection 
        selectedCols = ['features']+cols
        df = df.select(selectedCols)

        # Make predictions.
        temp_df=reloaded_model.transform(df)
        
        temp_df.select("features","prediction").show()
        
        temp_df=temp_df.drop("features","indexedFeatures")
        
        #Storing the Data in the table created for storing the results of the predictions into the RDBMS Table       
        temp_df.write.mode('append').format('jdbc').options(url='jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database',driver='com.mysql.jdbc.Driver',dbtable='streaming_result',user='edu_labuser', password='edureka').save()
        
        #reading of the Data again from the table and show the sample of results
        dataframe_mysql = spark.read.format("jdbc").option("url", 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database').option("driver", "com.mysql.jdbc.Driver").option("dbtable", "streaming_result").option("user", 'edu_labuser').option("password", "edureka").load() 
        dataframe_mysql.show(5)
        

ssc= StreamingContext(spark.sparkContext, 15)

#Flume stream is generated after the spark streaming receiver is connected to custom spark streaming sink created at the host with a given port
flumeStream = FlumeUtils.createPollingStream(ssc, [('ip-20-0-41-164.ec2.internal' , 9090)])

#Flume_microbatches count
flumeStream.count().pprint()

#flumeStream.foreachRDD(lambda rdd: rdd.foreach(sendRecord))  
 
#RDD[Strings]

#Here we get the strings of the json format input data 
lines = flumeStream.map(lambda x: x[1])

#RDD of Dicts or JSON objects by extracting the json objects from the string 
records_dict=lines.map(lambda x: json.loads(x))

#Rows RDD rows rdd is created here
rows_rdd=records_dict.map(lambda res: Row(seasons_code[res['season']][0],seasons_code[res['season']][1],seasons_code[res['season']][2],res['weather'],res['temp'],res['atemp'],res['humidity'],res['windspeed'] ))

rows_rdd.foreachRDD(process)

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate

In [None]:
#### Sample code created By me for checking the working over the input json data which will be streamed to spark streaming through 
#### Flume before I finalised the Part 3 

#Dataset that will be coming in the each push of message at the flume source
import json

from pyspark.sql import Row

seasons_code={1 : [0.0,0.0,0.0] , 2 : [0.0,1.0,0.0], 3:[0.0,0.0,1.0], 4:[1.0,0.0,0.0]}

reloaded_model = PipelineModel.load("/user/edureka_960126/model")

#Sample input request from http source through Flume in microbatches

x='{"season" : 1 , "weather" :2 , "temp" : 13.4 , "atemp" : 14.6 , "humidity" : 84.0 , "windspeed" : 18.0}'

res = json.loads(x)

features=seasons_code[res['season']]+[res['weather'],res['temp'],res['atemp'],res['humidity'],res['windspeed'] ]

r = Row(features)

cSchema = StructType([StructField("season_1", DoubleType(),nullable=True),StructField("season_2", DoubleType(),nullable=True),StructField("season_3", DoubleType(),nullable=True),StructField("weather", IntegerType(),nullable=True),
                    
        StructField("temp", DoubleType(),nullable=True),StructField("atemp", DoubleType(),nullable=True),StructField("humidity", DoubleType(),nullable=True)
                     
                     ,StructField("windspeed", DoubleType(),nullable=True)])

df=spark.createDataFrame(r,schema=cSchema)

df.show()


#stringIndexer = StringIndexer(inputCol = "season", outputCol = 'season' + 'Index')

#OHencoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol="season_cat")

stages = []

#stages += [stringIndexer, OHencoder]

assemblerInputs = ["season_1","season_2","season_3"] + ['temp','atemp','humidity','windspeed']

Vectassembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')

stages += [Vectassembler]

cols = df.columns

pipeline = Pipeline(stages = stages)

pipelineModel = pipeline.fit(df)

df = pipelineModel.transform(df)

selectedCols = ['features']+cols

df = df.select(selectedCols)



# Make predictions.
reloaded_model.transform(df).select("features","prediction").show()