# Start a SparkSession

In [1]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook under the application name 'RG';
# getOrCreate() returns the existing session when one is already active.
spark = (
    SparkSession.builder
    .appName('RG')
    .getOrCreate()
)

# Load data from HDFS

In [2]:
# Read the hourly dataset from HDFS as CSV, taking column names from the
# header row. No schema is supplied, so the columns are cast in a later cell.
hour_df = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load("hdfs://mycluster/user/oracle/rg/rg_data/hour.csv")
)

In [None]:
# List the column names of the freshly loaded DataFrame.
print(hour_df.columns)

In [3]:
# Drop the columns that are not used for building the model; every other
# column is kept.
hour_df = hour_df.drop("instant", "dteday", "yr", "casual", "registered")

In [None]:
# Print the DataFrame object itself after the drop.
# NOTE(review): by default this presumably shows only the column names/types,
# not rows -- use show() to look at data.
print(hour_df)

In [None]:
# Show the schema as loaded from the CSV. printSchema() writes the schema to
# stdout itself and returns None, so it must not be wrapped in print()
# (doing so emits a stray "None" line after the schema).
hour_df.printSchema()

In [4]:
# Convert every remaining column to double, keeping the original column names.
from pyspark.sql.functions import col

hour_df = hour_df.select(
    *(col(name).cast("double").alias(name) for name in hour_df.columns)
)

In [None]:
# Check the schema again after the cast to double in the previous cell.
hour_df.printSchema()

In [None]:
# Peek at the first two rows of the prepared data.
hour_df.show(n=2)

# Split data into training set and test set

In [5]:
# 70/30 train/test split. A fixed seed makes the split (and therefore every
# metric computed downstream) repeatable across runs; without it each
# evaluation of the split draws a different random sample.
train_df, test_df = hour_df.randomSplit([0.7, 0.3], seed=42)
# Row counts of the two partitions, displayed as the cell output.
train_df.count(), test_df.count()

(12323, 5056)

# Build a pipeline model: DecisionTreeRegressor 

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# "cnt" is the label column; every other column is used as a feature.
# The label is excluded by name rather than by position so that a change in
# column order cannot silently leak the label into the feature list.
featuresCol = [name for name in hour_df.columns if name != "cnt"]
print(featuresCol)

['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']


In [None]:
#VectorIndexer
#Whether a feature is treated as continuous or categorical is decided by the maxCategories parameter.
#Example: feature 0 has unique values {-1.0, 0.0}, and feature 1 has unique values {1.0, 3.0, 5.0}.
#If maxCategories = 2, then feature 0 -> categorical (2 distinct values <= maxCategories)
#                           feature 1 -> continuous  (3 distinct values >  maxCategories)

In [7]:
# Stage 1: pack the individual feature columns into one vector column.
vetorAssembler = VectorAssembler(
    inputCols=featuresCol,
    outputCol="aFeatures",
)
# Stage 2: VectorIndexer indexes categorical features inside the vector. It
# decides automatically which features are categorical (using maxCategories)
# and converts their original values to category indices.
vectorIndexer = VectorIndexer(
    inputCol="aFeatures",
    outputCol="features",
    maxCategories=24,
)
# Stage 3: decision-tree regressor predicting "cnt" from the indexed features.
dt = DecisionTreeRegressor(
    labelCol="cnt",
    featuresCol="features",
)

dt_pipeline = Pipeline(stages=[vetorAssembler, vectorIndexer, dt])

In [8]:
# Display the stages configured on the (not yet fitted) pipeline.
dt_pipeline.getStages()

[VectorAssembler_4bbab2fe2bc72a7e5be2,
 VectorIndexer_4d0d83ef9874d2e111fb,
 DecisionTreeRegressor_4753abeb3b6f3568361e]

# Train a model

In [9]:
# Fit all pipeline stages on the training split only; the test split is kept
# for evaluation.
dt_pipelineModel=dt_pipeline.fit(train_df)

In [10]:
# Inspect the second fitted stage (index 1), i.e. the stage produced by the
# VectorIndexer given the stage order used when the pipeline was built.
dt_pipelineModel.stages[1]

VectorIndexer_4d0d83ef9874d2e111fb

# Evaluate the model: DecisionTreeRegressor

In [11]:
# Apply the fitted pipeline to the held-out test split to obtain predictions.
predicted_df=dt_pipelineModel.transform(test_df)

In [None]:
# List the columns of the transformed test set.
print(predicted_df.columns)

In [12]:
# Show the input features next to the true count and the predicted count for
# the first five test rows.
predicted_df.select([
    'season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
    'cnt', 'prediction',
]).show(5)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|        prediction|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.04|0.0758|0.57|   0.1045|22.0|55.222972972972975|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.16|0.1364| 0.8|   0.2985|52.0|55.222972972972975|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.16|0.1818| 0.8|   0.1045|33.0|55.222972972972975|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.38|0.3939| 0.4|   0.2836|91.0|55.222972972972975|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       2.0|0.46|0.4545|0.88|   0.2985|17.0|55.222972972972975|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
only showing top 5 rows



In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared error between the true label ("cnt") and the
# "prediction" column on the decision-tree predictions for the test split.
evaluator = RegressionEvaluator(labelCol='cnt',
                                predictionCol='prediction',
                                metricName='rmse')
# Only `rmse` receives the metric. The previous chained assignment
# (`rmse=evaluator = ...`) also rebound `evaluator` to the float result,
# destroying the evaluator object for any later use.
rmse = evaluator.evaluate(predicted_df)
rmse

92.1248621372482

# Build a pipeline model #2 : GBTRegressor

In [21]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Feature preparation: assemble the feature columns into one vector, then
# index the categorical features (at most 24 distinct values) inside it.
vetorAssembler = VectorAssembler(inputCols=featuresCol, outputCol="aFeatures")
vectorIndexer = VectorIndexer(inputCol="aFeatures", outputCol="features", maxCategories=24)
# Fixed seed so repeated runs train the same model.
gbt = GBTRegressor(labelCol="cnt", featuresCol="features", seed=42)

# Hyper-parameter grid: 2 x 2 x 2 = 8 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [10, 50])
    .addGrid(gbt.maxDepth, [5, 10])
    .addGrid(gbt.maxBins, [25, 40])
    .build()
)

# Candidates are compared by RMSE between "cnt" and "prediction".
evaluator = RegressionEvaluator(labelCol='cnt',
                                predictionCol='prediction',
                                metricName='rmse')

# 3-fold cross-validation over the grid. The seed fixes the fold assignment,
# so the selected hyper-parameters are reproducible from run to run.
cv = CrossValidator(estimator=gbt, evaluator=evaluator,
                    estimatorParamMaps=paramGrid, numFolds=3,
                    seed=42)

# The cross-validator is the final pipeline stage: the feature stages are
# fitted first and the tuning runs on their output.
gbt_pipeline = Pipeline(stages=[vetorAssembler, vectorIndexer, cv])

# Train a model

In [22]:
# Fit the feature stages and run the cross-validated grid search on the
# training split. One model is trained per grid point per fold, so this cell
# is the expensive one in the notebook.
gbt_pipelineModel=gbt_pipeline.fit(train_df)

In [25]:
# Score the held-out test split with the tuned GBT pipeline.
gbt_predicted_df = gbt_pipelineModel.transform(test_df)

# Show features, true count and predicted count for the first five rows.
gbt_predicted_df.select([
    'season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
    'cnt', 'prediction',
]).show(5)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|        prediction|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.04|0.0758|0.57|   0.1045|22.0|39.592129753197035|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.16|0.1364| 0.8|   0.2985|52.0| 34.80160910943228|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.16|0.1818| 0.8|   0.1045|33.0| 46.74697361272475|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.38|0.3939| 0.4|   0.2836|91.0| 66.79428289954987|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       2.0|0.46|0.4545|0.88|   0.2985|17.0|2.4646612426891936|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
only showing top 5 rows



# Evaluate the model:  GBTRegressor

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared error between the true label ("cnt") and the
# "prediction" column on the GBT predictions for the test split.
evaluator = RegressionEvaluator(labelCol='cnt',
                                predictionCol='prediction',
                                metricName='rmse')
# Only `rmse` receives the metric. The previous chained assignment
# (`rmse=evaluator = ...`) also rebound `evaluator` to the float result,
# destroying the evaluator object for any later use.
rmse = evaluator.evaluate(gbt_predicted_df)
rmse

69.77893608113733

In [28]:
# Persist the fitted pipeline to HDFS. A plain save() raises if the target
# path already exists, which breaks re-running the notebook; overwrite()
# replaces the previously saved model instead.
gbt_pipelineModel.write().overwrite().save("hdfs://mycluster/user/oracle/rg/rg_model")

In [30]:
from pyspark.ml import Pipeline, PipelineModel

# What was saved is a fitted PipelineModel, not an unfitted Pipeline, so it
# has to be read back with PipelineModel.load to get an object that can
# transform data.
reloaded_gbt_model = PipelineModel.load("hdfs://mycluster/user/oracle/rg/rg_model")


In [31]:
# Display the reloaded object to confirm what was read back from HDFS.
reloaded_gbt_model

PipelineModel_4cf88f1786b3870c6fea