# Gradient Boosting with Pipeline

In [None]:
import pyspark
import sys

In [None]:
import pyspark.sql.functions as fn

In [None]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [None]:
# Show the name of the running Spark application.
# NOTE(review): `spark` is not created anywhere in this notebook; it is
# presumably injected by the environment (e.g. a pyspark kernel) -- confirm.
spark.sparkContext.appName

In [None]:
# Switch on the Arrow setting for PySpark SQL execution for this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [None]:
# Runtime versions, shown as cell output.
# Python interpreter version string
sys.version

In [None]:
# Version of Spark behind the `spark` session
spark.version

### Exploring Data

In [None]:
# Read the bike data CSV into a Spark DataFrame. The first row holds the
# column names, and column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/bike/hour.csv')
)

In [None]:
# Validate the size of the data: (number of rows, number of columns)
df.count(), len(df.columns)

In [None]:
# First 5 rows of the bike dataset
df.show(5)

In [None]:
# Column names and the types inferred while reading the CSV
df.printSchema()

### Feature Engineering

In [None]:
# Look at the three count columns side by side (first 10 rows)
df.select('casual', 'registered', 'cnt').show(10)

In [None]:
# Remove the columns that are not used for modelling:
# 'instant', 'dteday', 'casual' and 'registered'.
df = df.drop('instant', 'dteday', 'casual', 'registered')

In [None]:
# Check the data after dropping the unused columns (first 10 rows)
df.show(10)

In [None]:
# Schema of the remaining columns
df.printSchema()

### Split Data - Train & Test sets

In [None]:
# Randomly split the data into training (weight 0.70) and test (weight 0.30)
# sets; the seed is fixed so the split is repeatable.
train_df, test_df = df.randomSplit([0.70, 0.30], seed=42)

In [None]:
# Training set size: (number of rows, number of columns)
train_df.count(), len(train_df.columns)

In [None]:
# Test set size: (number of rows, number of columns)
test_df.count(), len(test_df.columns)

### Build GBT Regression Model using Pipeline 

In [None]:
# Use every column except the label ('cnt') as a model feature. Selecting by
# name rather than slicing off the last column keeps this correct even if the
# column order of the input data changes.
featuresCols = [c for c in df.columns if c != 'cnt']

In [None]:
# Display the list of feature column names
featuresCols

In [None]:
# Pipeline stage that concatenates all feature columns into a single vector
# column named "features"
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="features")

In [None]:
# Gradient-boosted tree regressor: reads the "features" vector column and
# learns to predict the "cnt" column. The features column is named
# explicitly here; "features" is also the library default.
gbt = GBTRegressor(featuresCol="features", labelCol="cnt")

In [None]:
# Hyper-parameter grid for model tuning: every combination of
# maxDepth in {2, 5} and maxIter in {10, 100}, i.e. 4 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 5])
    .addGrid(gbt.maxIter, [10, 100])
    .build()
)

In [None]:
# Evaluation metric: root-mean-squared error between the true labels and the
# predictions. The column names are taken from `gbt` so the evaluator always
# reads the same label/prediction columns the regressor uses.
evaluator = RegressionEvaluator(
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
    metricName="rmse",
)

In [None]:
# Declare the CrossValidator, which runs model tuning for us: it tries each
# parameter combination in `paramGrid` on `gbt` and scores it with `evaluator`.
# numFolds is spelled out (3 is the library default) and the seed is fixed so
# the fold assignment, and therefore the selected model, is repeatable across
# sessions, in line with the seeded train/test split.
cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [None]:
# Tie feature processing and model training together into a single Pipeline:
# stage 0 assembles the feature vector, stage 1 is the cross-validated GBT.
pipeline = Pipeline(stages=[vectorAssembler, cv])

In [None]:
%%time
# Train the Pipeline on the training set: the feature vector is assembled and
# then the cross-validated parameter search is run. (%%time is an IPython
# cell magic that reports how long the cell takes; it must stay on the first
# line of the cell.)
pipelineModel = pipeline.fit(train_df)

In [None]:
# maxDepth of the model selected by cross-validation (stage 1 of the fitted
# pipeline is the fitted CrossValidator). Read through the model's public
# getter rather than the private `_java_obj` handle.
pipelineModel.stages[1].bestModel.getMaxDepth()

In [None]:
# maxIter of the model selected by cross-validation (stage 1 of the fitted
# pipeline is the fitted CrossValidator). Read through the model's public
# getter rather than the private `_java_obj` handle.
pipelineModel.stages[1].bestModel.getMaxIter()

### Evaluate Model

In [None]:
# Make predictions on the held-out test data with the fitted pipeline
predictions = pipelineModel.transform(test_df)

In [None]:
# Compare the actual counts with the model's predictions
# (first 10 rows, values not truncated)
predictions.select('cnt', 'prediction').show(n=10, truncate=False)

In [None]:
# RMSE of the predictions on the test set
rmse = evaluator.evaluate(predictions)

In [None]:
# Display the test-set RMSE
rmse