# Business Problem
- Use the bike sharing dataset to predict the number of bike rentals per hour.

# Data Preprocessing


In [0]:
# List the datasets available under /databricks-datasets to locate the bike-sharing data.
display(dbutils.fs.ls('/databricks-datasets'))

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,1727904900716
dbfs:/databricks-datasets/README.md,README.md,976,1532502332000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,1727904900716
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455505270000
dbfs:/databricks-datasets/adult/,adult/,0,1727904900716
dbfs:/databricks-datasets/airlines/,airlines/,0,1727904900716
dbfs:/databricks-datasets/amazon/,amazon/,0,1727904900716
dbfs:/databricks-datasets/asa/,asa/,0,1727904900716
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,1727904900716
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,1727904900716


In [0]:
# Load the hourly bike-sharing CSV; the first line is a header and column
# types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/databricks-datasets/bikeSharing/data-001/hour.csv')
)
# Cache because the frame is reused by several later cells.
df.cache()

DataFrame[instant: int, dteday: date, season: int, yr: int, mnth: int, hr: int, holiday: int, weekday: int, workingday: int, weathersit: int, temp: double, atemp: double, hum: double, windspeed: double, casual: int, registered: int, cnt: int]

In [0]:
# Peek at the first row to sanity-check the parsed columns and types.
df.head()

Row(instant=1, dteday=datetime.date(2011, 1, 1), season=1, yr=0, mnth=1, hr=0, holiday=0, weekday=6, workingday=0, weathersit=1, temp=0.24, atemp=0.2879, hum=0.81, windspeed=0.0, casual=3, registered=13, cnt=16)

In [0]:
# Number of hourly records in the raw dataset.
df.count()

17379

In [0]:
# Remove columns that must not be used as features: the row id, the raw date,
# and the casual/registered counts.
# NOTE(review): casual + registered appears to equal cnt (3 + 13 = 16 in the
# first row), so keeping them would leak the label — confirm.
df = df.drop("instant", "dteday", "casual", "registered")

In [0]:
# Verify which columns remain after the drop; "cnt" is the label.
df.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- cnt: integer (nullable = true)



In [0]:
# Split the data into training and test sets.
# Each of the 4 seasons is split 80/20 on its own, and the per-season
# pieces are concatenated in season order.
from pyspark.sql.functions import col
from pyspark.sql import DataFrame

def split_seasons(df: DataFrame, seasons=(1, 2, 3, 4), weights=(0.8, 0.2), seed=42):
    """Split ``df`` into train and test sets, one season at a time.

    Rows for each value in ``seasons`` are filtered out of ``df`` and randomly
    split on their own; the per-season pieces are then unioned in the order
    given by ``seasons``.

    Args:
        df: DataFrame containing a ``season`` column.
        seasons: Season values to include, in the order they should appear
            in the output.
        weights: ``(train, test)`` weights passed to ``randomSplit``.
        seed: Seed passed to every ``randomSplit`` call.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.

    Raises:
        ValueError: If ``seasons`` is empty.
    """
    from functools import reduce

    seasons = list(seasons)
    if not seasons:
        raise ValueError("seasons must contain at least one value")

    train_dfs = []
    test_dfs = []
    for season in seasons:
        season_df = df.filter(col("season") == season)
        train, test = season_df.randomSplit(list(weights), seed=seed)
        train_dfs.append(train)
        test_dfs.append(test)

    # Fold the pieces together so the union works for any number of seasons,
    # not just exactly four.
    train_df = reduce(lambda left, right: left.union(right), train_dfs)
    test_df = reduce(lambda left, right: left.union(right), test_dfs)

    return train_df, test_df

# Build the per-season train/test splits and preview both.
train_df, test_df = split_seasons(df)
display(train_df)
display(test_df)

season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
1,0,1,0,0,0,0,1,0.04,0.0758,0.57,0.1045,22
1,0,1,0,0,0,0,1,0.1,0.0758,0.42,0.3881,25
1,0,1,0,0,0,0,1,0.26,0.303,0.56,0.0,39
1,0,1,0,0,0,0,2,0.46,0.4545,0.88,0.2985,17
1,0,1,0,0,1,1,1,0.06,0.0606,0.41,0.194,7
1,0,1,0,0,1,1,1,0.22,0.197,0.44,0.3582,5
1,0,1,0,0,2,1,1,0.14,0.1667,0.59,0.1045,12
1,0,1,0,0,2,1,1,0.16,0.1818,0.55,0.1045,5
1,0,1,0,0,2,1,2,0.16,0.1364,0.69,0.2836,9
1,0,1,0,0,3,1,1,0.2,0.2576,0.64,0.0,6


season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
1,0,1,0,0,0,0,1,0.16,0.1818,0.8,0.1045,33
1,0,1,0,0,1,1,1,0.12,0.1212,0.5,0.2836,5
1,0,1,0,0,1,1,2,0.24,0.2273,0.65,0.2239,7
1,0,1,0,0,3,1,2,0.16,0.197,0.86,0.0896,7
1,0,1,0,0,5,1,1,0.12,0.1364,0.5,0.194,14
1,0,1,0,0,6,0,1,0.04,0.0303,0.45,0.2537,13
1,0,1,1,0,0,0,1,0.04,0.0758,0.57,0.1045,13
1,0,1,1,0,1,1,1,0.12,0.1212,0.5,0.2836,1
1,0,1,1,0,4,1,1,0.14,0.1212,0.5,0.2836,2
1,0,1,1,0,4,1,1,0.16,0.2273,0.64,0.0,4


In [0]:
# Confirm that every season is represented in both splits.
display(train_df.select("season").distinct())
display(test_df.select("season").distinct())

season
1
2
3
4


season
1
2
3
4


In [0]:
# Rental count by hour of day on the training split (rendered as a Databricks visualization).
display(train_df.select("hr","cnt"))

hr,cnt
0,22
0,25
0,39
0,17
0,7
0,5
0,12
0,5
0,9
0,6


Databricks visualization. Run in Databricks to view.

In [0]:
# Same hour-vs-count view for the test split, to compare against the training split.
display(test_df.select("hr","cnt"))

hr,cnt
0,33
0,5
0,7
0,7
0,14
0,13
1,13
1,1
1,2
1,4


Databricks visualization. Run in Databricks to view.

# Train the ML pipeline

### A few steps to be done
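- `VectorAssembler` assembles the feature columns into a single feature vector
- `VectorIndexer` identifies which features should be treated as categorical
- `GBTRegressor` is the model that predicts the rental count
- `CrossValidator` tunes the regressor's hyperparameters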


In [0]:
from pyspark.ml.feature import VectorAssembler,VectorIndexer

# Every column except the label ("cnt") is a model input.
featureCols = list(df.columns)
featureCols.remove("cnt")

# Pack the individual feature columns into one vector column.
vectorAssembler = VectorAssembler(
    inputCols=featureCols,
    outputCol="rawFeatures",
)

# Features with at most 4 distinct values are indexed as categorical; the
# rest are left as continuous.
# NOTE(review): hr, mnth and weekday presumably have more than 4 distinct
# values and will therefore be treated as continuous — confirm this is intended.
vectorIndexer = VectorIndexer(
    inputCol="rawFeatures",
    outputCol="features",
    maxCategories=4,
)

In [0]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted tree regressor predicting "cnt". maxIter=2 is only the
# starting value: the cross-validation parameter grid also sets gbt.maxIter.
gbt = GBTRegressor(labelCol="cnt", maxIter=2)

In [0]:
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Hyperparameter grid: 3 tree depths x 2 iteration counts = 6 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [5, 7, 10])
    .addGrid(gbt.maxIter, [2, 5])
    .build()
)

# Score candidates by RMSE, reading the label and prediction column names
# from the regressor so the two stay in sync.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
)

# 2-fold cross-validation over the grid. The seed is fixed (matching the
# seed used for the train/test split) so fold assignment is reproducible.
cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    seed=42,
)
      
      

In [0]:
from pyspark.ml import Pipeline 
# Chain feature assembly, categorical indexing and the cross-validated regressor into one estimator.
pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer,cv])

In [0]:
# Fit all pipeline stages on the training split; the cross-validation grid search runs here.
pipelineModel = pipeline.fit(train_df)

# Make predictions and evaluate


In [0]:
# Score the held-out test split with the fitted pipeline.
predictions  = pipelineModel.transform(test_df)

In [0]:
# Show actual vs. predicted counts next to the input features.
display(predictions.select("cnt","prediction",*featureCols))

cnt,prediction,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
33,22.26313143175853,1,0,1,0,0,0,0,1,0.16,0.1818,0.8,0.1045
5,10.221963814637018,1,0,1,0,0,1,1,1,0.12,0.1212,0.5,0.2836
7,15.98847153072368,1,0,1,0,0,1,1,2,0.24,0.2273,0.65,0.2239
7,10.572782207872647,1,0,1,0,0,3,1,2,0.16,0.197,0.86,0.0896
14,12.997364884155736,1,0,1,0,0,5,1,1,0.12,0.1364,0.5,0.194
13,21.899005680577883,1,0,1,0,0,6,0,1,0.04,0.0303,0.45,0.2537
13,25.69101999147162,1,0,1,1,0,0,0,1,0.04,0.0758,0.57,0.1045
1,4.041135927926781,1,0,1,1,0,1,1,1,0.12,0.1212,0.5,0.2836
2,4.041135927926781,1,0,1,1,0,4,1,1,0.14,0.1212,0.5,0.2836
4,5.615989388220726,1,0,1,1,0,4,1,1,0.16,0.2273,0.64,0.0


In [0]:
# RMSE on the test split, computed with the same evaluator used for cross-validation.
rmse = evaluator.evaluate(predictions)
print(rmse)

55.981180299629955


In [0]:
import pyspark.sql.functions as F
# Residual = actual count minus predicted count.
predictions_with_residuals = predictions.withColumn(
    "residual",
    F.col("cnt") - F.col("prediction"),
)
# Mean residual over the test split.
display(predictions_with_residuals.agg(F.avg("residual")))

avg(residual)
-1.9083890694392285
