# Basic Regression using PySpark ML Pipeline
1. Imports
2. Load Data<br> &emsp;2.1 Data pre-processing<br>
3. Test Pipeline components<br>&nbsp;3.1 Encoding data<br>    &emsp;3.1.1 Ordinal encoding<br>    &emsp;3.1.2 One-hot encoding<br>    &emsp;3.1.3 Assembling features<br>&nbsp;3.2 Linear Regression<br>
4. Testing Pipeline<br>&nbsp;4.1 Split Data<br>&nbsp;4.2 Test Pipeline<br>
5. Pipeline + Cross-validation + Tuning<br>&nbsp;5.1 Define Pipeline<br>&nbsp;5.2 Define Hyper-parameter grid<br>&nbsp;5.3 Define Cross validation params<br>&nbsp;5.4 Get best CV model<br>&nbsp;5.5 Predictions from best CV model<br>&nbsp;5.6 Compute Eval metrics<br>    
        

## 1. Imports

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import  StringIndexer, OneHotEncoder,VectorAssembler,StandardScaler
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from sklearn.metrics import r2_score

In [0]:
# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

## 2. Load Data

In [0]:
# Load the insurance CSV, taking column names from its header row.
# No schema is supplied or inferred here; numeric columns are cast explicitly
# in the pre-processing step that follows.
csv_reader = spark.read.format("csv").option("header", "true")
df1 = csv_reader.load("dbfs:/FileStore/shared_uploads/{user-directory}/insurance.csv")
# `data_sparkdf` is the working name used by the rest of the notebook;
# `df1` stays bound to the same DataFrame.
data_sparkdf = df1

In [0]:
# Inspect the column names and types of the freshly loaded data.
data_sparkdf.printSchema()

In [0]:
# Preview the first two rows.
data_sparkdf.show(2)

### 2.1 Data pre-processing

In [0]:
### Datatype conversion
# Target Spark type for each numeric column; the casts are applied in this order.
numeric_casts = {
    "age": 'integer',
    "bmi": 'float',
    "children": 'integer',
    "charges": 'float',
}
for column_name, spark_type in numeric_casts.items():
    # withColumn with an existing name replaces that column in place.
    data_sparkdf = data_sparkdf.withColumn(column_name, col(column_name).cast(spark_type))

# Confirm the new column types.
data_sparkdf.printSchema()

## 3. Test Pipeline components

### 3.1 Encoding data

#### 3.1.1 Ordinal encoding

In [0]:
### Converting categorical columns to indices for one-hot encoding
categorical_cols = ['sex', 'smoker', 'region']
indexed_cols = ['sex_ind', 'smoker_ind', 'region_ind']

# `indexer` is the fitted indexer model (the result of .fit()), not the estimator.
indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols).fit(data_sparkdf)
# Adds the three *_ind columns next to the original string columns.
data_indexed_sparkdf = indexer.transform(data_sparkdf)

In [0]:
# Preview the indexed data (original columns plus the *_ind columns).
data_indexed_sparkdf.show(2)

#### 3.1.2 One-hot encoding

In [0]:
### One-hot encode the index columns produced by the StringIndexer step
index_input_cols = ['sex_ind', 'smoker_ind', 'region_ind']
encoded_output_cols = ['sex_encoded', 'smoker_encoded', 'region_encoded']

# `ohe` is the fitted encoder model (the result of .fit()), not the estimator.
ohe = OneHotEncoder(inputCols=index_input_cols, outputCols=encoded_output_cols).fit(data_indexed_sparkdf)
# Adds the three *_encoded columns.
data_ohe_sparkdf = ohe.transform(data_indexed_sparkdf)

In [0]:
# Preview the one-hot encoded data.
data_ohe_sparkdf.show(2)

In [0]:
# List all column names available after encoding (displayed as the cell output).
data_ohe_sparkdf.columns

#### 3.1.3 Assembling features

In [0]:
### Combine the numeric columns and the one-hot encoded columns into one vector column
feature_cols = [
    'age', 'bmi', 'children',
    'sex_encoded', 'smoker_encoded', 'region_encoded',
]
vecAssembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
# VectorAssembler is a plain transformer, so it is applied directly without a fit step.
data_vec_sparkdf = vecAssembler.transform(data_ohe_sparkdf)

In [0]:
# Preview the first five rows, including the assembled "features" column.
data_vec_sparkdf.show(5)

### 3.2 Linear Regression

In [0]:
### Split data into Train & Test [80%,20%]
# The fixed seed keeps the random split the same across runs.
train_sparkdf, test_sparkdf = data_vec_sparkdf.randomSplit([0.8, 0.2], seed=123)

In [0]:
# Fit a linear regression on the training split with 'charges' as the target.
# NOTE: `lr` holds the fitted model returned by .fit(), not the estimator.
lr = LinearRegression(labelCol='charges',maxIter=10, regParam=0.01).fit(train_sparkdf)

In [0]:
# Evaluate the fitted model on the held-out test split; the metrics are read from the result below.
pred_results=lr.evaluate(test_sparkdf)

In [0]:
### Performance Metrics
# R2 on the test split (displayed as the cell output).
pred_results.r2

## 4. Testing Pipeline

### 4.1 Split Data

In [0]:
### Rename column to default name 'label'
# The pipeline's LinearRegression and the RegressionEvaluator used later are
# created without labelCol, so the target column must be called 'label'.
data_sparkdf = data_sparkdf.withColumnRenamed('charges','label')
data_sparkdf.show(2)

In [0]:
### Split data into Train & Test [80%,20%]
# Split the un-encoded data: the encoding steps now run inside the pipeline.
train_sparkdf, test_sparkdf = data_sparkdf.randomSplit([0.8, 0.2], seed=123)
# Print the (row count, column count) of the training split.
print((train_sparkdf.count(), len(train_sparkdf.columns)))

### 4.2 Test Pipeline

In [0]:
### Column groups shared by the pipeline stages
categorical_cols = ['sex', 'smoker', 'region']
indexed_cols = ['sex_ind', 'smoker_ind', 'region_ind']
encoded_cols = ['sex_encoded', 'smoker_encoded', 'region_encoded']
numeric_cols = ['age', 'bmi', 'children']

### Ordinal encoding (unfitted estimator; the pipeline fits it)
indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)
### One-hot encoding (unfitted estimator; the pipeline fits it)
ohe = OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)
### Assembling features: numeric columns first, then the encoded columns
vecAssembler = VectorAssembler(inputCols=numeric_cols + encoded_cols, outputCol="features")

### Linear regression model (uses the default 'label' / 'features' column names)
lr = LinearRegression(maxIter=10, regParam=0.01)

### Define pipeline: stages run in list order
pipeline = Pipeline(stages=[indexer, ohe, vecAssembler, lr])


In [0]:
### Fit the pipeline (all stages, in order) on the training data.
model = pipeline.fit(train_sparkdf)

In [0]:
### Transform the test data with the fitted pipeline and keep only the columns needed for scoring
result_sparkdf = model.transform(test_sparkdf).select("features", "label", "prediction")
# Convert to a pandas DataFrame so scikit-learn's r2_score can be applied to it.
result_df = result_sparkdf.toPandas()

In [0]:
### Compute R2 score on the test split (displayed as the cell output)
r2_score(result_df.label,result_df.prediction)

## 5. Pipeline + Cross-validation + Tuning

### 5.1 Define Pipeline

In [0]:
### Column groups shared by the pipeline stages
raw_categorical = ['sex', 'smoker', 'region']
categorical_index = ['sex_ind', 'smoker_ind', 'region_ind']
categorical_onehot = ['sex_encoded', 'smoker_encoded', 'region_encoded']
raw_numeric = ['age', 'bmi', 'children']

### Ordinal encoding
indexer = StringIndexer(inputCols=raw_categorical, outputCols=categorical_index)
### One-hot encoding
ohe = OneHotEncoder(inputCols=categorical_index, outputCols=categorical_onehot)
### Assembling features: numeric columns first, then the encoded columns
vecAssembler = VectorAssembler(
    inputCols=raw_numeric + categorical_onehot,
    outputCol="features",
)

### Linear regression model
# regParam is deliberately left unset here; it is supplied by the hyper-parameter grid.
lr = LinearRegression(maxIter=10)

### Define pipeline: stages run in list order
pipeline = Pipeline(stages=[indexer, ohe, vecAssembler, lr])


### 5.2 Define Hyper-parameter grid

In [0]:
### Grid hyper parameter search
# Candidate regularization strengths for the LinearRegression stage;
# the grid holds one parameter map per candidate.
reg_param_candidates = [0.1, 0.01, 0.001]
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, reg_param_candidates)
    .build()
)

### 5.3 Define Cross validation params

In [0]:
### Cross validation
# 5-fold cross-validation over the whole pipeline, fitting it once per fold
# for every parameter map in paramGrid.
# The evaluator's arguments are spelled out (they equal its defaults) so the
# model-selection criterion is visible: RMSE on 'label' vs 'prediction'.
# The seed matches the one used for the train/test split so fold assignment,
# and therefore the selected regParam, is reproducible across runs.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol='label',
                                                        predictionCol='prediction',
                                                        metricName='rmse'),
                          numFolds=5,
                          seed=123)

### 5.4 Get best CV model

In [0]:
### Run cross-validation on the training split, and choose the best set of parameters.
cvModel = crossval.fit(train_sparkdf)

### 5.5 Predictions from best CV model

In [0]:
### Make predictions on the test data. cvModel uses the best model found during cross-validation.
prediction = cvModel.transform(test_sparkdf).select("features", "label", "prediction")

### 5.6 Compute Eval metrics

In [0]:
### Convert to Pandas DF to compute evaluation metrics
result_df = prediction.toPandas()
### Compute R2 score of the best cross-validated model on the test split
print(r2_score(result_df.label,result_df.prediction))