# Solution for Assignment 2 Question 2 Scalable ML

In [None]:
import pyspark

from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running in local mode under the
# application name used for this assignment.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Assignment2_Questions2")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and restrict Spark's console
# logging to warnings and above.
sc = spark.sparkContext
sc.setLogLevel("WARN")

## Reading Data - Qs 2.1

In [None]:
from pyspark.sql.functions import isnan, when, count, col

# Load the training CSV (first row is the header) and let Spark infer the
# column types. The raw frame is cached because every later step derives
# from it.
df = spark.read.load("/data/ac1ash//train_set.csv",format="csv", inferSchema="true", header="true").cache()

# Replace every "?" placeholder with a null. All columns are rewritten in a
# single projection: calling withColumn once per column stacks one projection
# per column, so the query plan grows with the width of the table and can
# become very slow to analyse (or overflow the stack) on wide data sets.
# Column names and column order are unchanged.
df = df.select([
    when(col(name) == '?', None).otherwise(col(name)).alias(name)
    for name in df.columns
])

# Discard every row that still contains a null in any column.
df = df.na.drop()

## Dealing with the categorical features 

In [None]:
from pyspark.ml.feature import StringIndexer,VectorAssembler,Binarizer
from pyspark.ml import Pipeline

# Number of columns named Cat1 ... CatN whose indexed copies are used below.
Num_Categorical = 12

# Names of the columns to be string-indexed: positions 5-20 of the frame plus
# position 29.
index_feature = df.columns[5:21] + df.columns[29:30]

# One StringIndexer per selected column. The encoded values are written to a
# new column called "<original name>_Num", leaving the original in place.
Indexers = []
for name in index_feature:
    Indexers.append(StringIndexer(inputCol=name, outputCol=name + "_Num"))

# Stage 1: index all the categorical columns.
pipeline1 = Pipeline(stages=Indexers)

# Final list of model inputs: the columns that were not indexed (positions
# 1-4, 21-28 and 30-33) followed by the "_Num" columns produced above.
# NOTE(review): the hard-coded "_Num" names assume the file's column layout
# matches index_feature - confirm against the CSV header.
which_cols = df.columns[1:5] + df.columns[21:29] + df.columns[30:34]
which_cols += ["Cat%d_Num" % n for n in range(1, Num_Categorical + 1)]
which_cols += [
    "NVCat_Num",
    "Blind_Make_Num",
    "Blind_Model_Num",
    "Blind_Submodel_Num",
    "OrdCat_Num",
]

# Stage 2: pack the selected inputs into a single "features" vector column.
feature_assembler = VectorAssembler(inputCols=which_cols, outputCol="features")
pipeline2 = Pipeline(stages=[feature_assembler])

# Stage 3: derive a binary "label" from Claim_Amount (1.0 when the amount is
# above the 0.0001 threshold, 0.0 otherwise) for the claim / no-claim
# classifier.
claim_binarizer = Binarizer(threshold=0.0001, inputCol="Claim_Amount", outputCol="label")
pipeline3 = Pipeline(stages=[claim_binarizer])

# Chain the three stages and push the cleaned data through them.
pipeline = Pipeline(stages=[pipeline1, pipeline2, pipeline3])
fitted_pipeline = pipeline.fit(df)
df_end = fitted_pipeline.transform(df)

# Keep only the columns needed from here on, and rebind df so the notebook no
# longer references the wide input frame.
transformed = df_end.select('Row_ID', 'features', 'Claim_Amount', 'label')
df = []

## Balancing the Dataset

In [None]:
# Partition the prepared rows by the binary label: customers who claimed
# (label > 0) and customers who did not (label == 0).
claim_true = transformed.filter(transformed["label"] > 0.0)
claim_false = transformed.filter(transformed["label"] == 0.0)

# Share of the claimers' rows that goes into the training set.
relative_amount_toTake = 0.8

# Fraction of the non-claimers to sample so that the training set holds about
# as many non-claimers as claimers:
#   (number of claimers * training share) / number of non-claimers
portion_Tobalance = claim_true.count() * relative_amount_toTake / float(claim_false.count())

# The two label-specific frames are all that is needed from here on.
transformed = []

# Split each group with a fixed seed so the split is repeatable. For the
# non-claimers, everything not sampled for training becomes test data.
noclaim_weights = [portion_Tobalance, 1.0 - portion_Tobalance]
claim_weights = [relative_amount_toTake, 1.0 - relative_amount_toTake]
train_NoClaim, test_NoClaim = claim_false.randomSplit(noclaim_weights, seed=20)
train_Claim, test_Claim = claim_true.randomSplit(claim_weights, seed=20)

# Combine both customer groups into the final training and test sets.
trainData = train_NoClaim.unionAll(train_Claim)
testData = test_NoClaim.unionAll(test_Claim)

## Training a Linear Regression Model Qs  2.2

In [None]:
#Model Training with Generalised Linear Models (Gaussian distribution)
from pyspark.ml.regression import LinearRegression

# Linear regression with a small regularisation strength. The feature and
# label column names are spelled out here; they are the estimator's defaults.
# NOTE(review): "label" is the 0/1 claim indicator produced by the Binarizer,
# not Claim_Amount - confirm this is the intended regression target.
glr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.001,
)

# Fit on the balanced training set.
model = glr.fit(trainData)


## Validate Linear Regression Model

In [None]:
#model.transform(trainData).show()

# Select prediction and true label so as to compute training and test error
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator per metric, shared by every split below. Both compare the
# model's "prediction" column against the "label" column.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
evaluator2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mse")

# --- Training split -------------------------------------------------------
pred_train = model.transform(trainData)

mae = evaluator.evaluate(pred_train)
print("\n\n Mean Absolute Error (MAE) on train data = %g\n\n" % mae)

mse = evaluator2.evaluate(pred_train)
print("\n\n Mean Sqaured Error (MSE) on train data = %g\n\n" % mse)

# --- Held-out customers who claimed ----------------------------------------
pred_test_claim = model.transform(test_Claim)

mae = evaluator.evaluate(pred_test_claim)
print("\n\nMean Absolute Error (MAE) on test data costumers who claimed = %g\n\n" % mae)

mse = evaluator2.evaluate(pred_test_claim)
print("\n\nMean Sqaured Error (MSE) on test data costumers who claimed = %g\n\n" % mse)

print("Testing with costumer that actually claimed for money\n")
# Uncomment to inspect individual predictions:
#pred_test_claim.orderBy("label",ascending=False).show(1000)
pred_test_claim.describe().show()

# --- Held-out customers who did not claim ----------------------------------
pred_test_Noclaim = model.transform(test_NoClaim)

mae = evaluator.evaluate(pred_test_Noclaim)
print("\n\nMean Absolute Error (MAE) on test data costumers who did not claim = %g\n\n" % mae)

# The MSE must come from the MSE evaluator; evaluating with the MAE evaluator
# here would print the MAE a second time under the MSE heading.
mse = evaluator2.evaluate(pred_test_Noclaim)
print("\n\nMean Sqaured Error (MSE) on test data costumers who did not claim = %g\n\n" % mse)

# Heading matches the split whose statistics follow (the non-claimers).
print("Testing with costumer that did not claim for money\n")
# Uncomment to inspect individual predictions:
#pred_test_Noclaim.orderBy("label",ascending=False).show(1000)
pred_test_Noclaim.describe().show()


## Training a Logistic Regression Model: binary classification Qs 2.3

In [None]:
###################################################################################
from pyspark.ml.classification import LogisticRegression

# Binary claim / no-claim classifier: a regularised logistic regression
# wrapped in a single-stage pipeline and fitted on the balanced training set.
lr = LogisticRegression(maxIter=100, regParam=0.9)
pipeline_lr = Pipeline(stages=[lr])
model_bern = pipeline_lr.fit(trainData)

# Show the first 100 scored rows of each held-out group, claimers first.
for heading, held_out in (
    ('Logistic those who claimed:\n', test_Claim),
    ('Logistic for those who did not claim:\n', test_NoClaim),
):
    print(heading)
    model_bern.transform(held_out).show(100)
###################################################################################

## Training a Gamma Regression Model

In [None]:
# Data for the claim-size model: only customers who claimed, with the actual
# Claim_Amount exposed under the name "label" so it becomes the regression
# target.
trainData_gamma = (
    train_Claim
    .select("Row_ID", "features", "Claim_Amount")
    .withColumnRenamed("Claim_Amount", "label")
)
#trainData_gamma.show(100)

# Same projection for the held-out claimers.
testData_gamma = (
    test_Claim
    .select("Row_ID", "features", "Claim_Amount")
    .withColumnRenamed("Claim_Amount", "label")
)
#testData_gamma.show(100)

from pyspark.ml.regression import GeneralizedLinearRegression
# Generalised linear model with a gamma family and an identity link.
# Note that this rebinds both `glr` and `model`, replacing the linear
# regression objects created earlier in the notebook.
glr = GeneralizedLinearRegression(
    family="gamma",
    link="identity",
    maxIter=100,
    regParam=0.001,
)

# Fit on the claimers only, with Claim_Amount as the target.
model = glr.fit(trainData_gamma)

## Validation Error

In [None]:
# Select prediction and true label so as to compute training and test error
from pyspark.ml.evaluation import RegressionEvaluator

def _score_tandem(split):
    """Score one data split with the classifier followed by the gamma model.

    The classifier's prediction is kept as "pred_bern" and the gamma model's
    as "pred_gamma"; their product is the combined "prediction".

    Returns a 3-tuple: the frame after the classifier, the frame after both
    models, and the final frame holding Row_ID, Claim_Amount, both individual
    predictions and the combined prediction.
    """
    with_bern = model_bern.transform(split).withColumnRenamed("prediction", "pred_bern")
    with_gamma = model.transform(with_bern).withColumnRenamed("prediction", "pred_gamma")
    combined = with_gamma.select(
        "Row_ID",
        "Claim_Amount",
        "pred_bern",
        "pred_gamma",
        (with_gamma["pred_bern"] * with_gamma["pred_gamma"]).alias("prediction"),
    )
    return with_bern, with_gamma, combined


# MAE of the combined prediction against the actual claim amount.
evaluator = RegressionEvaluator(labelCol="Claim_Amount", predictionCol="prediction", metricName="mae")

# --- Held-out customers who claimed ----------------------------------------
pred_bern_Claim, pred_bern_gamma_Claim, final_pred_Claim = _score_tandem(test_Claim)

mae = evaluator.evaluate(final_pred_Claim)
print("\n\nThe MAE for Bernoulli+Gamma Model on test data costumer who claimed = %g\n\n" % mae)
final_pred_Claim.orderBy("Claim_Amount",ascending=False).show(1000)

# Summary statistics of the results for those who claimed.
final_pred_Claim.describe().show()

# --- Held-out customers who did not claim ----------------------------------
pred_bern_NoClaim, pred_bern_gamma_NoClaim, final_pred_NoClaim = _score_tandem(test_NoClaim)

mae = evaluator.evaluate(final_pred_NoClaim)
print("\n\nThe MAE for Bernoulli+Gamma Model on test data costumer who did not claim = %g\n\n" % mae)
final_pred_NoClaim.orderBy("Claim_Amount",ascending=False).show(1000)

# Summary statistics of the results for those who did not claim.
final_pred_NoClaim.describe().show()