# Testing On Linear Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [None]:
# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("linear_reg")
    .getOrCreate()
)

### Check which features affect the crew count in the regression

In [None]:
# Load the cruise ship CSV; the first row holds the column names and
# column types are inferred from the data.
cruise_final_data = spark.read.csv(
    path='cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)
# Print the inferred schema to check the column names and types.
cruise_final_data.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [None]:
# Encode the categorical "Cruise_line" column as a numeric "cruise_index".
# handleInvalid="keep" gives an extra index to any cruise line that was not
# present when the indexer was fitted, instead of raising (the default is
# "error"). With a random train/test split, a rare cruise line can end up
# only in the test data, which would otherwise break the transform.
cruise_indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="cruise_index",
    handleInvalid="keep",
)
# Preview the first two rows with the new index column.
cruise_indexer.fit(cruise_final_data).transform(cruise_final_data).head(2)

In [None]:
# Combine the indexed cruise line and the numeric ship attributes into a
# single "features" vector column.
assembler = VectorAssembler(
    inputCols=[
        "cruise_index",
        "Age",
        "Tonnage",
        "passengers",
        "length",
        "cabins",
        "passenger_density",
    ],
    outputCol="features",
)

### Split into 70% training data and 30% test data

In [None]:
# Randomly split the raw data roughly 70/30 into training and test sets.
# A fixed seed keeps the split (and every metric derived from it) the same
# from run to run.
train_cruise_data, test_cruise_data = cruise_final_data.randomSplit(
    [0.7, 0.3],
    seed=42,
)

In [None]:
# Linear regression estimator predicting the "crew" column.
# featuresCol is spelled out here; "features" is also the library default.
regressionModel = LinearRegression(
    featuresCol="features",
    labelCol="crew",
)

### Put Into Pipeline Failed (only applicable to Logistic Regression)

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Chain the stages in order: index the cruise line, assemble the feature
# vector, then fit the regression.
pipeline = Pipeline(
    stages=[
        cruise_indexer,
        assembler,
        regressionModel,
    ]
)

In [None]:
# Fit every pipeline stage on the training split; despite the name, the
# result is the fitted pipeline model, not a dataset.
trained_data_set = pipeline.fit(dataset=train_cruise_data)

In [None]:
# Run the fitted pipeline over the held-out test split.
result = trained_data_set.transform(dataset=test_cruise_data)

In [None]:
# Manual alternative to the pipeline: index the cruise line on the full
# dataset, assemble the feature vector, and keep only the columns the
# regression needs.
assembler_cruise = (
    assembler
    .transform(cruise_indexer.fit(cruise_final_data).transform(cruise_final_data))
    .select("features", "crew")
)
# Preview the first two feature/label rows.
assembler_cruise.show(2)

In [None]:
# Randomly split the assembled data roughly 70/30 into training and test sets.
# A fixed seed keeps the split, and therefore the reported R² / RMSE,
# the same from run to run.
final_a_cruise_data, test_a_cruise_data = assembler_cruise.randomSplit(
    [0.7, 0.3],
    seed=42,
)

In [None]:
# Fit the linear regression on the training portion of the assembled data.
lr = regressionModel.fit(dataset=final_a_cruise_data)

In [None]:
# Evaluate the fitted model on the held-out portion; the result exposes
# the metrics inspected in the following cells.
prediction = lr.evaluate(dataset=test_a_cruise_data)

In [None]:
# R² of the model on the held-out data passed to lr.evaluate().
prediction.r2

In [None]:
# R² from the model's own fit summary, for comparison with the held-out R².
# NOTE(review): `summary` should describe the training data — confirm
# against the PySpark docs.
lr.summary.r2

In [None]:
# Root mean squared error on the held-out data, in the same units as "crew".
prediction.rootMeanSquaredError

In [None]:
# Summary statistics of the held-out data, to put the RMSE in context
# against the spread of "crew".
test_a_cruise_data.describe().show()

In [None]:
# Score the held-out rows with the fitted model, then show the first five
# predictions beside the actual crew values and their feature vectors.
lr_df = lr.transform(dataset=test_a_cruise_data)
lr_df.select(
    "prediction",
    "crew",
    "features",
).show(n=5)

In [None]:
# Show residuals (label minus prediction) from the model's fit summary.
# NOTE(review): these appear to be training-data residuals; for the held-out
# data, `prediction.residuals` may be what is wanted — confirm.
lr.summary.residuals.show()
