## Imports

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new
# session registered under the application name 'ship'.
spark = (
    SparkSession.builder
    .appName('ship')
    .getOrCreate()
)

In [3]:
# Load the cruise-ship CSV. The first row supplies the column names and
# Spark infers each column's type from the data instead of reading
# everything as strings.
data = spark.read.csv(
    '/FileStore/tables/cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# List the column names of the loaded DataFrame (displayed as the cell output).
data.columns

## Using StringIndexer to encode the string column as numeric indices

In [5]:
from pyspark.ml.feature import StringIndexer

In [6]:
# Indexer that maps each distinct 'Cruise_line' string to a numeric label
# index, written to a new 'indexed_cruise' column.
si = StringIndexer(
    inputCol='Cruise_line',
    outputCol='indexed_cruise',
)
# Fail loudly on labels that were not seen during fitting, rather than
# skipping or bucketing those rows.
si.setHandleInvalid('error')

In [7]:
# Fit the indexer on the full dataset to learn the Cruise_line -> index mapping.
model = si.fit(data)

In [8]:
# Apply the fitted indexer; the result carries the extra 'indexed_cruise' column.
new_data = model.transform(data)

In [9]:
# Peek at the first row to confirm the indexed column was added.
new_data.head(1)

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
# List the available columns before choosing which ones become model features.
new_data.columns

## Processing data

In [12]:
# Pack the predictor columns into a single vector column named 'features',
# which is the input format Spark ML estimators consume.
# NOTE(review): 'indexed_cruise' is a category index produced by
# StringIndexer; used directly it is treated as an ordinal number by the
# regression. Consider one-hot encoding it -- confirm with the model owner.
assembler = VectorAssembler(
    inputCols=[
        'indexed_cruise',
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
    ],
    outputCol='features',
)

In [13]:
# Append the assembled 'features' vector column to the indexed data.
assembled_data = assembler.transform(new_data)

In [14]:
# Peek at the first row to check the assembled feature vector.
assembled_data.head(1)

In [15]:
# Keep only what the model needs: the feature vector and the 'crew' column,
# which is used as the regression label below.
final_data = assembled_data.select('features','crew')

In [16]:
# Hold out roughly 30% of the rows for testing and train on the rest.
# A fixed seed makes the split repeatable, so the evaluation metrics
# computed later are comparable from one run of the notebook to the next.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# Print summary statistics for the training split as a quick sanity check.
train_data.describe().show()

## Training and evaluating the model

In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
# Linear regression estimator predicting 'crew'; the features column is left
# at the estimator's default.
lr = LinearRegression(labelCol='crew')

In [20]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [21]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the residuals and the error metrics inspected in the next cells.
test_result = lr_model.evaluate(test_data)

In [22]:
# Show the per-row residuals on the test split.
test_result.residuals.show()

In [23]:
# Mean squared error on the test split.
test_result.meanSquaredError

In [24]:
# Root mean squared error on the test split (same units as 'crew').
test_result.rootMeanSquaredError

In [25]:
# R-squared on the test split.
test_result.r2

In [26]:
# Display the labeled test rows (features and actual 'crew' values).
test_data.show()

## Creating unlabeled data to test the model

In [27]:
# Drop the label so the model sees only the feature vectors, as it would
# for new, unlabeled ships.
unlabeled_data = test_data.select('features')

In [28]:
# Display the feature-only rows.
unlabeled_data.show()

In [29]:
# Run the fitted model over the unlabeled rows to produce its predictions.
result = lr_model.transform(unlabeled_data)

In [30]:
# Display the features alongside the model's output.
result.show()

In [31]:
# Display the labeled test rows again so the actual 'crew' values can be
# compared by eye with the predictions shown above.
test_data.show()

### Printing predicted vs. actual values for comparison