In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("lrProject")
    .getOrCreate()
)

In [3]:
# Load the cruise-ship CSV: the first row supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    "cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Inspect the column names and the types inferred while reading the CSV.
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
# Peek at the first record. `first()` returns the same Row as `head(1)[0]`
# but yields None instead of raising IndexError if the DataFrame is empty.
data.first()

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)

In [6]:
from pyspark.ml.feature import StringIndexer

In [7]:
# Indexer that maps each Cruise_line string to a numeric index stored in a
# new Indexed_Cruise_line column.
si = StringIndexer(
    outputCol="Indexed_Cruise_line",
    inputCol="Cruise_line",
)

In [8]:
# Fit the indexer on the full dataset so it learns the Cruise_line labels.
si_model = si.fit(data)

In [9]:
# Apply the fitted indexer; the result carries the extra Indexed_Cruise_line column.
data_with_indexed_cruise_line = si_model.transform(data)

In [10]:
# Compare the original label with its numeric index, side by side.
(
    data_with_indexed_cruise_line
    .select("Cruise_line", "Indexed_Cruise_line")
    .show()
)

+-----------+-------------------+
|Cruise_line|Indexed_Cruise_line|
+-----------+-------------------+
|    Azamara|               16.0|
|    Azamara|               16.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
|   Carnival|                1.0|
+-----------+-------------------+
only showing top 20 rows



In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# List the available columns before choosing which ones to use as features.
data_with_indexed_cruise_line.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Indexed_Cruise_line']

## Feature Selection: 
#### This is where we select features, somewhat arbitrarily. I found that including Age and Tonnage gave a slightly higher RMSE and a lower R² on the test dataset, so I removed these features from consideration. The correct way to select features, however, would be to use a held-out test dataset and an additional validation dataset. We would then choose the features that perform best on the validation dataset and finally evaluate the model's performance on the test dataset. This way we prevent overfitting to the test dataset. 

Note: The particular results may also vary slightly from run to run unless a seed is fixed, since the train/test split randomly assigns roughly 70% of the data to training and 30% to testing/evaluation. 

In [13]:
# Pack the chosen predictor columns into a single vector column named
# "features". 'Age' and 'Tonnage' are intentionally left out of the list.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        'Indexed_Cruise_line',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
    ],
)

In [14]:
# Run the assembler to append the "features" vector column to the indexed data.
output = assembler.transform(data_with_indexed_cruise_line)

In [15]:
# The schema now includes the additional "features" column, which is the
# output column of the VectorAssembler.
output.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- Indexed_Cruise_line: double (nullable = false)
 |-- features: vector (nullable = true)



In [16]:
# Shows that the "features" column holds the requisite DenseVector
# representation.
output.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Indexed_Cruise_line=16.0, features=DenseVector([16.0, 6.94, 5.94, 3.55, 42.64]))]

In [17]:
# Keep only what the regression needs: the feature vector and the "crew" label.
final_data = output.select("features", "crew")

In [18]:
# Preview the modelling table (feature vector alongside the crew label).
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.94,5.94,3...|3.55|
|[16.0,6.94,5.94,3...|3.55|
|[1.0,14.86,7.22,7...| 6.7|
|[1.0,29.74,9.53,1...|19.1|
|[1.0,26.42,8.92,1...|10.0|
|[1.0,20.52,8.55,1...| 9.2|
|[1.0,20.52,8.55,1...| 9.2|
|[1.0,20.56,8.55,1...| 9.2|
|[1.0,20.52,8.55,1...| 9.2|
|[1.0,37.0,9.51,14...|11.5|
|[1.0,29.74,9.51,1...|11.6|
|[1.0,14.52,7.27,7...| 6.6|
|[1.0,20.52,8.55,1...| 9.2|
|[1.0,20.52,8.55,1...| 9.2|
|[1.0,21.24,9.63,1...| 9.3|
|[1.0,29.74,9.51,1...|11.6|
|[1.0,21.24,9.63,1...|10.3|
|[1.0,20.52,8.55,1...| 9.2|
|[1.0,21.24,9.63,1...| 9.3|
|[1.0,20.52,8.55,1...| 9.2|
+--------------------+----+
only showing top 20 rows



In [19]:
# Randomly split the rows into roughly 70% training and 30% test data.
# The seed is fixed so the split, and therefore the metrics reported further
# down, can be reproduced; change or drop the seed to see how much the
# results vary from one random split to another.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Summary statistics for the training split.
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               110|
|   mean| 7.945090909090913|
| stddev|3.5355120427603914|
|    min|               0.6|
|    max|              21.0|
+-------+------------------+



In [21]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                48|
|   mean| 7.448333333333333|
| stddev|3.4405407376422517|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+



In [22]:
from pyspark.ml.regression import LinearRegression

In [23]:
# Linear regression that predicts "crew" from the assembled "features" vector
# ("features" is the estimator's default features column, spelled out here).
lr = LinearRegression(
    featuresCol="features",
    labelCol="crew",
)

In [24]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [25]:
# Evaluate the fitted model on the test split; the returned summary provides
# the residuals, RMSE and R² inspected in the cells below.
test_results = lr_model.evaluate(test_data)

In [26]:
# Show the per-row residuals on the test split.
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -0.7427130418986199|
| -1.6483959229232834|
| -1.0710804329679195|
| -0.9810804329679197|
|-0.22448552165879399|
| 0.39677221572180876|
|  0.8410991863472539|
|-0.20104478953028249|
| -1.0856830249242044|
| -0.5122928589069335|
|  0.5066047519067176|
|  0.5066047519067176|
|-0.15621285040799648|
|  0.0939124698965923|
|-0.00815665509676...|
|  0.9876009509399672|
|   1.140909152073883|
| 0.14190709605654117|
|  -1.260418835227048|
| -0.7177465039877777|
+--------------------+
only showing top 20 rows



In [27]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

0.929765001249304

In [28]:
# R² (coefficient of determination) on the test split.
test_results.r2

0.925417599244097