In [1]:
# Standard library
import os

# Third-party
import findspark
import pandas as pd

# findspark.init() must run before the first pyspark import: it makes the
# local Spark 2.2.0 installation importable as a regular Python package.
findspark.init('/home/ubuntu/spark-2.2.0-bin-hadoop2.7')
from pyspark.sql import SparkSession

# Directory that holds the raw input CSV files.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
data_folder = '/home/ubuntu/data/raw'

In [2]:
# Create the Spark session for this notebook (or reuse one that is already
# running); the application is registered under the name 'lrProj'.
spark = (
    SparkSession.builder
    .appName('lrProj')
    .getOrCreate()
)

# 1. Import Data

In [3]:
# File name is appended to data_folder by plain concatenation, hence the
# leading slash.
file = '/cruise_ship_info.csv'
csv_path = data_folder + file

# Spark DataFrame: first line is the header, column types are inferred.
data = spark.read.csv(csv_path, header=True, inferSchema=True)

# pandas copy of the same file for quick local inspection.
df = pd.read_csv(csv_path)

In [6]:
# Show the inferred column names/types, then the total number of records.
data.printSchema()
row_count = data.count()
print('# of Rows: ', row_count)

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)

# of Rows:  158


In [7]:
# Print the first five records as plain tuples (the slice is wider than the
# nine columns, so every field is shown).
for record in data.head(5):
    print(record[:10])

('Journey', 'Azamara', 6, 30.276999999999997, 6.94, 5.94, 3.55, 42.64, 3.55)
('Quest', 'Azamara', 6, 30.276999999999997, 6.94, 5.94, 3.55, 42.64, 3.55)
('Celebration', 'Carnival', 26, 47.262, 14.86, 7.22, 7.43, 31.8, 6.7)
('Conquest', 'Carnival', 11, 110.0, 29.74, 9.53, 14.88, 36.99, 19.1)
('Destiny', 'Carnival', 17, 101.353, 26.42, 8.92, 13.21, 38.36, 10.0)


In [8]:
# Peek at the last five rows of the pandas copy of the dataset.
df.tail(n=5)

Unnamed: 0,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
153,Taurus,Star,22,3.341,0.66,2.79,0.33,50.62,0.59
154,Virgo,Star,14,76.8,19.6,8.79,9.67,39.18,12.0
155,Spirit,Windstar,25,5.35,1.58,4.4,0.74,33.86,0.88
156,Star,Windstar,27,5.35,1.67,4.4,0.74,32.04,0.88
157,Surf,Windstar,23,14.745,3.08,6.17,1.56,47.87,1.8


# 2. Data Transformation

In [9]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [10]:
# Step 1: map every distinct Cruise_line string to a numeric index. The
# mapping is learned from the full dataset and then applied to it.
stringIndexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="categoryIndex",
)
model = stringIndexer.fit(data)
indexed = model.transform(data)

# Step 2: turn the numeric index into a sparse one-hot vector.
# NOTE: calling transform() directly on OneHotEncoder matches the Spark 2.2
# API this notebook is pinned to; on Spark 3.x the encoder must be fit first.
encoder = OneHotEncoder(
    inputCol="categoryIndex",
    outputCol="categoryVec",
)
encoded = encoder.transform(indexed)

# Sanity check: display the first two encoded rows.
encoded.show(n=2)

+---------+-----------+---+------------------+----------+------+------+-----------------+----+-------------+---------------+
|Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|categoryIndex|    categoryVec|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+-------------+---------------+
|  Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|         16.0|(19,[16],[1.0])|
|    Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|         16.0|(19,[16],[1.0])|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+-------------+---------------+
only showing top 2 rows



In [11]:
# First encoded record as a plain tuple; the slice covers every column,
# including the index and the one-hot vector.
first_row = encoded.head()
first_row[:12]

('Journey',
 'Azamara',
 6,
 30.276999999999997,
 6.94,
 5.94,
 3.55,
 42.64,
 3.55,
 16.0,
 SparseVector(19, {16: 1.0}))

In [12]:
# Column names after indexing and one-hot encoding.
# For summary statistics call `encoded.describe().show()`: a bare
# `describe()` that is not the last expression of the cell displays nothing
# and only costs a Spark job.
encoded.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'categoryIndex',
 'categoryVec']

## Prep Spark Format

In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Predictor columns: the numeric ship attributes plus the one-hot encoded
# cruise line. 'crew' is the regression label and is therefore left out.
feature_cols = [
    'Age', 'Tonnage', 'passengers', 'length',
    'cabins', 'passenger_density',
    'categoryVec',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Append the assembled 'features' vector column and check the schema.
output = assembler.transform(encoded)
output.printSchema()

# Keep only what the regression needs: feature vector and label.
final_data = output.select('features', 'crew')


root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- categoryIndex: double (nullable = true)
 |-- categoryVec: vector (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
# 70/30 train/test split. The seed is fixed so that the split - and with it
# the fitted model and the reported metrics - is the same on every run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

# Summary of the training split; only the numeric 'crew' label is reported.
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               112|
|   mean| 7.827678571428577|
| stddev|3.0827395041923102|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+



In [15]:
from pyspark.ml.regression import LinearRegression
# Linear regression estimator: predicts 'crew' from the assembled 'features'
# vector; every other hyperparameter is left at its library default.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [16]:
# Fit the regression on the training split only; the test split stays unseen.
lr_model = lr.fit(dataset=train_data)

In [17]:
# Score the fitted model on the held-out split and print, one per line,
# the root mean squared error followed by R^2.
test_results = lr_model.evaluate(test_data)
rmse = test_results.rootMeanSquaredError
r2 = test_results.r2
print(rmse, r2, sep='\n')

1.3500139995125406
0.9038335144713381


In [18]:
# Show the training-split label summary again (same call as right after the
# split) - presumably to read the RMSE against the spread of 'crew';
# TODO confirm intent.
crew_summary = train_data.describe()
crew_summary.show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               112|
|   mean| 7.827678571428577|
| stddev|3.0827395041923102|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+



In [19]:
## Test Correlation
# Are there columns that are highly correlated?

In [20]:
from pyspark.sql.functions import corr
# Pearson correlation between the label 'crew' and two candidate predictors,
# each displayed as its own one-cell table.
for feature in ('passengers', 'cabins'):
    data.select(corr('crew', feature)).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

