# CONSULTING PROJECT

## Nhập dữ liệu

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import findspark
findspark.init()

In [2]:
sc= SparkContext(appName= 'Chapter7-Excersice1', master= 'local')
ss= SparkSession(sc)

In [3]:
path= '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter7/cruise_ship_info.csv'
df= ss.read.csv(path, inferSchema= True, header= True)

## Kiểm tra các đặc tính của dữ liệu

In [4]:
df.show(3)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 3 rows



In [5]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [6]:
df.count()

158

In [7]:
from pyspark.sql.functions import count, when, isnan, isnull, col
nan_data= df.select([count(when(isnan(c), c)).alias(c + '_nan') for c in df.columns]).toPandas().T
nan_data

Unnamed: 0,0
Ship_name_nan,0
Cruise_line_nan,0
Age_nan,0
Tonnage_nan,0
passengers_nan,0
length_nan,0
cabins_nan,0
passenger_density_nan,0
crew_nan,0


In [8]:
null_data= df.select([count(when(isnull(c), c)).alias(c + '_null') for c in df.columns]).toPandas().T
null_data

Unnamed: 0,0
Ship_name_null,0
Cruise_line_null,0
Age_null,0
Tonnage_null,0
passengers_null,0
length_null,0
cabins_null,0
passenger_density_null,0
crew_null,0


In [33]:
from pyspark.sql.functions import col
df.groupBy(df.columns).count().where(col('count') > 1).count()

0

In [20]:
df.describe().toPandas()

Unnamed: 0,summary,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,count,158,158,158.0,158.0,158.0,158.0,158.0,158.0,158.0
1,mean,Infinity,,15.689873417721518,71.28467088607599,18.45740506329114,8.130632911392404,8.830000000000005,39.90094936708861,7.794177215189873
2,stddev,,,7.615691058751413,37.229540025907866,9.677094775143416,1.793473548054825,4.4714172221480615,8.63921711391542,3.503486564627034
3,min,Adventure,Azamara,4.0,2.329,0.66,2.79,0.33,17.7,0.59
4,max,Zuiderdam,Windstar,48.0,220.0,54.0,11.82,27.0,71.43,21.0


In [28]:
df.select("Ship_name").distinct().count()

138

In [29]:
df.select("Cruise_line").distinct().count()

20

Nhận xét:
- Dữ liệu có 158 dòng và 9 cột.
- Dữ liệu không có giá trị trống.
- Cột Ship_name là tên riêng của mỗi tàu vì vậy không hữu dụng.
- Cruise_line là biến số danh định cần mã hóa onehot encoding

## Tách dữ liệu train, test

In [34]:
train, test= df.randomSplit([0.8, 0.2])

## Làm sạch và chuẩn hóa dữ liệu. Sử dụng pipeline để dễ dàng tái sử dụng

In [104]:
# Lọc ra các cột cần sử dụng
choose_columns= train.columns
choose_columns.remove('Ship_name')
from pyspark.ml.feature import SQLTransformer
fillter= SQLTransformer(statement= 'SELECT {choose_columns} FROM __THIS__'.format(choose_columns= ','.join(choose_columns)))

In [105]:
# mã hóa biên số Cruise_line
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer
Cruise_line_indexer= StringIndexer(inputCol= 'Cruise_line', outputCol= 'Cruise_line_idx')
oh_encoder= OneHotEncoderEstimator(inputCols= ['Cruise_line_idx'], outputCols= ['Cruise_line_dummy'], dropLast= True)

In [106]:
# Tạo vector
from pyspark.ml.feature import VectorAssembler
choose_columns_to_vec= ['Cruise_line_dummy', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']
vec_assembler= VectorAssembler(inputCols= choose_columns_to_vec, outputCol= 'features')

In [107]:
# Scale dữ liệu
from pyspark.ml.feature import MinMaxScaler
mm_scaler= MinMaxScaler(inputCol= 'features', outputCol= 'features_scale')

In [108]:
# Tạo pipeline
from pyspark.ml import Pipeline
pipe_preprocess= Pipeline(stages= [fillter, Cruise_line_indexer, oh_encoder, vec_assembler, mm_scaler])

In [109]:
preprocess= pipe_preprocess.fit(train)
train= preprocess.transform(train)

In [111]:
train.select('features_scale', 'crew').show(3)

+--------------------+-----+
|      features_scale| crew|
+--------------------+-----+
|[1.0,0.0,0.0,0.0,...|11.85|
|[0.0,0.0,0.0,0.0,...|  4.0|
|[0.0,0.0,0.0,0.0,...|  6.0|
+--------------------+-----+
only showing top 3 rows



## Tạo mô hình Linear regression

In [116]:
from pyspark.ml.regression import LinearRegression
lir= LinearRegression(featuresCol= 'features_scale', predictionCol='crew_prediction', labelCol= 'crew')
lir_model= lir.fit(train)

In [117]:
lir_model.coefficients

DenseVector([-0.9501, 0.4715, 0.2655, 0.5494, -0.3023, -0.2167, 0.7826, 0.0589, 0.1706, 0.6814, 1.6679, 0.7513, 0.2264, 0.2703, 0.8935, 0.5324, 0.227, 0.361, 0.2981, 0.1864, 2.2307, -4.4071, 4.0792, 19.6637, 0.7036])

In [119]:
lir_model.intercept

-0.6169945871814025

## Đánh giá mô hình với tập train

In [125]:
train_result= lir_model.transform(train)
train_result.select('crew', 'crew_prediction').show(10)

+-----+------------------+
| crew|   crew_prediction|
+-----+------------------+
|11.85|12.270421748895288|
|  4.0|3.4209024935361643|
|  6.0| 6.071971866540759|
| 8.69| 9.158154493139959|
| 0.59|1.5686216716209263|
|  7.0| 7.061311908089699|
|  5.2| 5.254447058136965|
|  9.2| 9.268646582500496|
|  8.5|   8.5725793904166|
| 8.48| 8.172594926819938|
+-----+------------------+
only showing top 10 rows



In [122]:
train_evaluater= lir_model.evaluate(dataset= train)

In [123]:
train_evaluater.meanAbsoluteError

0.4652837205483678

In [124]:
train_evaluater.meanSquaredError

0.7364468311180296

In [127]:
train_evaluater.rootMeanSquaredError

0.8581648041711042

In [126]:
train_evaluater.r2

0.9413961371694509

In [138]:
train_evaluater.r2adj

0.9274428364955106

In [128]:
from pyspark.sql.functions import corr
train_result.select(corr('crew', 'crew_prediction')).show()

+---------------------------+
|corr(crew, crew_prediction)|
+---------------------------+
|         0.9702557071048079|
+---------------------------+



## Đánh giá mô hình trên tập test

In [130]:
# xử lý dữ liệu tập test
test= preprocess.transform(test)

In [131]:
test_result= lir_model.transform(test)
test_result.select('crew', 'crew_prediction').show(10)

+----+------------------+
|crew|   crew_prediction|
+----+------------------+
|12.0|12.848511294923927|
|11.0|10.470968659515403|
| 9.0| 9.304898100986222|
| 9.2| 9.096789187462553|
| 9.2| 9.069189599917394|
| 4.7| 4.637197077759823|
|11.0|11.211133264094851|
| 7.6| 7.754966209037292|
| 9.2| 9.064953531342027|
| 9.2| 9.060717462766657|
+----+------------------+
only showing top 10 rows



In [132]:
test_evaluater= lir_model.evaluate(dataset= test)

In [133]:
test_evaluater.meanAbsoluteError

0.3544426520555372

In [134]:
test_evaluater.meanSquaredError

0.21657908382816704

In [135]:
test_evaluater.rootMeanSquaredError

0.46538057955631007

In [136]:
test_evaluater.r2

0.9776776006376462

In [137]:
test_evaluater.r2adj

0.41961761657880103

In [139]:
test_result.select(corr('crew', 'crew_prediction')).show()

+---------------------------+
|corr(crew, crew_prediction)|
+---------------------------+
|         0.9888581178858645|
+---------------------------+

