In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

In [3]:
spark=SparkSession.builder.appName("ML_Spark").getOrCreate()

In [4]:
spark.version

'3.5.1'

In [23]:
sdf=spark.read.csv("cruise_ship_info.csv",inferSchema=True,header=True)
sdf.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [6]:
sdf.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [7]:
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='Cruise_line',outputCol='cruise_cat')
indexed = indexer.fit(sdf).transform(sdf)

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler=VectorAssembler(inputCols=['Age','Tonnage','passengers','length','cabins','passenger_density','cruise_cat'],outputCol='features')

In [9]:
output=assembler.transform(indexed)

In [10]:
output.select('features','crew').show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
+--------------------+----+
only showing top 5 rows



In [11]:
final_data=output.select('features','crew')

In [32]:
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [33]:
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               121|
|   mean| 7.821074380165299|
| stddev|3.3695110430775412|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [34]:
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                37|
|   mean| 7.706216216216214|
| stddev|3.9590868415101252|
|    min|              0.59|
|    max|              19.1|
+-------+------------------+



In [35]:
from pyspark.ml.regression import LinearRegression

In [36]:
ship_lr=LinearRegression(featuresCol='features',labelCol='crew')

In [37]:
trained_ship_model=ship_lr.fit(train_data)

In [38]:
pred=trained_ship_model.evaluate(test_data)
pred.predictions.show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[5.0,133.5,39.59,...|13.13|13.031105416646406|
|[5.0,160.0,36.34,...| 13.6|15.049619738300464|
|[6.0,30.276999999...| 3.55| 4.430697241108387|
|[6.0,90.0,20.0,9....|  9.0|10.132724616013254|
|[6.0,110.23899999...| 11.5|10.868022443361259|
|[9.0,88.5,21.24,9...| 10.3| 9.498052893282482|
|[9.0,113.0,26.74,...|12.38|11.276080537412568|
|[10.0,90.09,25.01...| 8.58| 8.820559241653402|
|[11.0,58.6,15.66,...|  7.6|  7.38824888619991|
|[11.0,86.0,21.24,...|  9.3| 9.435229877923602|
|[11.0,91.0,20.32,...| 9.99|  9.26327907192606|
|[11.0,110.0,29.74...| 19.1|11.899427234042555|
|[11.0,138.0,31.14...|11.85|12.929268876147024|
|[12.0,42.0,14.8,7...|  6.8| 6.597575929844923|
|[12.0,58.6,15.66,...|  7.0| 7.379944039754382|
|[12.0,88.5,21.24,...|10.29| 9.413529299101574|
|[12.0,138.0,31.14...|11.85|12.916754550317465|
|[13.0,25.0,3.82,5...| 2.95| 2.918720034

In [42]:
from pyspark.ml.evaluation import RegressionEvaluator

# Define evaluators for different metrics
evaluator_r2 = RegressionEvaluator(labelCol='crew', predictionCol='prediction', metricName='r2')
evaluator_rmse = RegressionEvaluator(labelCol='crew', predictionCol='prediction', metricName='rmse')
evaluator_mae = RegressionEvaluator(labelCol='crew', predictionCol='prediction', metricName='mae')

# Evaluate
r2 = evaluator_r2.evaluate(predictions)
rmse = evaluator_rmse.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)

# Print results
print("📊 Model Evaluation Metrics:")
print(f"✅ R²: {r2:.4f}")
print(f"✅ RMSE: {rmse:.4f}")
print(f"✅ MAE: {mae:.4f}")


📊 Model Evaluation Metrics:
✅ R²: 0.8521
✅ RMSE: 1.4218
✅ MAE: 0.7325


In [43]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import (
    LinearRegression,
    RandomForestRegressor,
    DecisionTreeRegressor,
    GBTRegressor
)
from pyspark.ml.evaluation import RegressionEvaluator

evaluator_r2 = RegressionEvaluator(labelCol='crew', predictionCol='prediction', metricName='r2')
evaluator_rmse = RegressionEvaluator(labelCol='crew', predictionCol='prediction', metricName='rmse')
evaluator_mae = RegressionEvaluator(labelCol='crew', predictionCol='prediction', metricName='mae')

# 5️⃣ Define models to train
models = {
    "Linear Regression": LinearRegression(featuresCol='features', labelCol='crew'),
    "Random Forest": RandomForestRegressor(featuresCol='features', labelCol='crew', numTrees=100),
    "Decision Tree": DecisionTreeRegressor(featuresCol='features', labelCol='crew'),
    "GBT Regressor": GBTRegressor(featuresCol='features', labelCol='crew', maxIter=100)
}

# 6️⃣ Train and evaluate each model
results = []

for name, model in models.items():
    print(f"\n🚀 Training {name}...")
    trained_model = model.fit(train_data)
    predictions = trained_model.transform(test_data)

    r2 = evaluator_r2.evaluate(predictions)
    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)

    results.append((name, r2, rmse, mae))

# 7️⃣ Display results
print("\n📈 Model Performance Comparison:")
print(f"{'Model':<20} {'R²':<10} {'RMSE':<10} {'MAE':<10}")
for name, r2, rmse, mae in results:
    print(f"{name:<20} {r2:<10.4f} {rmse:<10.4f} {mae:<10.4f}")


🚀 Training Linear Regression...

🚀 Training Random Forest...

🚀 Training Decision Tree...

🚀 Training GBT Regressor...

📈 Model Performance Comparison:
Model                R²         RMSE       MAE       
Linear Regression    0.8795     1.3554     0.7137    
Random Forest        0.8899     1.2957     0.4643    
Decision Tree        0.8890     1.3013     0.4955    
GBT Regressor        0.8872     1.3116     0.4674    
