In [0]:
from pyspark.sql import SparkSession

# Reuse the notebook's active SparkSession, creating one only when none exists
# (Databricks already exposes the same session as `spark`).
spark = SparkSession.builder.getOrCreate()

# Select the catalog first, then the schema inside it (adjust if needed).
for use_statement in ("USE CATALOG dp_ml_raw", "USE dp_ml_raw.dp_ml_titanic"):
    spark.sql(use_statement)

# Create the raw Delta tables directly from the CSV files in the Volume.
# Both tables are built by the same CTAS statement, so the SQL lives in one
# helper instead of two copy-pasted strings.
def create_raw_table(table_name, csv_path):
    """Create a Delta table from a CSV file unless the table already exists.

    Because the statement uses IF NOT EXISTS, an existing table is left
    untouched: re-running this cell does not reload the CSV.

    Args:
        table_name: Fully qualified name of the table to create.
        csv_path: Location of the CSV file handed to `read_files`.
    """
    spark.sql(f"""
CREATE table IF NOT EXISTS {table_name}
USING DELTA
CLUSTER BY (PassengerId)
SELECT *
FROM read_files(
  '{csv_path}',
  format => 'csv',
  header => true,
  inferSchema => true
)
""")


# Target table -> source CSV file.
raw_table_sources = {
    "dp_ml_raw.dp_ml_titanic.dp_ml_titanic_train_raw":
        "dbfs:/Volumes/dp_ml_raw/dp_ml_titanic/titanic_raw_data/train.csv",
    "dp_ml_raw.dp_ml_titanic.dp_ml_titanic_test_raw":
        "dbfs:/Volumes/dp_ml_raw/dp_ml_titanic/titanic_raw_data/test.csv",
}

for raw_table_name, raw_csv_path in raw_table_sources.items():
    create_raw_table(raw_table_name, raw_csv_path)

# Load both raw tables as DataFrames for the cells that follow.
df_train = spark.table("dp_ml_raw.dp_ml_titanic.dp_ml_titanic_train_raw")
df_test = spark.table("dp_ml_raw.dp_ml_titanic.dp_ml_titanic_test_raw")

# Preview a few rows of the training table, reusing the DataFrame loaded above
# instead of resolving the same table a second time.
display(df_train.limit(10))


In [0]:
# Data cleanup and one-hot encoding of the "Embarked" column.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Show which columns were loaded before anything is dropped.
print(df_train.columns)
print(df_test.columns)

# Drop the columns that the steps below do not use. DataFrame.drop() ignores
# a name that is absent, so re-running this cell is harmless.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', '_rescued_data', 'Cabin']
df_train = df_train.drop(*columns_to_drop)
df_test = df_test.drop(*columns_to_drop)

# Display-only view with NULLs in numeric columns shown as 0. The result is
# not assigned, so df_train itself is unchanged by this line.
display(df_train.fillna(0))

# Inspect the distinct values of the categorical columns on the training data.
for column_name in ("Survived", "Pclass", "Sex", "Embarked"):
    df_train.select(column_name).distinct().show()

# Step 1: Convert string categories into numeric indices.
# handleInvalid="keep" assigns NULL (and unseen) values their own extra index;
# with the default "error", transform raises as soon as it meets a NULL.
# NOTE(review): presumably Embarked has missing values in the raw CSV - confirm.
indexer = StringIndexer(
    inputCol="Embarked",
    outputCol="Embarked_index",
    handleInvalid="keep",
)

# Step 2: One-hot encode the indices
encoder = OneHotEncoder(inputCols=["Embarked_index"], outputCols=["Embarked_vec"])

# Step 3: Build pipeline
pipeline = Pipeline(stages=[indexer, encoder])

# Fit on the training data and encode it.
model = pipeline.fit(df_train)
df_encoded = model.transform(df_train)

# Show result
df_encoded.select("Embarked", "Embarked_index", "Embarked_vec").show(truncate=False)
