In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler


In [0]:
# Create the Spark session for this notebook, or reuse one that already exists
# (getOrCreate returns the active session when there is one).
spark = (
    SparkSession.builder
    .appName("PySpark Example")
    .getOrCreate()
)


In [0]:
# Column names, listed in the same order as the fields of each sample tuple
columns = ["Name", "Gender", "Age", "Salary"]

# Five hand-written sample records: (Name, Gender, Age, Salary)
data = [
    ("Alice", "Female", 29, 3000),
    ("Bob", "Male", 34, 4000),
    ("Charlie", "Female", 25, 3500),
    ("David", "Male", 40, 4500),
    ("Eve", "Female", 30, 3200),
]

# Build the DataFrame; only column names are supplied, so Spark infers
# the column types from the Python values in `data`.
df = spark.createDataFrame(data, schema=columns)

# Display the untouched input so later steps can be compared against it
print("Original Data:")
df.show()



In [0]:
# Step 1: Map each distinct "Gender" string to a numeric index ("GenderIndex").
# StringIndexer is an estimator: fit() learns the label -> index mapping from df,
# and the resulting model's transform() appends the new column.
# By default labels are ordered by descending frequency, so the most common
# label receives index 0.0.
indexer = StringIndexer(inputCol="Gender", outputCol="GenderIndex")
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)

print("After StringIndexer (Gender -> GenderIndex):")
df_indexed.show()

In [0]:
# Step 2: Turn the numeric "GenderIndex" into a one-hot vector column ("GenderVec").
# fit() determines the number of categories from df_indexed; transform() appends
# the encoded column. With the encoder's default dropLast=True the last category
# is represented by an all-zeros vector, so two categories yield a length-1 vector.
encoder = OneHotEncoder(inputCol="GenderIndex", outputCol="GenderVec")
encoder_model = encoder.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed)

print("After OneHotEncoder (GenderIndex -> GenderVec):")
df_encoded.show()

In [0]:
# Step 3: Concatenate Age, Salary and the one-hot GenderVec into a single
# vector column named "features" (VectorAssembler is a pure transformer, so
# there is no fit step).
assembler = VectorAssembler(inputCols=["Age", "Salary", "GenderVec"], outputCol="features")
df_final = assembler.transform(df_encoded)

# Show only the name next to its assembled vector; truncate=False keeps the
# full vector text visible instead of cutting it off.
print("After VectorAssembler (Assembled Features):")
name_and_features = df_final.select("Name", "features")
name_and_features.show(truncate=False)

# Shut down the Spark session now that the example is finished.
# NOTE(review): on a shared or managed cluster stopping the session may be
# undesirable — confirm for the target environment.
spark.stop()