# In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that drives the rest of this script.
spark = SparkSession.builder.appName("Predict Player Points").getOrCreate()

# Load data: the first CSV row supplies the column names and Spark infers
# each column's type from the file contents.
data = spark.read.csv("/input/basketball_ml_data.csv", header=True, inferSchema=True)

# Prepare features: pack the predictor columns into one vector column named
# "features" and expose the "points" column under the name "label".
feature_columns = ["team_strength", "opponent_strength", "previous_avg_points"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)
# Reference the column through the DataFrame itself: `col` is not imported
# in this script, so calling it here would raise NameError.
data = data.select("features", data["points"].alias("label"))

# Split data with 0.8 / 0.2 weights into training and held-out test sets.
# The fixed seed seeds the sampler so reruns on unchanged input draw the
# same split and the results below stay comparable.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Train model: the column names are spelled out so the dependency on the
# "features" / "label" columns built above is visible at the call site.
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_data)

# Evaluate model: score the held-out rows and display each prediction next
# to its true label for a visual check.
predictions = lr_model.transform(test_data)
predictions.select("features", "label", "prediction").show()
