In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, when

# Script purpose: report what share of distinct players had at least one game
# in which their summed SCORE reached 40 or more.
spark = SparkSession.builder.appName("Basketball Analysis").getOrCreate()

# Load the dataset (header row supplies column names; types are inferred).
df = spark.read.csv("/input/basketball_pbp.csv", header=True, inferSchema=True)

# Calculate points per player per game.
# Rows without a player name are dropped first so that a null name is not
# counted as a "player" in the totals below.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# file), not the Python builtin.
# NOTE(review): this assumes SCORE holds the numeric points credited to
# PLAYER1_NAME on each row -- confirm against the dataset schema.
player_scores = (
    df.filter(col("PLAYER1_NAME").isNotNull())
    .groupBy("GAME_ID", "PLAYER1_NAME")
    .agg(sum("SCORE").alias("total_points"))
)

# Filter (game, player) rows where the player scored 40 or more points.
players_40_plus = player_scores.filter(col("total_points") >= 40)

# Calculate percentage.
# Both sides of the ratio count DISTINCT players: a player with several 40+
# games must contribute once, otherwise the numerator counts player-games
# while the denominator counts players and the result can exceed 100%.
total_players = player_scores.select("PLAYER1_NAME").distinct().count()
players_with_40_plus = players_40_plus.select("PLAYER1_NAME").distinct().count()

# Guard against an empty dataset (no players) to avoid ZeroDivisionError.
percentage = (players_with_40_plus / total_players) * 100 if total_players else 0.0

print(f"Percentage of players scoring 40+ points: {percentage:.2f}%")
