In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook with explicit
# executor/driver sizing and shuffle parallelism.
spark = (
    SparkSession.builder
    .appName("Data Investigation")  # sets spark.app.name
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

# Source CSV files in Google Cloud Storage: csv_path1 is the paid-games
# extract, csv_path2 is the free-games extract.
csv_path1 = "gs://msca-bdp-student-gcs/Group5/Steam_data/paid_data/part-00000-bd5f2055-83c5-471b-ba9c-1036cce464a9-c000.csv"
csv_path2 = "gs://msca-bdp-student-gcs/Group5/Steam_data/free_data/part-00000-cf669363-29bf-4c70-be25-878359d34cb3-c000.csv"

# Both files are read the same way: first row is the header and column
# types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
df_p = spark.read.csv(csv_path1, **csv_options)  # paid
df_f = spark.read.csv(csv_path2, **csv_options)  # free

                                                                                

In [2]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when
def add_ccu_category(df, medium_high_upper, high_upper):
    """Return `df` with a string column `ccu_category` derived from `ccu`.

    Buckets (lower bound inclusive, upper bound exclusive):
        ccu < 500                              -> "low"
        500 <= ccu < 5000                      -> "medium_low"
        5000 <= ccu < medium_high_upper        -> "medium_high"
        medium_high_upper <= ccu < high_upper  -> "high"
        ccu >= high_upper                      -> "Very high"

    Args:
        df: Spark DataFrame that has a `ccu` column.
        medium_high_upper: exclusive upper bound of the "medium_high" bucket.
        high_upper: exclusive upper bound of the "high" bucket.

    Returns:
        A new DataFrame with `ccu_category` added (or replaced if it
        already exists).

    Rows whose `ccu` is null get a null `ccu_category`. A null comparison
    is never true, so a catch-all `otherwise("Very high")` would silently
    label missing values as the top bucket; the last bucket is therefore
    an explicit condition and there is no `otherwise`.
    """
    ccu = df["ccu"]
    return df.withColumn(
        "ccu_category",
        when(ccu < 500, "low")
        .when((ccu >= 500) & (ccu < 5000), "medium_low")
        .when((ccu >= 5000) & (ccu < medium_high_upper), "medium_high")
        .when((ccu >= medium_high_upper) & (ccu < high_upper), "high")
        .when(ccu >= high_upper, "Very high"),
    )


# Paid games: "high" spans 20,000-100,000 concurrent users.
df_p = add_ccu_category(df_p, medium_high_upper=20000, high_upper=100000)

# Free games use wider upper buckets: "high" spans 50,000-200,000.
df_f = add_ccu_category(df_f, medium_high_upper=50000, high_upper=200000)


In [6]:
# Keep only the app identifier and its CCU bucket for each dataset.
category_columns = ["appid", "ccu_category"]
df_f_cc = df_f.select(*category_columns)
df_p_cc = df_p.select(*category_columns)

In [7]:
# GCS destinations for the two projections. In the variable names and the
# log messages below, "df1" is the free dataset (df_f_cc) and "df2" is the
# paid dataset (df_p_cc).
output_path_df1 = "gs://msca-bdp-student-gcs/Group5/Steam_data/df_f_cc.parquet"
output_path_df2 = "gs://msca-bdp-student-gcs/Group5/Steam_data/df_p_cc.parquet"

# Save the free-games categories as parquet using "overwrite" save mode.
df_f_cc.write.mode("overwrite").parquet(output_path_df1)
print(f"df1 successfully stored at: {output_path_df1}")

# Save the paid-games categories the same way.
df_p_cc.write.mode("overwrite").parquet(output_path_df2)
print(f"df2 successfully stored at: {output_path_df2}")

                                                                                

df1 successfully stored at: gs://msca-bdp-student-gcs/Group5/Steam_data/df_f_cc.parquet


                                                                                

df2 successfully stored at: gs://msca-bdp-student-gcs/Group5/Steam_data/df_p_cc.parquet
