In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session through getOrCreate(), asking for 32g of
# memory for the driver and 32g for each executor.
spark = (
    SparkSession.builder
    .appName("MLAnalysisAdClicks")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    .getOrCreate()
)

# Keep a handle on the SparkContext that backs the session
sc = spark.sparkContext



24/04/25 13:22:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
from pyspark.sql.functions import col

# Read the tab-separated training file (per its name, this is the data *before*
# normalization) and keep only the columns used for modelling.
# NOTE(review): inferSchema makes Spark scan the file an extra time, yet every
# numeric column is cast explicitly below -- consider an explicit schema.
df = (
    spark.read
    .options(delimiter="\t", header="true", inferSchema="true")
    .csv("/home/bw2298/combined_training_before_normalize.txt")
    .select(
        # Label and numeric/ID-like columns are all forced to int.
        *[
            col(name).cast("int")
            for name in (
                "numClick",
                "region",
                "city",
                "adExchange",
                "width",
                "height",
                "floorPrice",
            )
        ],
        # weekday is selected as-is, with no cast.
        col("weekday"),
        col("hour").cast("int"),
    )
)

# Preview the first rows as a sanity check
df.show(5)

                                                                                

+--------+------+----+----------+-----+------+----------+-------+----+
|numClick|region|city|adExchange|width|height|floorPrice|weekday|hour|
+--------+------+----+----------+-----+------+----------+-------+----+
|       0|    94| 100|         2|  468|    60|        13| Monday|   0|
|       0|    40|  42|         2|  728|    90|         5| Monday|   0|
|       0|    40|  45|         1|  160|   600|         0| Monday|   0|
|       0|    80|  85|         2|  300|   250|         5| Monday|   0|
|       0|     0|   0|         2|  300|   250|         5| Monday|   0|
+--------+------+----+----------+-----+------+----------+-------+----+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Columns treated as categorical: each one is indexed ("<col>Index") and then
# one-hot encoded ("<col>classVec").
categoricalColumns = ['region', 'city', 'adExchange', 'weekday']

# Feature-engineering stages, in execution order; the model cells append their
# estimator to a copy of this list.
stages = []
for name in categoricalColumns:
    index_col = name + "Index"
    stages.extend([
        StringIndexer(inputCol=name, outputCol=index_col),
        OneHotEncoder(inputCols=[index_col], outputCols=[name + "classVec"]),
    ])

# Final feature vector: the encoded categoricals followed by the numeric columns.
assemblerInputs = [name + "classVec" for name in categoricalColumns]
assemblerInputs.extend(['width', 'height', 'floorPrice', 'hour'])
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, GBTClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Shared evaluator: area under the ROC curve, computed from the
# "rawPrediction" column against the "numClick" label.
evaluator = BinaryClassificationEvaluator(
    labelCol="numClick",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Logistic regression on top of the feature stages, capped at 10 iterations.
# NOTE(review): the pipeline is fitted on `df` and scored on the same `df`, so
# the printed AUC is an in-sample figure, not a held-out estimate.
lr = LogisticRegression(featuresCol="features", labelCol="numClick", maxIter=10)
pipeline_lr = Pipeline(stages=[*stages, lr])
model_lr = pipeline_lr.fit(df)
predictions_lr = model_lr.transform(df)
auc_lr = evaluator.evaluate(predictions_lr)
print("Logistic Regression AUC: ", auc_lr)

24/04/25 13:23:26 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/04/25 13:23:26 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

Logistic Regression AUC:  0.6114016201900049


In [6]:
# Gradient-boosted trees on top of the same feature stages, capped at 10
# boosting iterations.
# NOTE(review): fitted on `df` and scored on the same `df`, so the printed AUC
# is an in-sample figure, not a held-out estimate.
gbt = GBTClassifier(featuresCol="features", labelCol="numClick", maxIter=10)
pipeline_gbt = Pipeline(stages=[*stages, gbt])
model_gbt = pipeline_gbt.fit(df)
predictions_gbt = model_gbt.transform(df)
auc_gbt = evaluator.evaluate(predictions_gbt)
print("Gradient Boosting AUC: ", auc_gbt)



Gradient Boosting AUC:  0.636350438444325


                                                                                

In [7]:
# Linear support vector classifier on top of the same feature stages, capped at
# 10 iterations.
# NOTE(review): fitted on `df` and scored on the same `df`, so the printed AUC
# is an in-sample figure, not a held-out estimate.
svm = LinearSVC(featuresCol="features", labelCol="numClick", maxIter=10)
pipeline_svm = Pipeline(stages=[*stages, svm])
model_svm = pipeline_svm.fit(df)
predictions_svm = model_svm.transform(df)
auc_svm = evaluator.evaluate(predictions_svm)
print("SVM AUC: ", auc_svm)

                                                                                

SVM AUC:  0.51549962635207
