In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("MLAnalysisAdClicks").getOrCreate()
sc = spark.sparkContext

24/04/22 13:08:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Load normalized data
df = spark.read.option("delimiter", "\t") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/home/bw2298/combined_training_before_normalize.txt")

# Select necessary columns and cast them as needed
from pyspark.sql.functions import col

df = df.select(
    col("numClick").cast("int"),
    col("region").cast("int"),
    col("city").cast("int"),
    col("adExchange").cast("int"),
    col("width").cast("int"),
    col("height").cast("int"),
    col("floorPrice").cast("int"),
    col("weekday"),
    col("hour").cast("int")
)

# Check the data
df.show(5)

                                                                                

+--------+------+----+----------+-----+------+----------+-------+----+
|numClick|region|city|adExchange|width|height|floorPrice|weekday|hour|
+--------+------+----+----------+-----+------+----------+-------+----+
|       0|    94| 100|         2|  468|    60|        13| Monday|   0|
|       0|    40|  42|         2|  728|    90|         5| Monday|   0|
|       0|    40|  45|         1|  160|   600|         0| Monday|   0|
|       0|    80|  85|         2|  300|   250|         5| Monday|   0|
|       0|     0|   0|         2|  300|   250|         5| Monday|   0|
+--------+------+----+----------+-----+------+----------+-------+----+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Indexing and encoding categorical columns
categoricalColumns = ['region', 'city', 'adExchange', 'weekday']
stages = []

for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

# Assembling vector
assemblerInputs = [c + "classVec" for c in categoricalColumns] + ['width', 'height', 'floorPrice', 'hour']
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, GBTClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Set up the evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="numClick", metricName="areaUnderROC")

# Logistic Regression
lr = LogisticRegression(labelCol="numClick", featuresCol="features", maxIter=10)
pipeline_lr = Pipeline(stages=stages + [lr])
model_lr = pipeline_lr.fit(df)
predictions_lr = model_lr.transform(df)
print("Logistic Regression AUC: ", evaluator.evaluate(predictions_lr))

24/04/22 13:15:57 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/04/22 13:15:57 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/04/22 13:15:57 WARN MemoryStore: Not enough space to cache rdd_63_12 in memory! (computed 31.2 MiB so far)
24/04/22 13:15:57 WARN BlockManager: Persisting block rdd_63_12 to disk instead.
24/04/22 13:15:57 WARN MemoryStore: Not enough space to cache rdd_63_8 in memory! (computed 30.0 MiB so far)
24/04/22 13:15:57 WARN BlockManager: Persisting block rdd_63_8 to disk instead.
24/04/22 13:15:58 WARN MemoryStore: Not enough space to cache rdd_63_11 in memory! (computed 30.9 MiB so far)
24/04/22 13:15:58 WARN BlockManager: Persisting block rdd_63_11 to disk instead.
24/04/22 13:15:58 WARN MemoryStore: Not enough space to cache rdd_63_8 in memory! (computed 17.0 MiB so far)
24/04/22 13:15:58 WARN MemoryStore: Not enough space to cache rdd_63_7 in memory! (computed 33.0 MiB so far)


Logistic Regression AUC:  0.6114053567631033


In [7]:
# Gradient Boosting Trees
gbt = GBTClassifier(labelCol="numClick", featuresCol="features", maxIter=10)
pipeline_gbt = Pipeline(stages=stages + [gbt])
model_gbt = pipeline_gbt.fit(df)
predictions_gbt = model_gbt.transform(df)
print("Gradient Boosting AUC: ", evaluator.evaluate(predictions_gbt))

24/04/22 13:17:51 WARN MemoryStore: Not enough space to cache rdd_203_8 in memory! (computed 19.1 MiB so far)
24/04/22 13:17:51 WARN BlockManager: Persisting block rdd_203_8 to disk instead.
24/04/22 13:17:51 WARN MemoryStore: Not enough space to cache rdd_203_6 in memory! (computed 19.1 MiB so far)
24/04/22 13:17:51 WARN BlockManager: Persisting block rdd_203_6 to disk instead.
24/04/22 13:17:51 WARN MemoryStore: Not enough space to cache rdd_203_1 in memory! (computed 29.8 MiB so far)
24/04/22 13:17:51 WARN BlockManager: Persisting block rdd_203_1 to disk instead.
24/04/22 13:17:52 WARN MemoryStore: Not enough space to cache rdd_203_14 in memory! (computed 29.8 MiB so far)
24/04/22 13:17:52 WARN BlockManager: Persisting block rdd_203_14 to disk instead.
24/04/22 13:17:52 WARN MemoryStore: Not enough space to cache rdd_203_0 in memory! (computed 19.1 MiB so far)
24/04/22 13:17:52 WARN BlockManager: Persisting block rdd_203_0 to disk instead.
24/04/22 13:17:52 WARN MemoryStore: Not eno

Gradient Boosting AUC:  0.6363557826714065


                                                                                

In [8]:
# Support Vector Machine
svm = LinearSVC(labelCol="numClick", featuresCol="features", maxIter=10)
pipeline_svm = Pipeline(stages=stages + [svm])
model_svm = pipeline_svm.fit(df)
predictions_svm = model_svm.transform(df)
print("SVM AUC: ", evaluator.evaluate(predictions_svm))

24/04/22 13:23:08 WARN MemoryStore: Not enough space to cache rdd_490_15 in memory! (computed 31.2 MiB so far)
24/04/22 13:23:08 WARN BlockManager: Persisting block rdd_490_15 to disk instead.
24/04/22 13:23:09 WARN MemoryStore: Not enough space to cache rdd_490_14 in memory! (computed 31.1 MiB so far)
24/04/22 13:23:09 WARN BlockManager: Persisting block rdd_490_14 to disk instead.
24/04/22 13:23:09 WARN MemoryStore: Not enough space to cache rdd_490_15 in memory! (computed 31.2 MiB so far)
24/04/22 13:23:09 WARN MemoryStore: Not enough space to cache rdd_490_4 in memory! (computed 33.0 MiB so far)
24/04/22 13:23:09 WARN BlockManager: Persisting block rdd_490_4 to disk instead.
24/04/22 13:23:09 WARN MemoryStore: Not enough space to cache rdd_490_6 in memory! (computed 33.0 MiB so far)
24/04/22 13:23:09 WARN BlockManager: Persisting block rdd_490_6 to disk instead.
24/04/22 13:23:09 WARN MemoryStore: Not enough space to cache rdd_490_6 in memory! (computed 17.0 MiB so far)
24/04/22 13

SVM AUC:  0.5139024891407198
