In [29]:
t0_df = spark.read.csv("sample-data/train_sample.csv",header=True,inferSchema=True)
t0_df.printSchema()

root
 |-- ip: integer (nullable = true)
 |-- app: integer (nullable = true)
 |-- device: integer (nullable = true)
 |-- os: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- click_time: timestamp (nullable = true)
 |-- attributed_time: timestamp (nullable = true)
 |-- is_attributed: integer (nullable = true)



### Original Schema
- ip - IP address of click
- app - app id for marketing
- device - device **type** id of user mobile phone (e.g. iphone 6, iphone 7, etc.)
- os - os version id of user mobile phone
- channel - channel id of mobile ad publisher
- click_time - timestamp of click (UTC)
- attributed_time - if user downloaded the app after clicking an ad, this is the time of the app download
- is_attributed - the target that is to be predicted

In [30]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F

#Extract day and hour from click time
t0_df = t0_df.withColumn("day",dayofmonth(t0_df["click_time"]))
t0_df = t0_df.withColumn("hour",hour(t0_df["click_time"]))

#Add binary features

#Top Channels for positive examples
t0_df = t0_df.withColumn("isChannel213",F.when(t0_df["channel"] == 213,1).otherwise(0))
t0_df = t0_df.withColumn("isChannel113",F.when(t0_df["channel"] == 113,1).otherwise(0))
t0_df = t0_df.withColumn("isChannel21",F.when(t0_df["channel"] == 211,1).otherwise(0))

#Top apps for positive examples
t0_df = t0_df.withColumn("isApp19",F.when(t0_df["app"] == 19,1).otherwise(0))
t0_df = t0_df.withColumn("isApp35",F.when(t0_df["app"] == 35,1).otherwise(0))
t0_df = t0_df.withColumn("isApp29",F.when(t0_df["app"] == 29,1).otherwise(0))
t0_df = t0_df.withColumn("isApp10",F.when(t0_df["app"] == 10,1).otherwise(0))

t0_df.printSchema()

root
 |-- ip: integer (nullable = true)
 |-- app: integer (nullable = true)
 |-- device: integer (nullable = true)
 |-- os: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- click_time: timestamp (nullable = true)
 |-- attributed_time: timestamp (nullable = true)
 |-- is_attributed: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- isChannel213: integer (nullable = false)
 |-- isChannel113: integer (nullable = false)
 |-- isChannel21: integer (nullable = false)
 |-- isApp19: integer (nullable = false)
 |-- isApp35: integer (nullable = false)
 |-- isApp29: integer (nullable = false)
 |-- isApp10: integer (nullable = false)



### Features added
- day - day of month of click
- hour - hour of day of click
- Top Channel features: "isChannelX"
    - 213
    - 113
    - 21
- Top App features: "isAppX"
    - 19
    - 35
    - 10

In [31]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

#Select features to actually use in training
vectorAssembler = VectorAssembler(inputCols=[
    "app",
    "device",
    "os",
    "channel",
    "day",
    "hour",
    "isChannel213",
    "isChannel113",
    "isChannel21",
    "isApp19",
    "isApp35",
    "isApp29",
    "isApp10"
], outputCol="features")

v_t0_df = vectorAssembler.transform(t0_df)
v_t0_df.select("features","is_attributed").show()

+--------------------+-------------+
|            features|is_attributed|
+--------------------+-------------+
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4],[...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
|(13,[0,1,2,3,4,5]...|            0|
+--------------------+-------------+
only showing top 20 rows



In [32]:
splits = v_final_df.randomSplit([0.6,0.4],1)

train_df = splits[0]
test_df = splits[1]

print(train_df.count(),test_df.count())

60059 39941


In [33]:
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

dt = DecisionTreeClassifier(labelCol="is_attributed",featuresCol="features")

dt_model = dt.fit(train_df)

dt_predictions = dt_model.transform(test_df)

evaluator = BinaryClassificationEvaluator(
    labelCol="is_attributed",
    rawPredictionCol="prediction")

accuracy = evaluator.evaluate(dt_predictions)
accuracy

0.5156890871962319

In [34]:
#test_df = spark.read.csv("sample-data/test.csv",header=True,inferSchema=True)
