In [1]:
df = sqlContext.read.csv('s3a://sparkdemonstration/10M.csv', header=True, inferSchema=True)

from pyspark.sql.types import IntegerType
from math import floor
from pyspark.sql.functions import rand
from pyspark.sql.functions import col

def stratifiedSample(df, N, labelCol="y"):
    ctx = df.groupby(labelCol).count()
    ctx = ctx.withColumn('frac', col("count") / df.count())
    frac = ctx.select("y", "frac").rdd.collectAsMap()
    pos = int(floor(frac[1] * N))
    neg = int(floor(frac[0] * N))
    posDF = df.filter(col(labelCol) == 1).orderBy(rand()).limit(pos)
    negDF = df.filter(col(labelCol) == 0).orderBy(rand()).limit(neg)
    return posDF.unionAll(negDF).orderBy(rand())

df = df.withColumn("y", df["click"].cast(IntegerType()))
xdf = stratifiedSample(df, 1_000_000)

xdf.printSchema()

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1589623986229_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- click: boolean (nullable = true)
 |-- C1: integer (nullable = true)
 |-- banner_pos: integer (nullable = true)
 |-- site_id: string (nullable = true)
 |-- site_domain: string (nullable = true)
 |-- site_category: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_domain: string (nullable = true)
 |-- app_category: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_ip: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- device_type: integer (nullable = true)
 |-- device_conn_type: integer (nullable = true)
 |-- C14: integer (nullable = true)
 |-- C15: integer (nullable = true)
 |-- C16: integer (nullable = true)
 |-- C17: integer (nullable = true)
 |-- C18: integer (nullable = true)
 |-- C19: integer (nullable = true)
 |-- C20: integer (nullable = true)
 |-- C21: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- 

In [2]:
cCols = ['C1', 'banner_pos', 'site_domain', 'site_category', 'app_domain',
           'app_category', 'device_model', 'device_type', 'device_conn_type']
cCols += ['C{}'.format(i) for i in range(14, 22)]
xdf.select(*cCols).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+----------+-----------+-------------+----------+------------+------------+-----------+----------------+-----+---+---+----+---+----+------+---+
|  C1|banner_pos|site_domain|site_category|app_domain|app_category|device_model|device_type|device_conn_type|  C14|C15|C16| C17|C18| C19|   C20|C21|
+----+----------+-----------+-------------+----------+------------+------------+-----------+----------------+-----+---+---+----+---+----+------+---+
|1005|         0|   d262cf1e|     f66779e6|  7801e8d9|    07d7df22|    c1a17447|          1|               0|22202|320| 50|2558|  0| 417|100004|163|
|1005|         0|   f3845767|     28905ebd|  7801e8d9|    07d7df22|    be87996b|          1|               2|15699|320| 50|1722|  0|  35|100084| 79|
|1005|         1|   9cf7de2f|     f028772b|  7801e8d9|    07d7df22|    1ccc7835|          1|               0|16615|320| 50|1863|  3|  39|    -1| 23|
|1005|         0|   c4e18dd6|     50e219e0|  5c5a694b|    0f2161f8|    d4897fef|          1|              

In [3]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler

stringCols = ['site_domain', 'site_category', 'app_domain', 'app_category', 'device_model']
for c in stringCols:
    outCol = c + '_ix'
    si = StringIndexer(inputCol=c, outputCol=outCol)
    xdf = si.fit(xdf).transform(xdf)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

posMapper = udf(lambda x: 0 if x < 0 else x)
xdf = xdf.withColumn('C20_1', posMapper(xdf['C20']))
xdf = xdf.withColumn("C20_1int", xdf['C20_1'].cast(IntegerType()))

categoricalCols = [c+'_ix' for c in ['site_domain', 'site_category', 'app_domain', 'app_category', 'device_model']]
categoricalCols += ['C1', 'C14', 'C15', 'C16', 'C17', 'C19', 'C20_1int', 'C21']

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
ohe = OneHotEncoderEstimator(inputCols=categoricalCols, outputCols=[c+'Enc' for c in categoricalCols])
enc_model = ohe.fit(xdf)
xdf = enc_model.transform(xdf)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
trainCols = [c+'Enc' for c in categoricalCols]
trainCols += ['hour', 'day', 'dayofweek']
xdf.select('month').distinct().show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+
|month|
+-----+
|   10|
+-----+

In [7]:
assembler = VectorAssembler(inputCols=trainCols, outputCol='features')
xdf = assembler.transform(xdf)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='y')
model = lr.fit(xdf)
result = model.evaluate(xdf)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
result.areaUnderROC

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.7467668629930857

In [10]:
result.recallByLabel

VBox()

An error was encountered:
Session 0 unexpectedly reached final status 'dead'. See logs:
stdout: 

stderr: 
20/05/16 05:22:50 INFO TaskSetManager: Finished task 13.0 in stage 633.0 (TID 25764) in 1212 ms on ip-172-31-85-145.ec2.internal (executor 1) (12/200)
20/05/16 05:22:50 INFO TaskSetManager: Starting task 22.0 in stage 633.0 (TID 25775, ip-172-31-85-145.ec2.internal, executor 1, partition 22, PROCESS_LOCAL, 7767 bytes)
20/05/16 05:22:50 INFO TaskSetManager: Finished task 14.0 in stage 633.0 (TID 25766) in 1173 ms on ip-172-31-85-145.ec2.internal (executor 1) (13/200)
20/05/16 05:22:50 INFO TaskSetManager: Starting task 23.0 in stage 633.0 (TID 25776, ip-172-31-85-145.ec2.internal, executor 1, partition 23, PROCESS_LOCAL, 7767 bytes)
20/05/16 05:22:50 INFO TaskSetManager: Finished task 12.0 in stage 633.0 (TID 25763) in 1255 ms on ip-172-31-85-145.ec2.internal (executor 1) (14/200)
20/05/16 05:22:50 INFO TaskSetManager: Starting task 18.0 in stage 633.0 (TID 25777, ip-172-31-80-178.

In [None]:
L1 = [0.001, 0.005, 0.01, 0.05, 0.1]
recall = []
auc = []

for l1 in L1:
    lr = LogisticRegression(featuresCol='features', labelCol='y', elasticNetParam=1, regParam=l1)
    model = lr.fit(xdf)
    result = model.evaluate(xdf)
    recall.append(result.recallByLabel)
    auc.append(result.areaUnderROC)
    print(recall[-1], auc[-1])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…