In [1]:
spark

In [2]:
df = sqlContext.read.csv('../data/avazu/1M.csv', header=True, inferSchema=True)

In [3]:
catCols = ['C1'] + [f'C{k}' for k in range(14, 22)]
catCols += ['banner_pos', 'site_category', 'device_type', 'device_conn_type',
            'site_id', 'site_domain', 'app_id', 'app_domain', 'app_category', 'device_model']
df.select(catCols).show()

+----+-----+---+---+----+---+----+------+---+----------+-------------+-----------+----------------+--------+-----------+--------+----------+------------+------------+
|  C1|  C14|C15|C16| C17|C18| C19|   C20|C21|banner_pos|site_category|device_type|device_conn_type| site_id|site_domain|  app_id|app_domain|app_category|device_model|
+----+-----+---+---+----+---+----+------+---+----------+-------------+-----------+----------------+--------+-----------+--------+----------+------------+------------+
|1005|15699|320| 50|1722|  0|  35|    -1| 79|         0|     28905ebd|          1|               0|1fbe01fe|   f3845767|ecad2386|  7801e8d9|    07d7df22|    c6263d8a|
|1005|19733|320| 50|2260|  0| 171|100156| 91|         0|     50e219e0|          1|               0|85f751fd|   c4e18dd6|3f2a6cbb|  33da2e74|    cef3e649|    03683bd4|
|1005|15703|320| 50|1722|  0|  35|    -1| 79|         0|     28905ebd|          1|               0|1fbe01fe|   f3845767|ecad2386|  7801e8d9|    07d7df22|    76dc4769

In [4]:
from pyspark.sql.functions import udf

In [5]:
posMapper = udf(lambda x: 0 if x < 0 else x)
df = df.withColumn('C20_1', posMapper(df['C20']))

In [6]:
catCols.remove('C20')
catCols.append('C20_1')

In [7]:
from pyspark.ml.feature import OneHotEncoderEstimator

In [8]:
from pyspark.ml.feature import StringIndexer
for c in ['site_category', 'site_id', 'site_domain', 'app_id', 'app_domain', 'app_category', 'device_model']:
    si = StringIndexer(inputCol=c, outputCol=f'{c}_ix')
    df = si.fit(df).transform(df)
    catCols.remove(c)
    catCols.append(f'{c}_ix')

In [9]:
from pyspark.sql.types import IntegerType
df = df.withColumn("C20_1int", df['C20_1'].cast(IntegerType()))
catCols.remove('C20_1')
catCols.append('C20_1int')

In [10]:
encoder = OneHotEncoderEstimator(inputCols=catCols, outputCols=[c + 'Enc' for c in catCols])
enc_model = encoder.fit(df)
encoded = enc_model.transform(df)

In [11]:
trainCols = [c for c in encoded.columns if c.endswith('Enc')] + ['day', 'hour', 'dayofweek']

In [12]:
encoded = encoded.withColumn('label', encoded['click'].cast(IntegerType()))

In [13]:
from pyspark.ml.classification import LogisticRegression

In [14]:
from pyspark.ml.feature import VectorAssembler

In [15]:
va = VectorAssembler(inputCols=trainCols, outputCol='features')
encoded = va.transform(encoded)

In [16]:
from tqdm import tqdm

In [24]:
auc = []
recall = []
L1 = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
for l in tqdm(L1):
    lr = LogisticRegression(featuresCol='features', labelCol='label', elasticNetParam=1, regParam=l)
    model = lr.fit(encoded)
    result = model.evaluate(encoded)
    auc.append(result.areaUnderROC)
    recall.append(result.recallByLabel)

100%|██████████| 6/6 [19:08<00:00, 136.17s/it]


In [25]:
auc

[0.7408263421238128,
 0.7076476053984169,
 0.6804391846225136,
 0.587018143629099,
 0.5,
 0.5]

In [28]:
recall

[[0.9920536645651499, 0.06402638320426371],
 [0.996516476871671, 0.029463207797179116],
 [0.9996386386796339, 0.005323753717499485],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0]]

In [29]:
auc = []
recall = []
L1 = [0.0001, 0.0002, 0.0005, 0.0007, 0.001]
for l in tqdm(L1):
    lr = LogisticRegression(featuresCol='features', labelCol='label', elasticNetParam=1, regParam=l)
    model = lr.fit(encoded)
    result = model.evaluate(encoded)
    auc.append(result.areaUnderROC)
    recall.append(result.recallByLabel)

100%|██████████| 5/5 [18:31<00:00, 225.08s/it]


In [18]:
auc

[0.7617144797424745,
 0.7616922664507153,
 0.7612807183694248,
 0.7602531474308007,
 0.7577812378170214]

In [19]:
recall

[[0.9881304851637087, 0.09845999823326756],
 [0.9882099846541893, 0.09809487353140367],
 [0.9882665979277133, 0.09754718647860781],
 [0.9887243222668437, 0.09458496510703454],
 [0.9891290469456536, 0.09082182503459851]]

In [17]:
auc = []
recall = []
L1 = [0.00001, 0.00002, 0.00005, 0.0001, 0.0002]
for l in tqdm(L1):
    lr = LogisticRegression(featuresCol='features', labelCol='label', elasticNetParam=1, regParam=l)
    model = lr.fit(encoded)
    result = model.evaluate(encoded)
    auc.append(result.areaUnderROC)
    recall.append(result.recallByLabel)

100%|██████████| 5/5 [18:33<00:00, 219.25s/it]


In [20]:
result.precisionByLabel

[0.8417480564849439, 0.630834049167587]

In [21]:
auc

[0.7617144797424745,
 0.7616922664507153,
 0.7612807183694248,
 0.7602531474308007,
 0.7577812378170214]

In [22]:
recall

[[0.9881304851637087, 0.09845999823326756],
 [0.9882099846541893, 0.09809487353140367],
 [0.9882665979277133, 0.09754718647860781],
 [0.9887243222668437, 0.09458496510703454],
 [0.9891290469456536, 0.09082182503459851]]

In [28]:
auc = []
recall = []
L1 = [0.000001, 0.000002, 0.000005, 0.00001]
for l in tqdm(L1):
    lr = LogisticRegression(featuresCol='features', labelCol='label', elasticNetParam=1, regParam=l)
    model = lr.fit(encoded)
    result = model.evaluate(encoded)
    auc.append(result.areaUnderROC)
    recall.append(result.recallByLabel)

100%|██████████| 4/4 [12:39<00:00, 191.37s/it]


In [29]:
auc

[0.7617552201376266,
 0.7617331959413733,
 0.7617276905715271,
 0.7617163913010492]

In [30]:
recall

[[0.9879775088714204, 0.09903713082653633],
 [0.9880003950883769, 0.09906068725891463],
 [0.988013645003457, 0.09891934866464475],
 [0.9881304851637087, 0.09845999823326756]]

In [33]:
auc = []
recall = []
L1 = [0.0000001, 0.0000002, 0.0000005, 0.000001]
for l in tqdm(L1):
    lr = LogisticRegression(featuresCol='features', labelCol='label', elasticNetParam=1, regParam=l)
    model = lr.fit(encoded)
    result = model.evaluate(encoded)
    auc.append(result.areaUnderROC)
    recall.append(result.recallByLabel)


  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [04:47<14:21, 287.03s/it][A
 50%|█████     | 2/4 [08:11<08:44, 262.13s/it][A
 75%|███████▌  | 3/4 [11:26<04:02, 242.06s/it][A
100%|██████████| 4/4 [14:39<00:00, 227.52s/it][A

In [34]:
auc

[0.7617689728508258,
 0.7617775921073082,
 0.7617591900401888,
 0.7617539369029065]

In [35]:
recall

[[0.987811282664052, 0.09985571685168282],
 [0.9877956236735028, 0.0999734990135744],
 [0.9879642589563403, 0.0992668060422249],
 [0.9879775088714204, 0.09903713082653633]]