In [1]:
spark

In [36]:
df = sqlContext.read.csv('../data/avazu/1M.csv', header=True, inferSchema=True)
df.show()

+-----+----+----------+--------+-----------+-------------+--------+----------+------------+---------+---------+------------+-----------+----------------+-----+---+---+----+---+----+------+---+---------+---+----+
|click|  C1|banner_pos| site_id|site_domain|site_category|  app_id|app_domain|app_category|device_id|device_ip|device_model|device_type|device_conn_type|  C14|C15|C16| C17|C18| C19|   C20|C21|dayofweek|day|hour|
+-----+----+----------+--------+-----------+-------------+--------+----------+------------+---------+---------+------------+-----------+----------------+-----+---+---+----+---+----+------+---+---------+---+----+
|false|1005|         0|1fbe01fe|   f3845767|     28905ebd|ecad2386|  7801e8d9|    07d7df22| a99f214a| c2085c57|    c6263d8a|          1|               0|15699|320| 50|1722|  0|  35|    -1| 79|        1| 21|   6|
|false|1005|         0|85f751fd|   c4e18dd6|     50e219e0|3f2a6cbb|  33da2e74|    cef3e649| a99f214a| af62faf4|    03683bd4|          1|               0

In [37]:
catCols = ['C1'] + [f'C{k}' for k in range(14, 22)]
catCols += ['banner_pos', 'site_category', 'device_type', 'device_conn_type']

In [18]:
df.select(catCols).show()

+----+-----+---+---+----+---+----+------+---+----------+-------------+-----------+----------------+
|  C1|  C14|C15|C16| C17|C18| C19|   C20|C21|banner_pos|site_category|device_type|device_conn_type|
+----+-----+---+---+----+---+----+------+---+----------+-------------+-----------+----------------+
|1005|15699|320| 50|1722|  0|  35|    -1| 79|         0|     28905ebd|          1|               0|
|1005|19733|320| 50|2260|  0| 171|100156| 91|         0|     50e219e0|          1|               0|
|1005|15703|320| 50|1722|  0|  35|    -1| 79|         0|     28905ebd|          1|               0|
|1005|15705|320| 50|1722|  0|  35|    -1| 79|         0|     28905ebd|          1|               0|
|1005| 6563|320| 50| 572|  2|  39|    -1| 32|         0|     50e219e0|          1|               0|
|1005|19998|216| 36|2281|  3|  47|100182| 42|         2|     f66779e6|          1|               0|
|1005|21191|320| 50|2424|  1| 161|    -1| 71|         0|     50e219e0|          1|               3|


In [30]:
from pyspark.sql.functions import udf

In [38]:
posMapper = udf(lambda x: 0 if x < 0 else x)
df = df.withColumn('C20_1', posMapper(df['C20']))

In [39]:
from pyspark.ml.feature import OneHotEncoderEstimator

In [40]:
from pyspark.ml.feature import StringIndexer
si = StringIndexer(inputCol='site_category', outputCol='site_category_ix')
df = si.fit(df).transform(df)
catCols.remove('site_category')
catCols.remove('C20')
catCols.append('site_category_ix')
catCols.append('C20_1')

In [42]:
df.select('C20_1').show()

+------+
| C20_1|
+------+
|     0|
|100156|
|     0|
|     0|
|     0|
|100182|
|     0|
|100148|
|100084|
|100084|
|100020|
|     0|
|     0|
|     0|
|     0|
|     0|
|100081|
|     0|
|100081|
|     0|
+------+
only showing top 20 rows



In [43]:
from pyspark.sql.types import IntegerType
df = df.withColumn("C20_1int", df['C20_1'].cast(IntegerType()))
catCols.remove('C20_1')
catCols.append('C20_1int')

In [None]:
encoder = OneHotEncoderEstimator(inputCols=catCols, outputCols=[c + 'Enc' for c in catCols])
enc_model = encoder.fit(df)
encoded = enc_model.transform(df)

In [50]:
encoded.select('C20_1intEnc').head().C20_1intEnc

SparseVector(100248, {0: 1.0})

In [51]:
encoded.columns

['click',
 'C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'dayofweek',
 'day',
 'hour',
 'C20_1',
 'site_category_ix',
 'C20_1int',
 'C15Enc',
 'C21Enc',
 'C17Enc',
 'device_conn_typeEnc',
 'C19Enc',
 'C14Enc',
 'site_category_ixEnc',
 'device_typeEnc',
 'C1Enc',
 'C16Enc',
 'C18Enc',
 'banner_posEnc',
 'C20_1intEnc']

In [52]:
trainCols = [c for c in encoded.columns if c.endswith('Enc')] + ['day', 'hour', 'dayofweek']

In [55]:
encoded = encoded.withColumn('label', encoded['click'].cast(IntegerType()))

In [None]:
encoded.select(*trainCols).show()

In [57]:
from pyspark.ml.classification import LogisticRegression

In [59]:
lr = LogisticRegression(featuresCol=trainCols, labelCol='label')

TypeError: Invalid param value given for param "featuresCol". Could not convert <class 'list'> to string type

In [60]:
trainCols

['C15Enc',
 'C21Enc',
 'C17Enc',
 'device_conn_typeEnc',
 'C19Enc',
 'C14Enc',
 'site_category_ixEnc',
 'device_typeEnc',
 'C1Enc',
 'C16Enc',
 'C18Enc',
 'banner_posEnc',
 'C20_1intEnc',
 'day',
 'hour',
 'dayofweek']

In [61]:
from pyspark.ml.feature import VectorAssembler

In [62]:
va = VectorAssembler(inputCols=trainCols, outputCol='features')
encoded = va.transform(encoded)

In [63]:
lr = LogisticRegression(featuresCol='features', labelCol='label')

In [64]:
lr.fit(encoded)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/jaidevd/src/spark/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/home/jaidevd/src/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o712.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 59.0 failed 1 times, most recent failure: Lost task 3.0 in stage 59.0 (TID 1912, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.io.ObjectInputStream$HandleTable.grow(ObjectInputStream.java:3898)
	at java.io.ObjectInputStream$HandleTable.assign(ObjectInputStream.java:3705)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2115)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1625)
	at java.io.ObjectInputStream.defaultReadFields(Objec

Py4JError: An error occurred while calling None.None

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 59860)
Traceback (most recent call last):
  File "/home/jaidevd/anaconda3/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/jaidevd/anaconda3/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/home/jaidevd/anaconda3/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/jaidevd/anaconda3/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/home/jaidevd/src/spark/python/pyspark/accumulators.py", line 269, in handle
    poll(accum_updates)
  File "/home/jaidevd/src/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/home/jaidevd/src/spark/python/pyspark/accumulators.py", line 245, in accum_updates
  