In [6]:
import gzip
import os
import shutil

from pyspark import SparkContext
from pyspark.sql import SQLContext

In [2]:
# Reuse the already-running SparkContext when this cell is re-executed;
# constructing a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate()
# SQL entry point used by the later cells to read the CSV files.
sql = SQLContext(sc)

In [7]:
# Decompress the gzipped game-clicks CSV into a temporary plain copy.
# copyfileobj streams the data in chunks, so the whole decompressed file
# is never held in memory at once.
with gzip.open('big-data-3/game-clicks.csv.gz', 'rb') as f0:
    with open('game-clicks.csv.tmp', 'wb') as f:
        shutil.copyfileobj(f0, f)

In [15]:
# Load the decompressed game-clicks CSV: the first row supplies the column
# names and the column types are inferred from the data.
df = sql.read.csv(
    path='game-clicks.csv.tmp',
    inferSchema='true',
    header='true',
)
# Print the inferred column names and types.
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- clickId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- userSessionId: integer (nullable = true)
 |-- isHit: integer (nullable = true)
 |-- teamId: integer (nullable = true)
 |-- teamLevel: integer (nullable = true)



In [14]:
# Total number of rows in the game-clicks DataFrame.
df.count()

755806

In [16]:
# Preview the first five game-click rows.
df.show(n=5)

+-------------------+-------+------+-------------+-----+------+---------+
|          timestamp|clickId|userId|userSessionId|isHit|teamId|teamLevel|
+-------------------+-------+------+-------------+-----+------+---------+
|2016-05-26 15:06:55|    105|  1038|         5916|    0|    25|        1|
|2016-05-26 15:07:09|    154|  1099|         5898|    0|    44|        1|
|2016-05-26 15:07:14|    229|   899|         5757|    0|    71|        1|
|2016-05-26 15:07:14|    322|  2197|         5854|    0|    99|        1|
|2016-05-26 15:07:20|     22|  1362|         5739|    0|    13|        1|
+-------------------+-------+------+-------------+-----+------+---------+
only showing top 5 rows



In [18]:
# Project just the user and team-level columns and preview five rows.
# The lower-case names still resolve to the userId/teamLevel columns, and the
# output header uses the spelling given here.
(
    df
    .select('userid', 'teamlevel')
    .show(n=5)
)

+------+---------+
|userid|teamlevel|
+------+---------+
|  1038|        1|
|  1099|        1|
|   899|        1|
|  2197|        1|
|  1362|        1|
+------+---------+
only showing top 5 rows



In [20]:
# Keep only clicks made by teams above level 1, then preview the
# user / team-level pairs.
(
    df
    .filter(df['teamlevel'] > 1)
    .select('userid', 'teamlevel')
    .show(n=5)
)

+------+---------+
|userid|teamlevel|
+------+---------+
|  1513|        2|
|   868|        2|
|  1453|        2|
|  1282|        2|
|  1473|        2|
+------+---------+
only showing top 5 rows



In [24]:
# Count how many clicks fall under each isHit value.
(
    df
    .groupBy('isHit')
    .count()
    .show()
)

+-----+------+
|isHit| count|
+-----+------+
|    1| 83383|
|    0|672423|
+-----+------+



In [26]:
# Import the functions module under an alias instead of pulling `sum` into
# the notebook namespace, where it would shadow Python's built-in sum().
from pyspark.sql import functions as F

# Mean of the 0/1 isHit column (the hit rate) and its sum (the hit count).
df.select(F.mean('isHit'), F.sum('isHit')).show()

+------------------+----------+
|        avg(isHit)|sum(isHit)|
+------------------+----------+
|0.1103232840173272|     83383|
+------------------+----------+



In [27]:
# Decompress the gzipped ad-clicks CSV into a temporary plain copy.
# copyfileobj streams the data in chunks, so the whole decompressed file
# is never held in memory at once.
with gzip.open('big-data-3/ad-clicks.csv.gz', 'rb') as f0:
    with open('ad-clicks.csv.tmp', 'wb') as f:
        shutil.copyfileobj(f0, f)
# Load the decompressed ad-clicks CSV: the first row supplies the column
# names and the column types are inferred from the data.
df2 = sql.read.csv(
    path='ad-clicks.csv.tmp',
    inferSchema='true',
    header='true',
)
# Print the inferred column names and types.
df2.printSchema()


root
 |-- timestamp: timestamp (nullable = true)
 |-- txId: integer (nullable = true)
 |-- userSessionId: integer (nullable = true)
 |-- teamId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- adId: integer (nullable = true)
 |-- adCategory: string (nullable = true)



In [28]:
# Join game clicks with ad clicks on the shared userId column.
# Both inputs also carry timestamp, userSessionId and teamId, so those
# column names appear twice in the joined result.
merge = df.join(other=df2, on='userId')

In [29]:
# Show the joined schema: userId appears once (the join key), while
# timestamp, userSessionId and teamId appear twice, once per input.
merge.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- clickId: integer (nullable = true)
 |-- userSessionId: integer (nullable = true)
 |-- isHit: integer (nullable = true)
 |-- teamId: integer (nullable = true)
 |-- teamLevel: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- txId: integer (nullable = true)
 |-- userSessionId: integer (nullable = true)
 |-- teamId: integer (nullable = true)
 |-- adId: integer (nullable = true)
 |-- adCategory: string (nullable = true)



In [30]:
# Preview the first five rows of the joined DataFrame.
merge.show(n=5)

+------+-------------------+-------+-------------+-----+------+---------+-------------------+-----+-------------+------+----+-----------+
|userId|          timestamp|clickId|userSessionId|isHit|teamId|teamLevel|          timestamp| txId|userSessionId|teamId|adId| adCategory|
+------+-------------------+-------+-------------+-----+------+---------+-------------------+-----+-------------+------+----+-----------+
|  1362|2016-05-26 15:07:20|     22|         5739|    0|    13|        1|2016-06-16 10:21:01|39733|        34223|    13|   1|     sports|
|  1362|2016-05-26 15:07:20|     22|         5739|    0|    13|        1|2016-06-15 23:52:15|38854|        34223|    13|   3|electronics|
|  1362|2016-05-26 15:07:20|     22|         5739|    0|    13|        1|2016-06-15 12:23:31|37940|        34223|    13|  15|     sports|
|  1362|2016-05-26 15:07:20|     22|         5739|    0|    13|        1|2016-06-13 00:12:01|32627|        26427|    13|  14|    fashion|
|  1362|2016-05-26 15:07:20|     2

In [31]:
# Clean up the temporary decompressed CSVs. Skipping paths that are already
# gone keeps this cell safe to re-run.
# NOTE(review): df, df2 and merge were read from these files; further actions
# on those DataFrames after this cell will presumably fail unless the data
# was cached — confirm before adding cells below.
for tmp_path in ('game-clicks.csv.tmp', 'ad-clicks.csv.tmp'):
    if os.path.exists(tmp_path):
        os.remove(tmp_path)