### Import PySpark
Nogle få kommentarer:
 - Download Spark pre-built til Hadoop 2.6. Jeg vil også anbefale jer at bruge Spark 1.6.0, da der er nogle problemer med 1.6.1 ([hent den her](http://www.apache.org/dyn/closer.lua/spark/spark-1.6.0/spark-1.6.0-bin-hadoop2.6.tgz)).
 - husk at ændre paths i denne notebook
 - `os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages com.databricks:spark-avro_2.10:2.0.1 pyspark-shell"` giver jer mulighed for at loade Avro-filer direkte

In [1]:
import sys
import os
import os.path

# Root of the unpacked Spark distribution -- edit this to match your machine.
SPARK_HOME = "spark-1.6.0-bin-hadoop2.6/"

# Put the py4j and pyspark archives bundled with the distribution on the
# import path so that the `pyspark` imports below can be resolved.
for bundled_archive in ("py4j-0.9-src.zip", "pyspark.zip"):
    sys.path.append(os.path.join(SPARK_HOME, "python", "lib", bundled_archive))

# Export the Spark location and the submit arguments; the --packages entry
# pulls in spark-avro so Avro files can be loaded directly.
os.environ.update({
    "SPARK_HOME": SPARK_HOME,
    "PYSPARK_SUBMIT_ARGS": "--packages com.databricks:spark-avro_2.10:2.0.1 pyspark-shell",
})

# These imports must stay below the sys.path changes above.
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SQLContext

# Build the configuration step by step: master "local[*]", app name "My app".
conf = SparkConf()
conf = conf.setMaster("local[*]")
conf = conf.setAppName("My app")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
# Load every Avro file under data/201512 into one DataFrame.
# NOTE(review): only the 201512 directory is read here, while later cells
# filter on periods running through 2016-02-29 -- confirm whether the
# January/February files should be loaded as well.
avro_reader = sqlContext.read.format("com.databricks.spark.avro")
df = avro_reader.load("data/201512/*.avro")

In [3]:
# Print the DataFrame schema to see how the data is structured
df.printSchema()

root
 |-- timestamp_seen: long (nullable = false)
 |-- id: string (nullable = false)
 |-- useruuid: string (nullable = false)
 |-- start_time: string (nullable = false)
 |-- end_time: string (nullable = false)
 |-- name: string (nullable = false)
 |-- area: string (nullable = false)
 |-- country: string (nullable = false)
 |-- region: string (nullable = false)
 |-- latitude: double (nullable = false)
 |-- longitude: double (nullable = false)
 |-- altitude: integer (nullable = false)
 |-- accuracy: integer (nullable = false)
 |-- datum: string (nullable = false)
 |-- devices: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- name: string (nullable = false)
 |    |    |-- type: string (nullable = false)
 |    |    |-- id: string (nullable = false)



In [4]:
# Take a look at the data: display the first 10 rows
df.show(10)

+--------------+--------------------+--------------------+--------------------+--------------------+------------+-----+-------+------+-----------------+------------------+--------+--------+-----+--------------------+
|timestamp_seen|                  id|            useruuid|          start_time|            end_time|        name| area|country|region|         latitude|         longitude|altitude|accuracy|datum|             devices|
+--------------+--------------------+--------------------+--------------------+--------------------+------------+-----+-------+------+-----------------+------------------+--------+--------+-----+--------------------+
| 1448989426233|f1f7caba-c371-428...|ab65c6e5-5ee9-45d...|2015-12-01T18:03:...|2015-12-01T18:03:...|       Malmo|Skane| Sweden|Europe|       55.6032658|        13.0185448|   12000|   31000|WGS84|[[E5823,PHONE,61c...|
| 1448989426517|2b8f652c-cd71-4f6...|4e3d24a8-1a1e-488...|2015-12-01T16:18:...|2015-12-01T16:18:...|          Va|Skane| Sweden|Europ

In [5]:
# Convert the DataFrame to an RDD of rows for the filter/flatMap steps below
data = df.rdd

In [6]:
# Count the rows in the RDD
data.count()

687718

In [7]:
#more imports
from DatabaseHelper import DatabaseHelper
from dateutil import parser

# Helper object that provides the spatial/time binning used further down.
db = DatabaseHelper()

# Start and end timestamps (offset +00:00) of the three monthly periods:
# December 2015, January 2016 and February 2016.
first_period_min_date, first_period_max_date = [
    parser.parse(stamp)
    for stamp in ("2015-12-01 00:00:00+00:00", "2015-12-31 23:59:59+00:00")
]
second_period_min_date, second_period_max_date = [
    parser.parse(stamp)
    for stamp in ("2016-01-01 00:00:00+00:00", "2016-01-31 23:59:59+00:00")
]
third_period_min_date, third_period_max_date = [
    parser.parse(stamp)
    for stamp in ("2016-02-01 00:00:00+00:00", "2016-02-29 23:59:59+00:00")
]


In [8]:
# Filtering: keep Swedish rows that fall inside the overall analysis window.
def _is_in_sweden(row):
    """Return True when the row's country field contains "Sweden"."""
    return "Sweden" in row["country"]


def _is_within_analysis_window(row):
    """Return True when the row starts at or after the first period's start
    and ends at or before the third period's end."""
    # Check the start first; end_time is only parsed when the start passes.
    if parser.parse(row["start_time"]) < first_period_min_date:
        return False
    return parser.parse(row["end_time"]) <= third_period_max_date


swe_data = data.filter(_is_in_sweden).filter(_is_within_analysis_window)

# convert to ((spatial, time), [useruuid]) rows
def convert_time_and_spatial(row):
    """Expand one record into ((spatial_bin, time_bin), [useruuid]) pairs.

    One pair is produced for every time bin that ``db.calculate_time_bins``
    yields for the record's start_time/end_time. All pairs of a record share
    the spatial bin that ``db.calculate_spatial_bin`` returns for the record's
    longitude/latitude.

    Args:
        row: mapping-like record providing "longitude", "latitude",
            "start_time", "end_time" and "useruuid".

    Returns:
        A list of ``((spatial_bin, time_bin), [useruuid])`` tuples, empty when
        no time bins are produced.
    """
    # The spatial bin depends only on the row, so compute it once instead of
    # once per time bin.
    spatial_bin = db.calculate_spatial_bin(row["longitude"], row["latitude"])
    user = row["useruuid"]
    # Build a fresh single-element list for each pair so values never alias.
    return [((spatial_bin, time_bin), [user])
            for time_bin in db.calculate_time_bins(row["start_time"], row["end_time"])]

def _merge_unique_users(left, right):
    """Merge two user lists for the same key, keeping each user once.

    reduceByKey may hand this function lists that are themselves the result
    of earlier merges, so every element of ``right`` has to be checked, not
    only the first one.

    Args:
        left: list of user ids collected so far.
        right: list of user ids to fold into ``left``.

    Returns:
        A new list with the elements of ``left`` followed by those elements
        of ``right`` that were not already present.
    """
    seen = set(left)
    merged = list(left)
    for user in right:
        if user not in seen:
            seen.add(user)
            merged.append(user)
    return merged


swe_data = swe_data.flatMap(convert_time_and_spatial)
# remove duplicates
swe_data = swe_data.reduceByKey(_merge_unique_users)

In [9]:
# Inspect the first 10 ((spatial_bin, time_bin), [useruuid, ...]) records
swe_data.take(10)

[((26227453209, 3165), ['dc40af5f-8da4-47c6-932b-3ec87264d2a4']),
 ((26209993017, 3443), ['fd90f2c4-1284-4b8f-b5f6-df9c67e2d653']),
 ((26295672690, 3178), ['a0e91405-da1d-4cb5-9a80-b48a14989383']),
 ((26241493238, 3384), ['a221e5b4-ce58-49ba-9385-6b544cc8a5ae']),
 ((26211433141, 2969), ['7b14a8b4-2ad3-428c-9ad8-78cfc08acebb']),
 ((26199733000, 3420), ['64975a94-e4e7-4eea-b174-0a54185acd74']),
 ((26884097946, 3212), ['2e427198-5b42-4d1b-b7fe-76972e8c0dbc']),
 ((26307373766, 3230), ['761e6458-78f8-43ca-b941-8cfd36d3ef61']),
 ((26213953222, 2868), ['b3be75b8-6c9a-40d0-a9f9-fb74848fb7aa']),
 ((26209093023, 2947), ['ee933feb-fa30-4ca2-89c1-5d40a74ee345'])]

In [12]:
# Time-bin boundaries of the three monthly periods.
# NOTE(review): elsewhere calculate_time_bins is called with (start, end) and
# its result is iterated; the one-argument form used here is assumed to return
# a single comparable bin index -- TODO confirm against DatabaseHelper.
first_period_min_bin = db.calculate_time_bins("2015-12-01 00:00:00+00:00")
first_period_max_bin = db.calculate_time_bins("2015-12-31 23:59:59+00:00")
second_period_min_bin = db.calculate_time_bins("2016-01-01 00:00:00+00:00")
second_period_max_bin = db.calculate_time_bins("2016-01-31 23:59:59+00:00")
third_period_min_bin = db.calculate_time_bins("2016-02-01 00:00:00+00:00")
third_period_max_bin = db.calculate_time_bins("2016-02-29 23:59:59+00:00")


def _select_period(rows, min_bin, max_bin):
    """Keep the rows whose time bin lies in [min_bin, max_bin].

    Args:
        rows: RDD of ((spatial_bin, time_bin), users) records.
        min_bin: first time bin of the period.
        max_bin: last time bin of the period; included because it is derived
            from the period's final second (23:59:59).

    Returns:
        The filtered RDD.
    """
    # Keys are (spatial_bin, time_bin): the time bin is row[0][1], not row[0][0].
    return rows.filter(lambda row: min_bin <= row[0][1] <= max_bin)


# One RDD per period, each filtered with that period's own bin boundaries.
period_1_data = _select_period(swe_data, first_period_min_bin, first_period_max_bin)
period_2_data = _select_period(swe_data, second_period_min_bin, second_period_max_bin)
period_3_data = _select_period(swe_data, third_period_min_bin, third_period_max_bin)