### Import PySpark
Nogle få kommentarer:
 - Download Spark pre-built for Hadoop 2.6. Jeg vil også anbefale jer at bruge Spark 1.6.0, da der er nogle problemer med 1.6.1 ([hent den her](http://www.apache.org/dyn/closer.lua/spark/spark-1.6.0/spark-1.6.0-bin-hadoop2.6.tgz)).
 - husk at ændre paths i denne notebook
 - `os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages com.databricks:spark-avro_2.10:2.0.1 pyspark-shell"` vil give jer mulighed for direkte at loade avro filer

In [None]:
import sys
import os
import os.path

# Location of the unpacked Spark distribution -- change this to match your machine.
SPARK_HOME = "spark-1.6.0-bin-hadoop2.6/"

# Make the py4j and pyspark archives shipped with the distribution importable.
sys.path.extend(
    os.path.join(SPARK_HOME, "python", "lib", archive)
    for archive in ("py4j-0.9-src.zip", "pyspark.zip")
)

# Environment read by PySpark on start-up: where Spark lives, which extra
# package to fetch (spark-avro, for loading Avro files) and which interpreter
# to use.
os.environ.update({
    "SPARK_HOME": SPARK_HOME,
    "PYSPARK_SUBMIT_ARGS": "--packages com.databricks:spark-avro_2.10:2.0.1 pyspark-shell",
    "PYSPARK_PYTHON": "/usr/bin/python3",
})


from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SQLContext

# Run Spark locally on every available core under the application name "My app".
conf = SparkConf().setMaster("local[*]").setAppName("My app")

# Entry points used by the rest of the notebook: the core context and the
# SQL context built on top of it.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [None]:
# Read every Avro file under data/201512/ into one DataFrame using the
# spark-avro data source.
df = sqlContext.read.format("com.databricks.spark.avro").load("data/201512/*.avro")

In [None]:
# Print the DataFrame schema to see how the data is structured.
df.printSchema()

In [None]:
# Take a look at the data: display the first 10 rows.
df.show(10)

In [None]:
# Convert the DataFrame to an RDD of rows for the transformations below.
data = df.rdd

In [None]:
# Number of rows in the RDD.
data.count()

In [None]:
#more imports
from DatabaseHelper import DatabaseHelper
from dateutil import parser

# Helper that provides the spatial/time binning used further down.
db = DatabaseHelper()

# Inclusive (min, max) timestamps, at +00:00 offset, for the three periods.
first_period_min_date, first_period_max_date = (
    parser.parse("2015-12-01 00:00:00+00:00"),
    parser.parse("2015-12-31 23:59:59+00:00"),
)
second_period_min_date, second_period_max_date = (
    parser.parse("2016-01-01 00:00:00+00:00"),
    parser.parse("2016-01-31 23:59:59+00:00"),
)
third_period_min_date, third_period_max_date = (
    parser.parse("2016-02-01 00:00:00+00:00"),
    parser.parse("2016-02-29 23:59:59+00:00"),
)


In [None]:
# Keep only rows whose country mentions Sweden and whose interval lies between
# the start of the first period and the end of the third period.
swe_data = data.filter(
    lambda row: "Sweden" in row["country"]
    and first_period_min_date <= parser.parse(row["start_time"])
    and parser.parse(row["end_time"]) <= third_period_max_date
)

# convert to ((spatial, time), [useruuid]) rows
def convert_time_and_spatial(row):
    """Expand one location row into one keyed record per time bin it covers.

    Args:
        row: Record exposing ``longitude``, ``latitude``, ``start_time``,
            ``end_time`` and ``useruuid`` by key.

    Returns:
        A list of ``((spatial_bin, time_bin), [useruuid])`` pairs, one for each
        time bin returned by ``db.calculate_time_bins`` for the row's interval.
    """
    # The spatial bin depends only on the coordinates, so compute it once
    # instead of once per time bin.
    spatial_bin = db.calculate_spatial_bin(row["longitude"], row["latitude"])
    time_bins = db.calculate_time_bins(row["start_time"], row["end_time"])
    return [((spatial_bin, time_bin), [row["useruuid"]]) for time_bin in time_bins]


binned_swe_data = swe_data.flatMap(convert_time_and_spatial)


def _merge_unique_users(left, right):
    """Concatenate two user lists, keeping only the first occurrence of each user.

    reduceByKey may hand over partially merged lists of any length on either
    side, so every element of ``right`` is checked, not just ``right[0]``.
    """
    merged = list(left)
    seen = set(left)
    for user in right:
        if user not in seen:
            seen.add(user)
            merged.append(user)
    return merged


# remove duplicates and reduce to (spatial_bin, time_bin) -> [users]
bins_to_users = binned_swe_data.reduceByKey(_merge_unique_users)

In [None]:
# divide into periods
first_period_min_bin = db.calculate_time_bins("2015-12-01 00:00:00+00:00")[0]
first_period_max_bin = db.calculate_time_bins("2015-12-09 23:59:59+00:00")[0]
second_period_min_bin = db.calculate_time_bins("2015-12-10 00:00:00+00:00")[0]
second_period_max_bin = db.calculate_time_bins("2015-12-19 23:59:59+00:00")[0]
third_period_min_bin = db.calculate_time_bins("2015-12-20 00:00:00+00:00")[0]
third_period_max_bin = db.calculate_time_bins("2015-12-31 23:59:59+00:00")[0]


def _merge_unique_bins(left, right):
    """Concatenate two bin lists, keeping only the first occurrence of each bin.

    Both sides may be partially merged lists, so every element of ``right`` is
    checked rather than only ``right[0]``.
    """
    merged = list(left)
    seen = set(left)
    for bin_key in right:
        if bin_key not in seen:
            seen.add(bin_key)
            merged.append(bin_key)
    return merged


def _restrict_to_period(rdd, min_bin, max_bin):
    """Keep the rows whose time bin satisfies ``min_bin <= time_bin < max_bin``."""
    # NOTE(review): the upper bound is exclusive, so rows falling in max_bin
    # itself are dropped -- confirm this is intended.
    return rdd.filter(lambda row: min_bin <= row[0][1] < max_bin)


def _invert_to_users(period_rdd):
    """Turn (spatial_bin, time_bin) -> [users] into user -> [(spatial_bin, time_bin)].

    Every user of a bin gets an entry, not only the first one in the list.
    """
    return (period_rdd
            .flatMap(lambda row: [(user, [row[0]]) for user in row[1]])
            .reduceByKey(_merge_unique_bins))


# remove duplicates and create (users) -> [(spatial_bin, time_bin)]
period_1_bins_to_users = _restrict_to_period(bins_to_users, first_period_min_bin, first_period_max_bin)
period_1_users_to_bins = _invert_to_users(period_1_bins_to_users)
period_2_bins_to_users = _restrict_to_period(bins_to_users, second_period_min_bin, second_period_max_bin)
period_2_users_to_bins = _invert_to_users(period_2_bins_to_users)
period_3_bins_to_users = _restrict_to_period(bins_to_users, third_period_min_bin, third_period_max_bin)
period_3_users_to_bins = _invert_to_users(period_3_bins_to_users)

In [None]:
# Number of (spatial_bin, time_bin) rows in each period. The per-period RDDs
# are named period_N_bins_to_users where they are created.
print(period_1_bins_to_users.count())
print(period_2_bins_to_users.count())
print(period_3_bins_to_users.count())

In [None]:
from itertools import combinations
def generate_cooccurrences(row):
    """Emit one record per unordered pair of entries in ``row[1]``.

    Args:
        row: ``(key, members)`` pair; ``members`` is a sequence of mutually
            orderable items.

    Returns:
        A list of ``(pair, [key])`` records where ``pair`` is the sorted
        2-tuple of members, in the order ``itertools.combinations`` yields them.
    """
    key = row[0]
    records = []
    for first, second in combinations(row[1], 2):
        ordered_pair = tuple(sorted((first, second)))
        records.append((ordered_pair, [key]))
    return records
    
# Per period: user pair -> list of (spatial_bin, time_bin) keys in which both
# users appear. generate_cooccurrences expects (bin, [users]) rows, which is
# the shape of the period_N_bins_to_users RDDs.
coocs_1 = period_1_bins_to_users.flatMap(generate_cooccurrences).reduceByKey(lambda a, b: a + b)
coocs_2 = period_2_bins_to_users.flatMap(generate_cooccurrences).reduceByKey(lambda a, b: a + b)
coocs_3 = period_3_bins_to_users.flatMap(generate_cooccurrences).reduceByKey(lambda a, b: a + b)
                                                                    

In [None]:
# Inspect the first five co-occurrence rows of period 1.
coocs_1.take(5)

In [None]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.tree import RandomForest, RandomForestModel
import numpy as np

# Keys (user pairs) that co-occur in periods 2 and 3, collected to the driver.
# They are only used for membership tests when labelling rows, so keep them as
# sets to get constant-time lookups instead of a list scan per row.
y_1_users = set(coocs_2.map(lambda row: row[0]).collect())
y_2_users = set(coocs_3.map(lambda row: row[0]).collect())

def compute_weighted_frequency(row, location_data):
    """Unfinished helper that accumulates a per-spatial-bin weighted count.

    For every distinct first element of the entries in ``row[1]`` it adds
    ``count * np.exp(...)`` to a running total. As written it cannot run to
    completion (see the review notes below) and has no return statement.

    NOTE(review): presumably ``row`` is ``(user_pair, [(spatial_bin, time_bin), ...])``
    and ``location_data`` is an RDD keyed by ``(spatial_bin, time_bin)`` -- confirm.
    """
    spatial_bins = [r[0] for r in row[1]]
    wf_value = 0
    for sb in set(spatial_bins):
        # NOTE(review): the filter lambda's parameter is `row` but its body
        # reads `r`; under Python 3 the comprehension variable above does not
        # leak, so this looks like a NameError unless a global `r` exists.
        # NOTE(review): `foreach()` is called without the function it requires,
        # and `users` is never used afterwards.
        users = location_data.filter(lambda row: r[0][0] == sb).flatMap(lambda row: row[1]).foreach()
        # NOTE(review): `np.exp()` is called without an argument, which raises
        # a TypeError; the intended exponent is not visible here.
        wf_value += spatial_bins.count(sb)*np.exp()
    # NOTE(review): wf_value is never returned, so callers receive None.

def compute_features(y, row, location_data):
    """Build the labelled feature vector for one co-occurrence row.

    Args:
        y: Class label stored on the returned ``LabeledPoint``.
        row: ``(key, cooccurrences)`` pair; the first element of every
            co-occurrence is treated as its spatial bin.
        location_data: Accepted for backward compatibility and currently
            ignored. It was only forwarded to ``compute_weighted_frequency``,
            whose result never entered the feature vector.

    Returns:
        ``LabeledPoint(y, [num_coocs, num_unique_coocs, diversity])`` where
        ``diversity`` is the base-2 Shannon entropy of the spatial-bin
        distribution of the co-occurrences.
    """
    from collections import Counter  # local import: keeps this edit self-contained

    cooccurrences = row[1]
    num_coocs = len(cooccurrences)

    # One pass over the co-occurrences instead of list.count() per distinct bin.
    visits_per_bin = Counter(r[0] for r in cooccurrences)
    num_unique_coocs = len(visits_per_bin)

    total = float(num_coocs)
    diversity = -np.sum([n / total * np.log2(n / total) for n in visits_per_bin.values()])

    # TODO: add the weighted-frequency feature once compute_weighted_frequency
    # is finished; it must not touch an RDD from inside this function because
    # this code runs inside a Spark transformation.
    return LabeledPoint(y, [num_coocs, num_unique_coocs, diversity])


def compute_train_features(row):
    """Label a period-1 row by whether its key is in ``y_1_users``, then featurise it."""
    y = 1 if row[0] in y_1_users else 0
    # No RDD is passed along: Spark cannot ship an RDD inside a map closure.
    return compute_features(y, row, None)


def compute_test_features(row):
    """Label a period-2 row by whether its key is in ``y_2_users``, then featurise it."""
    y = 1 if row[0] in y_2_users else 0
    # No RDD is passed along: Spark cannot ship an RDD inside a map closure.
    return compute_features(y, row, None)

# Labelled feature vectors for training: one per row of coocs_1.
X_train_num_coocs = coocs_1.map(compute_train_features)

# Labelled feature vectors for testing: one per row of coocs_2.
X_test_num_coocs = coocs_2.map(compute_test_features)
# more features can be computed with join on different features maybe?
#X.take(1)
#X_train = RowMatrix(X)

In [None]:
# Train a small random-forest classifier on the training feature vectors.
model = RandomForest.trainClassifier(X_train_num_coocs, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

# predict() expects feature vectors, not LabeledPoints, so strip the labels
# first. The earlier call on the LabeledPoint RDD was overwritten right away
# and has been removed.
predictions = model.predict(X_test_num_coocs.map(lambda x: x.features))

# Pair each true label with its prediction and average the squared difference.
labelsAndPredictions = X_test_num_coocs.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(X_test_num_coocs.count())
print("Test Mean Squared Error = {}".format(str(testMSE)))