In [1]:
import sys
import os

In [2]:
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import numpy as np

In [5]:
spark = SparkSession.builder.config("spark.sql.warehouse.dir", "temp").appName("SparkSQL").getOrCreate()

In [7]:
rdd = spark.sparkContext.textFile("data/fakefriends.csv")

In [8]:
rdd.take(5)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21']

In [9]:
def map_data(line):
    fields = line.split(',')
    return Row(ID=int(fields[0]),
               name=str(fields[1].encode("utf-8")),
               age=int(fields[2]),
               numFriends=int(fields[3]))

In [10]:
data = rdd.map(map_data)

In [11]:
data.take(5)

[Row(ID=0, age=33, name="b'Will'", numFriends=385),
 Row(ID=1, age=26, name="b'Jean-Luc'", numFriends=2),
 Row(ID=2, age=55, name="b'Hugh'", numFriends=221),
 Row(ID=3, age=40, name="b'Deanna'", numFriends=465),
 Row(ID=4, age=68, name="b'Quark'", numFriends=21)]

In [12]:
df = spark.createDataFrame(data)

In [13]:
df.show()

+---+---+-----------+----------+
| ID|age|       name|numFriends|
+---+---+-----------+----------+
|  0| 33|    b'Will'|       385|
|  1| 26|b'Jean-Luc'|         2|
|  2| 55|    b'Hugh'|       221|
|  3| 40|  b'Deanna'|       465|
|  4| 68|   b'Quark'|        21|
|  5| 59|  b'Weyoun'|       318|
|  6| 37|  b'Gowron'|       220|
|  7| 54|    b'Will'|       307|
|  8| 38|  b'Jadzia'|       380|
|  9| 27|    b'Hugh'|       181|
| 10| 53|     b'Odo'|       191|
| 11| 57|     b'Ben'|       372|
| 12| 54|   b'Keiko'|       253|
| 13| 56|b'Jean-Luc'|       444|
| 14| 43|    b'Hugh'|        49|
| 15| 36|     b'Rom'|        49|
| 16| 22|  b'Weyoun'|       323|
| 17| 35|     b'Odo'|        13|
| 18| 45|b'Jean-Luc'|       455|
| 19| 60|  b'Geordi'|       246|
+---+---+-----------+----------+
only showing top 20 rows



In [14]:
df.createOrReplaceTempView("people")

In [15]:
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

In [16]:
teenagers.show()

+---+---+----------+----------+
| ID|age|      name|numFriends|
+---+---+----------+----------+
| 21| 19|  b'Miles'|       268|
| 52| 19|b'Beverly'|       269|
| 54| 19|  b'Brunt'|         5|
|106| 18|b'Beverly'|       499|
|115| 18|  b'Dukat'|       397|
|133| 19|  b'Quark'|       265|
|136| 19|   b'Will'|       335|
|225| 19|   b'Elim'|       106|
|304| 19|   b'Will'|       404|
|341| 18|   b'Data'|       326|
|366| 19|  b'Keiko'|       119|
|373| 19|  b'Quark'|       272|
|377| 18|b'Beverly'|       418|
|404| 18| b'Kasidy'|        24|
|409| 19|    b'Nog'|       267|
|439| 18|   b'Data'|       417|
|444| 18|  b'Keiko'|       472|
|492| 19|  b'Dukat'|        36|
|494| 18| b'Kasidy'|       194|
+---+---+----------+----------+



In [18]:
df.groupBy("age").count().orderBy("age").show()

+---+-----+
|age|count|
+---+-----+
| 18|    8|
| 19|   11|
| 20|    5|
| 21|    8|
| 22|    7|
| 23|   10|
| 24|    5|
| 25|   11|
| 26|   17|
| 27|    8|
| 28|   10|
| 29|   12|
| 30|   11|
| 31|    8|
| 32|   11|
| 33|   12|
| 34|    6|
| 35|    8|
| 36|   10|
| 37|    9|
+---+-----+
only showing top 20 rows



# Friends By Age

In [23]:
lines = spark.sparkContext.textFile("data/fakefriends.csv")

In [24]:
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

In [25]:
rdd = lines.map(parseLine)

In [26]:
rdd.take(5)

[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]

In [31]:
map_data = rdd.mapValues(lambda x: (x, 1))

In [32]:
map_data.take(5)

[(33, (385, 1)), (26, (2, 1)), (55, (221, 1)), (40, (465, 1)), (68, (21, 1))]

In [33]:
totals_by_age = map_data.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

In [34]:
totals_by_age.take(5)

[(26, (4115, 17)),
 (40, (4264, 17)),
 (68, (2696, 10)),
 (54, (3615, 13)),
 (38, (2903, 15))]

In [35]:
average_by_age = totals_by_age.mapValues(lambda x: x[0] / x[1])

In [36]:
average_by_age.take(5)

[(26, 242.05882352941177),
 (40, 250.8235294117647),
 (68, 269.6),
 (54, 278.0769230769231),
 (38, 193.53333333333333)]