In [1]:
import sys
import os

In [2]:
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import numpy as np

In [5]:
spark = SparkSession.builder.config("spark.sql.warehouse.dir", "temp").appName("SparkSQL").getOrCreate()

In [6]:
rdd = spark.sparkContext.textFile("data/fakefriends.csv")

In [7]:
rdd.take(5)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21']

In [8]:
def map_data(line):
    fields = line.split(',')
    return Row(ID=int(fields[0]),
               name=str(fields[1].encode("utf-8")),
               age=int(fields[2]),
               numFriends=int(fields[3]))

In [9]:
data = rdd.map(map_data)

In [10]:
data.take(5)

[Row(ID=0, name="b'Will'", age=33, numFriends=385),
 Row(ID=1, name="b'Jean-Luc'", age=26, numFriends=2),
 Row(ID=2, name="b'Hugh'", age=55, numFriends=221),
 Row(ID=3, name="b'Deanna'", age=40, numFriends=465),
 Row(ID=4, name="b'Quark'", age=68, numFriends=21)]

In [11]:
df = spark.createDataFrame(data)

In [12]:
df.show()

+---+-----------+---+----------+
| ID|       name|age|numFriends|
+---+-----------+---+----------+
|  0|    b'Will'| 33|       385|
|  1|b'Jean-Luc'| 26|         2|
|  2|    b'Hugh'| 55|       221|
|  3|  b'Deanna'| 40|       465|
|  4|   b'Quark'| 68|        21|
|  5|  b'Weyoun'| 59|       318|
|  6|  b'Gowron'| 37|       220|
|  7|    b'Will'| 54|       307|
|  8|  b'Jadzia'| 38|       380|
|  9|    b'Hugh'| 27|       181|
| 10|     b'Odo'| 53|       191|
| 11|     b'Ben'| 57|       372|
| 12|   b'Keiko'| 54|       253|
| 13|b'Jean-Luc'| 56|       444|
| 14|    b'Hugh'| 43|        49|
| 15|     b'Rom'| 36|        49|
| 16|  b'Weyoun'| 22|       323|
| 17|     b'Odo'| 35|        13|
| 18|b'Jean-Luc'| 45|       455|
| 19|  b'Geordi'| 60|       246|
+---+-----------+---+----------+
only showing top 20 rows



In [13]:
df.createOrReplaceTempView("people")

In [14]:
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

In [15]:
teenagers.show()

+---+----------+---+----------+
| ID|      name|age|numFriends|
+---+----------+---+----------+
| 21|  b'Miles'| 19|       268|
| 52|b'Beverly'| 19|       269|
| 54|  b'Brunt'| 19|         5|
|106|b'Beverly'| 18|       499|
|115|  b'Dukat'| 18|       397|
|133|  b'Quark'| 19|       265|
|136|   b'Will'| 19|       335|
|225|   b'Elim'| 19|       106|
|304|   b'Will'| 19|       404|
|341|   b'Data'| 18|       326|
|366|  b'Keiko'| 19|       119|
|373|  b'Quark'| 19|       272|
|377|b'Beverly'| 18|       418|
|404| b'Kasidy'| 18|        24|
|409|    b'Nog'| 19|       267|
|439|   b'Data'| 18|       417|
|444|  b'Keiko'| 18|       472|
|492|  b'Dukat'| 19|        36|
|494| b'Kasidy'| 18|       194|
+---+----------+---+----------+



In [16]:
df.groupBy("age").count().orderBy("age").show()

+---+-----+
|age|count|
+---+-----+
| 18|    8|
| 19|   11|
| 20|    5|
| 21|    8|
| 22|    7|
| 23|   10|
| 24|    5|
| 25|   11|
| 26|   17|
| 27|    8|
| 28|   10|
| 29|   12|
| 30|   11|
| 31|    8|
| 32|   11|
| 33|   12|
| 34|    6|
| 35|    8|
| 36|   10|
| 37|    9|
+---+-----+
only showing top 20 rows



# Friends By Age

In [17]:
lines = spark.sparkContext.textFile("data/fakefriends.csv")

In [18]:
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

In [19]:
rdd = lines.map(parseLine)

In [20]:
rdd.take(10)

[(33, 385),
 (26, 2),
 (55, 221),
 (40, 465),
 (68, 21),
 (59, 318),
 (37, 220),
 (54, 307),
 (38, 380),
 (27, 181)]

In [21]:
map_data = rdd.mapValues(lambda x: (x, 1))

In [22]:
map_data.take(10)

[(33, (385, 1)),
 (26, (2, 1)),
 (55, (221, 1)),
 (40, (465, 1)),
 (68, (21, 1)),
 (59, (318, 1)),
 (37, (220, 1)),
 (54, (307, 1)),
 (38, (380, 1)),
 (27, (181, 1))]

In [23]:
totals_by_age = map_data.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

In [24]:
totals_by_age.take(10)

[(26, (4115, 17)),
 (40, (4264, 17)),
 (68, (2696, 10)),
 (54, (3615, 13)),
 (38, (2903, 15)),
 (56, (1840, 6)),
 (36, (2466, 10)),
 (22, (1445, 7)),
 (60, (1419, 7)),
 (30, (2594, 11))]

In [25]:
average_by_age = totals_by_age.mapValues(lambda x: x[0] / x[1])

In [26]:
average_by_age.take(5)

[(26, 242.05882352941177),
 (40, 250.8235294117647),
 (68, 269.6),
 (54, 278.0769230769231),
 (38, 193.53333333333333)]