In [1]:
# Optional: uncomment the next two lines if pyspark cannot be imported directly
# from this kernel (findspark is then used to locate the Spark installation).
#import findspark
#findspark.init()

from os.path import abspath

from pyspark.sql import SparkSession

# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')

# Build (or reuse, via getOrCreate) a local session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ISM6562 PySpark Tutorials")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# Grab the SparkContext object. It is the entry point to the RDD API and is
# created together with the SparkSession.
sc = spark.sparkContext

# Note: if other Spark sessions are still running (for example from a notebook
# you ran earlier), this session's web UI will be on a different port than the
# default (4040). The line below identifies that port: 4040 means this is the
# only session running; a higher number means other Spark sessions are still alive.
spark_session_port = sc.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)


23/10/26 15:13:29 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.5.100 instead (on interface eth0)
23/10/26 15:13:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 15:13:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


In [2]:
# Echo the SparkSession as the cell's last expression so the notebook renders its summary.
spark

In [3]:
def parseLine(line):
    """Turn one CSV text line into an ``(age, numFriends)`` pair.

    The line is parsed with the standard ``csv`` reader rather than a plain
    ``split(',')`` so that a quoted field containing a comma (for example a
    name written as ``"Last, First"``) does not shift the column positions.

    Args:
        line: A single comma-separated record; the age is read from column
            index 2 and the friend count from column index 3.

    Returns:
        A tuple ``(age, numFriends)`` of two ints.

    Raises:
        IndexError: If the line has fewer than four columns.
        ValueError: If either of the two columns is not an integer.
    """
    # Local import keeps the function self-contained wherever it is executed.
    import csv

    # csv.reader expects an iterable of lines; wrap the single line and take its only row.
    fields = next(csv.reader([line]))
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

In [4]:
# Load the raw text file as an RDD of lines, then parse each line into an (age, numFriends) pair.
lines = sc.textFile("data/fakefriends.csv") # relative path: the file must be in a "data" folder (presumably next to the notebook / in its working directory)
rdd = lines.map(parseLine)

In [5]:
# Pair every friend count with a 1 so that sums and counts can be accumulated
# together later: (age, numFriends) -> (age, (numFriends, 1))
age_numFriends_1 = rdd.mapValues(lambda x: (x, 1))
# take(10) brings back only the preview rows instead of collecting the whole RDD to the driver
results = age_numFriends_1.take(10)
results # print the first 10 rows

                                                                                

[(33, (385, 1)),
 (26, (2, 1)),
 (55, (221, 1)),
 (40, (465, 1)),
 (68, (21, 1)),
 (59, (318, 1)),
 (37, (220, 1)),
 (54, (307, 1)),
 (38, (380, 1)),
 (27, (181, 1))]

In [6]:
# Add the (numFriends, 1) pairs element-wise per age, giving
# (age, (total friends, number of people of that age)).
totalsByAge = age_numFriends_1.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
# take(10) brings back only the preview rows instead of collecting the whole RDD to the driver
results = totalsByAge.take(10)
results # print the first 10 rows

[(26, (4115, 17)),
 (40, (4264, 17)),
 (68, (2696, 10)),
 (54, (3615, 13)),
 (38, (2903, 15)),
 (56, (1840, 6)),
 (36, (2466, 10)),
 (22, (1445, 7)),
 (60, (1419, 7)),
 (30, (2594, 11))]

In [7]:
# Divide each age's friend total by its head count to get the average number of friends per age.
averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
# take(10) brings back only the preview rows instead of collecting the whole RDD to the driver
results = averagesByAge.take(10)
results # print the first 10 rows

[(26, 242.05882352941177),
 (40, 250.8235294117647),
 (68, 269.6),
 (54, 278.0769230769231),
 (38, 193.53333333333333),
 (56, 306.6666666666667),
 (36, 246.6),
 (22, 206.42857142857142),
 (60, 202.71428571428572),
 (30, 235.8181818181818)]

In [8]:
# Shut the session down now that the analysis is finished, so it does not linger
# alongside sessions started by other notebooks.
spark.stop()