In [5]:
# %%bash
# pip install --user s3fs==0.5.2

# Initialise Spark

In [2]:
from pyspark.sql import SparkSession

# Set up the Spark session: name the application and request two executors,
# then reuse an already-running session if one exists or create a new one.
session_builder = SparkSession.builder
session_builder = session_builder.config("spark.app.name", "spark-pi")
session_builder = session_builder.config("spark.executor.instances", "2")
spark = session_builder.getOrCreate()

### Test 1: estimate pi with a Monte Carlo simulation

In [5]:
import random

# Number of random points drawn for the Monte Carlo estimate (one hundred million).
NUM_SAMPLES = 100_000_000

def inside(p):
    """Run one Monte Carlo trial and report whether it landed in the unit circle.

    Two values are drawn with ``random.random()`` and treated as the
    coordinates of a point; the point counts as a hit when its squared
    distance from the origin is strictly less than 1.

    Args:
        p: The element being filtered. It is ignored; it only exists so the
            function can be used as a predicate over a sequence of samples.

    Returns:
        bool: True if the sampled point satisfies ``x*x + y*y < 1``.
    """
    x = random.random()
    y = random.random()
    squared_radius = x * x + y * y
    return squared_radius < 1

# Distribute NUM_SAMPLES dummy elements across the cluster, keep only the
# trials that land inside the unit circle, and count them.
sc = spark.sparkContext
samples = sc.parallelize(range(NUM_SAMPLES))
count = samples.filter(inside).count()

# The hit ratio approximates pi / 4 (quarter circle over unit square).
pi_estimate = 4.0 * count / NUM_SAMPLES
print("Pi is roughly {}".format(pi_estimate))

Pi is roughly 3.14154516


### Test 2: read a local CSV with pandas and convert it to a PySpark DataFrame

In [12]:
import pandas as pd
import os

# Read the CSV on the driver with pandas first, then convert the resulting
# pandas frame into a Spark DataFrame and print its leading rows.
iris_csv_path = "data_csv/iris.csv"
pdf = pd.read_csv(filepath_or_buffer=iris_csv_path)
df = spark.createDataFrame(data=pdf)
df.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

### Test 3: read a CSV from S3 into Spark

In [3]:
# Read the CSV object(s) under the S3 prefix directly with Spark, treating the
# first line of each file as the header row, then print the leading rows.
csv_reader = spark.read.option("header", True)
df2 = csv_reader.csv("s3a://bdrk-uob-workstream2-sandbox-raw-data/iris/")
df2.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

### Test 4: read CSV via SparkFiles (Not feasible on K8s Spark clusters)

In [11]:
# df3 = spark.read.csv("data_csv/iris.csv", header=True)

In [19]:
# from pyspark import SparkFiles
# sc = spark.sparkContext
# sc.addFile("data_csv/iris.csv")

# print(SparkFiles.get("iris.csv"))

In [9]:
# df3 = spark.read.csv(SparkFiles.get("iris.csv"), header=True)
# df3 = spark.read.csv("file://" + SparkFiles.get("iris.csv"), header=True)
# df3 = spark.read.csv("file:///" + SparkFiles.get("iris.csv"), header=True)

In [18]:
# df3 = spark.read.csv("local:///" + SparkFiles.get("iris.csv"), header=True)
# df3 = spark.read.csv("local:///notebooks/repo/data_csv/iris.csv")
# df3 = spark.read.format("csv").load("file:///notebooks/repo/data_csv/iris.csv")