<a href="https://colab.research.google.com/github/zacSimo/PysparkAdvanced/blob/main/pyspark_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=35bd5f42fd994f7d2e5312f83ae1a8f5245504a454193036c458f6465b07ca73
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully in

In [None]:
import pyspark.sql
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [None]:
sd_df = spark.read.csv('sample_data/california_housing_test.csv', header=True, inferSchema=True)


In [None]:
sd_df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [None]:
sd_df.count()

3000

##### Apache Spark memory

In [None]:
from pyspark.sql.functions import *

In [None]:
sd_broad_df = broadcast(sd_df)

In [None]:
# UDF
def my_func(code: str) -> str:
  """Look up ``code`` in the mapping held by ``bdData`` and return the match.

  NOTE(review): ``bdData`` is not defined in any visible cell; presumably it is
  a broadcast variable wrapping a dict -- confirm it is created before use.
  """
  lookup_table = bdData.value
  return lookup_table.get(code)



In [None]:
import pyspark.sql.functions as f
# Pass the return type as a DDL type string so this cell does not depend on
# StringType having leaked into the namespace through a wildcard import.
spark.udf.register("my_function", my_func, "string")

In [None]:
# see how many cores available for spark engine
spark.sparkContext.defaultParallelism

96

In [None]:
# generate random data: 10 distinct integers drawn from 1..39
# (no seed is set, so the values differ on every run)
import random
randomlist = random.sample(range(1, 40), k=10)

In [None]:
print(randomlist)

[25, 8, 29, 33, 6, 28, 3, 18, 20, 16]


In [None]:
# parallelize and take it into 4 partitions
rdd1 = spark.sparkContext.parallelize(randomlist, 4)

In [None]:
rdd1

ParallelCollectionRDD[17] at readRDDFromFile at PythonRDD.scala:289

In [None]:
# show exactly the same data as created
rdd1.collect()

[25, 8, 29, 33, 6, 28, 3, 18, 20, 16]

In [None]:
# to see the numbers of partitions
rdd1.getNumPartitions()

4

In [None]:
# to see the data in partitions
rdd1.glom().collect()

[[25, 8], [29, 33], [6, 28], [3, 18, 20, 16]]

In [None]:
rdd1.take(rdd1.count()-1)

[25, 8, 29, 33, 6, 28, 3, 18, 20]

In [None]:
rdd1.glom().collect()[3]

[3, 18, 20, 16]

In [None]:
rdd1.top(3)

[33, 29, 28]

In [None]:
rdd_map = rdd1.map(lambda x: x*2)
rdd_map.collect()

[50, 16, 58, 66, 12, 56, 6, 36, 40, 32]

In [None]:
def myfunc(x):
  """Return ``x`` doubled when it is even, tripled otherwise."""
  factor = 2 if x % 2 == 0 else 3
  return x * factor

rdd_map_2 = rdd1.map(myfunc)
rdd_map_2.collect()

[75, 16, 87, 99, 12, 56, 9, 36, 40, 32]

In [None]:
rdd_filter = rdd1.filter(lambda x: x%2 == 0)
rdd_filter.collect()

[8, 6, 28, 18, 20, 16]

In [None]:
# flatmap collect values into a single list
# used when we want aggregation of the result (reduce)
rdd_flatmap = rdd1.flatMap(lambda x: (x, x*10))
rdd_flatmap.collect()

[25,
 250,
 8,
 80,
 29,
 290,
 33,
 330,
 6,
 60,
 28,
 280,
 3,
 30,
 18,
 180,
 20,
 200,
 16,
 160]

In [None]:
rdd_map_3 = rdd1.map(lambda x: [x, x*10])
rdd_map_3.collect()

[[25, 250],
 [8, 80],
 [29, 290],
 [33, 330],
 [6, 60],
 [28, 280],
 [3, 30],
 [18, 180],
 [20, 200],
 [16, 160]]

In [None]:
rdd_map_3.glom().collect()

[[[25, 250], [8, 80]],
 [[29, 290], [33, 330]],
 [[6, 60], [28, 280]],
 [[3, 30], [18, 180], [20, 200], [16, 160]]]

In [None]:
print(rdd1.max())
print(rdd1.min())
print(rdd1.mean())
print(rdd1.sum())
print(rdd1.stdev())

33
3
18.6
186
9.840731680114036


In [None]:
# mapPartitions() applies a function once per partition (not once per element)
def my_partition_func(iterator):
  """Yield a single value: the total of all elements of one partition.

  An empty partition yields 0.
  """
  partition_total = 0
  for element in iterator:
    partition_total += element

  yield partition_total

rdd_map_partition = rdd1.mapPartitions(my_partition_func)
rdd_map_partition.collect()

[33, 62, 34, 57]

In [None]:
# union
rdd2 = spark.sparkContext.parallelize(random.sample(range(1,20), 10),2)
rdd2.collect()


[9, 6, 12, 14, 11, 2, 5, 7, 19, 15]

In [None]:
rdd1.union(rdd2).collect()

[25, 8, 29, 33, 6, 28, 3, 18, 20, 16, 9, 6, 12, 14, 11, 2, 5, 7, 19, 15]

In [None]:
rdd_union = rdd1.union(rdd2)
rdd_union.glom().collect()

[[25, 8],
 [29, 33],
 [6, 28],
 [3, 18, 20, 16],
 [9, 6, 12, 14, 11],
 [2, 5, 7, 19, 15]]

In [None]:
rdd1.union(rdd2).getNumPartitions()

6

In [None]:
rdd1.union(rdd2).glom().collect()

[[25, 8],
 [29, 33],
 [6, 28],
 [3, 18, 20, 16],
 [9, 6, 12, 14, 11],
 [2, 5, 7, 19, 15]]

In [None]:
rdd_inter = rdd1.intersection(rdd2)
rdd_inter.collect()

[6]

In [None]:
rdd_inter.glom().collect()

[[6], [], [], [], [], []]

In [None]:
rdd_inter.coalesce(1).glom().collect()

[[6]]

In [None]:
# takeSample withreplacement, num, [seed]
rdd1.takeSample(False, 5)

[28, 16, 6, 8, 25]

In [None]:
# takeOrdered
rdd1.takeOrdered(5)

[3, 6, 8, 16, 18]

In [None]:
rdd1.takeOrdered(5, key=lambda x: -x)

[33, 29, 28, 25, 20]

In [None]:
rdd1.reduce(lambda x, y: x+y)

186

In [None]:
rdd_rbk = rdd_union.map(lambda x: (x, x*x)).reduceByKey(lambda x,y: x+y)

In [None]:
rdd_rbk.toDF(schema=["key","value"]).show(vertical=False,truncate=False)

+---+-----+
|key|value|
+---+-----+
|6  |72   |
|18 |324  |
|12 |144  |
|25 |625  |
|7  |49   |
|19 |361  |
|8  |64   |
|20 |400  |
|14 |196  |
|2  |4    |
|33 |1089 |
|3  |9    |
|9  |81   |
|15 |225  |
|28 |784  |
|16 |256  |
|29 |841  |
|11 |121  |
|5  |25   |
+---+-----+



In [None]:
# sortByKey()
rdd_rbk.sortByKey().collect()

[(2, 4),
 (3, 9),
 (5, 25),
 (6, 72),
 (7, 49),
 (8, 64),
 (9, 81),
 (11, 121),
 (12, 144),
 (14, 196),
 (15, 225),
 (16, 256),
 (18, 324),
 (19, 361),
 (20, 400),
 (25, 625),
 (28, 784),
 (29, 841),
 (33, 1089)]

In [None]:
rdd_rbk.countByKey()

defaultdict(int,
            {6: 1,
             18: 1,
             12: 1,
             25: 1,
             7: 1,
             19: 1,
             8: 1,
             20: 1,
             14: 1,
             2: 1,
             33: 1,
             3: 1,
             9: 1,
             15: 1,
             28: 1,
             16: 1,
             29: 1,
             11: 1,
             5: 1})

In [None]:
rdd_gbk = rdd_rbk.groupByKey().collect()

In [None]:
for item in rdd_gbk:
  print(item[0], list(item[1]))

6 [72]
18 [324]
12 [144]
25 [625]
7 [49]
19 [361]
8 [64]
20 [400]
14 [196]
2 [4]
33 [1089]
3 [9]
9 [81]
15 [225]
28 [784]
16 [256]
29 [841]
11 [121]
5 [25]


In [None]:
rdd_gby_1 = rdd_rbk.groupByKey()

In [None]:
import os
# PYTHONHASHSEED accepts only "random" or a decimal integer in [0, 4294967295];
# any other value (such as "false") makes a newly started Python interpreter
# abort. A fixed integer gives stable string hashing across processes.
# NOTE: this affects only Python processes started after this assignment.
os.environ["PYTHONHASHSEED"] = "0"

In [None]:

rdd_rbk.sortByKey().lookup(10)

[]

In [None]:
rdd_rbk.cache()

PythonRDD[94] at RDD at PythonRDD.scala:53

In [None]:
  rdd_rbk.is_cached

True

In [None]:
rdd_rbk.unpersist()

PythonRDD[94] at RDD at PythonRDD.scala:53

In [None]:
# DF
spark

In [None]:
df = spark.read.format("csv").option("header", True).option("inferSchema", True).load("dataset_fifa_18/CompleteDataset.csv")

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/dataset_fifa_18/CompleteDataset.csv.

In [None]:
df.printSchema()

In [None]:
df.show(truncate=False)

In [None]:
df.count()

In [None]:
df.rdd.glom().collect()

In [None]:
df.rdd.getNumPartitions()

In [None]:
df2 = df.repartition(4)

In [None]:
df2.rdd.getNumPartitions()

In [None]:
df2.rdd.glom().collect()

In [None]:
df.withColumnRenamed("_c0", "id").show()

In [None]:
df2 = df.withColumnRenamed("_c0", "id")


In [None]:
df2.printSchema()

In [None]:
df2.rdd.getNumPartitions()

In [None]:
df2 = df2.repartition(6)

In [None]:
df2.rdd.getNumPartitions()

In [None]:
df3 = df2.repartition(3, "Nationality")

In [None]:
df3.rdd.glom().collect()

In [None]:
df3.rdd.getNumPartitions()

In [None]:
df3.show()

In [None]:
df3.na.fill({"CAM": 10, "CB": 1}).show()

In [None]:
df3.filter(df3["Nationality"] == "Germany").show()

In [None]:
df3.groupBy("Nationality").count().sort("Nationality").show(50)

In [None]:
df3.groupBy("Nationality", "Age").count().sort("Nationality","Age").show(50)

In [None]:
from pyspark.sql import *

In [None]:
# visualize with pandas
df3_res = (
    df3.where(df3["Overall"] > 70)
    .filter(df3["Nationality"] == "Albania")
    .groupBy("Nationality", "Age")
    .count()
    .sort("Nationality", "Age")
    .toPandas()
)
# Only one nationality is left after the filter, so plot the counts by Age;
# with Nationality on the x axis every bar would carry the same label.
df3_res.plot(kind="bar", x="Age", y="count")


In [None]:
df3_res_2 = df3.where(df3["Overall"]>70).groupBy("Age").count().sort("Age").toPandas()
df3_res_2.plot(kind="bar", x="Age", y="count")


In [None]:
# UDF
def uppercase_lowercase(str):
  """Upper-case values longer than 10 characters, lower-case the rest.

  Args:
    str: the text to convert, or None. (The name shadows the builtin but is
      kept so existing keyword callers keep working.)

  Returns:
    The converted text, or None when the input is None, so null column values
    pass through the UDF instead of raising on ``len(None)``.
  """
  if str is None:
    return None
  return str.upper() if len(str) > 10 else str.lower()

spark.udf.register("uppercase_lowercase", uppercase_lowercase, StringType())

In [None]:
# register df
df2.createOrReplaceTempView("fifa_table")

In [None]:
# df2.select("Name", uppercase_lowercase("Name")).show()
spark.sql("select Name, uppercase_lowercase(Name) as upper_Name from fifa_table").show()

In [None]:
broadcast(df2)

In [None]:
# Calling the plain Python function here would only evaluate
# uppercase_lowercase("Name") -> "name" once on the driver and select that
# column name; go through the UDF registered with Spark SQL so the conversion
# is applied to every row's value.
df2.selectExpr("Name", "uppercase_lowercase(Name) AS upper_Name").show()

In [None]:
df = spark.read.format("json").load('arxiv-metadata-oai-snapshot.json')

In [None]:
df.printSchema()

In [None]:
df.count()

In [None]:
df.rdd.getNumPartitions()

In [None]:
# Report only the number of rows in each partition; collecting glom() itself
# would pull every record of the dataset into the driver and print it.
df.rdd.glom().map(len).collect()

In [None]:
df.show()

In [None]:
df.cache()

In [None]:
spark.sparkContext.defaultParallelism

In [None]:
df.count()

In [None]:
import json

In [None]:
rdd_json = spark.sparkContext.textFile('arxiv-metadata-oai-snapshot.json')

In [None]:
rdd = rdd_json.map(lambda x: json.loads(x))

In [None]:
rdd.getNumPartitions()

In [None]:
spark.sparkContext.defaultParallelism

In [None]:
rdd.persist()

In [None]:
rdd.take(2)

In [None]:
rdd.count()

In [None]:
rdd.flatMap(lambda x: x.keys()).distinct().collect()

In [None]:
rdd.map(lambda x: x['license']).distinct().collect()

In [None]:
# min title
min_title = rdd.map(lambda x: x['title']).distinct().reduce(lambda x,y: x if (len(x)<len(y)) else y)
print(min_title)

In [None]:
# max title
max_title = rdd.map(lambda x: x['title']).distinct().reduce(lambda x,y: x if (len(x)>len(y)) else y)
print(max_title)

In [None]:
import re

In [None]:
# Find parenthesised abbreviations of 5 or more characters in the "abstract" column.
# The first character must be a word character; the following ones may be anything
# except: _ ! space / \ : ; & # , ~ ? $ { } < >
# One \w plus at least four more characters gives the 5-character minimum.
regx = r"\((\w[^_! /\\:;&#,~?${}<>]{4,})\)"

In [None]:
print(re.search(regx, "of the Large Hadron\nCollider (LHCOPS)"))

In [None]:

rdd.filter(lambda x: re.search(regx, x['abstract'])).count()

In [None]:
import re
rdd.filter(lambda x: re.search(regx, x['abstract'])).collect()

In [None]:
# Number of archive per month
rdd.map(lambda x: (x['update_date'][5:7], 1)).reduceByKey(lambda x,y: x+y).collect()

In [None]:
from datetime import datetime

def extract_date(line):
  """Parse a 'YYYY-MM-DD' string and return its month as an integer."""
  return datetime.strptime(line, '%Y-%m-%d').month

extract_date("2008-12-20")

In [None]:
# Number of archive per month
rdd.map(lambda x: (extract_date(x['update_date']), 1)).reduceByKey(lambda x,y: x+y).collect()

In [None]:
# the average number of pages
# in comments column
# extract only the number be carefull with "None" value
pages = "pages"

def extract_num_pages(comments):
  """Return the first "<N> pages" count found in ``comments``.

  Args:
    comments: free-text comment string, or None when the record has none.

  Returns:
    The page count as an int, or 0 when ``comments`` is empty/None or contains
    no "<digits> pages" phrase.
  """
  if not comments:
    return 0
  # re.search gives the first occurrence; capturing only the digits avoids
  # splitting the matched text afterwards.
  found = re.search(r"(\d+) pages", comments)
  return int(found.group(1)) if found else 0

extract_num_pages("Extensively revised version. In the revised version, we have\n  discussed the effect of viscosity on (quark) transverse momentum distribution\n  and on elliptic flow. It was shown that elliptic flow saturates due to\n  non-equilirium correction to the equilibrium distribution function and can\n  not be mimicked by ideal dynamics. 15 pages, 16 figures")

In [None]:
rdd.map(lambda x: (x["comments"],extract_num_pages(x["comments"]))).collect()

In [None]:
# extract_num_pages returns 0 when no page count is found; leave those records
# out so they do not drag the average down.
rdd.map(lambda x: extract_num_pages(x['comments'])).filter(lambda n: n > 0).mean()

In [None]:
# Number archive per year/month by created archive

In [None]:
!git status

In [None]:
!git commit  -a

In [None]:
!git add PysparkAdvanced/ && git commit -m "add file" && git push origin main

In [None]:
! cat /content/PysparkAdvanced/.git/config

In [None]:
%%sh
# Rewrites the repository's git config (core settings, origin remote, main branch, commit email).
# Credentials must never be written here or committed in a notebook: git has no
# `user.password` setting, so supply a token at push time through a credential
# helper or an environment variable instead.
# NOTE: any password previously stored in this cell should be treated as leaked and rotated.
echo '[core]
	repositoryformatversion = 0
	filemode = true
	bare = false
	logallrefupdates = true
[remote "origin"]
	url = https://github.com/zacSimo/PysparkAdvanced.git
	fetch = +refs/heads/*:refs/remotes/origin/*
[branch "main"]
	remote = origin
	merge = refs/heads/main
[user]
  email = zsimale@gmail.com' > /content/PysparkAdvanced/.git/config