In [1]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=bf14494aa68a04007f6b6a36927c3fab5e1255534d94f20d7c5bd52bf92be75e
  Stored in directory: /Users/isaac/Library/Caches/pip/wheels/38/df/61/8c121f50c3cffd77f8178180dd232d90b3b99d1bd61fb6d6be
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.

In [2]:
from pyspark import SparkConf, SparkContext

# Build the configuration step by step: run against the "local" master and
# register the application under the name "My App".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("My App")


In [3]:
# getOrCreate() returns the already-running SparkContext when there is one, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Load the README as an RDD of text lines.
# NOTE(review): the relative path assumes the kernel's working directory
# contains sample_data/ — confirm for your environment.
lines = sc.textFile("sample_data/README.md")

In [5]:
# Keep only the lines that mention "sample". filter() is a transformation, so
# nothing is computed until an action is called on the result.
samplelines = lines.filter(lambda text: "sample" in text)

In [6]:
# count() is an action: it triggers the filter above and returns the number of
# matching lines to the driver.
samplelines.count()

2

In [7]:
# collect() returns every element to the driver as a Python list.
for matched in samplelines.collect():
    print(matched)

This directory includes a few sample datasets to get you started.
*   `mnist_*.csv` is a small sample of the


In [8]:
# toLocalIterator() yields the same elements as collect(), but through an
# iterator rather than one fully materialized list.
for matched in samplelines.toLocalIterator():
    print(matched)

This directory includes a few sample datasets to get you started.
*   `mnist_*.csv` is a small sample of the


In [9]:
# Build an RDD from an in-memory Python list (rebinds `lines`, which previously
# referred to the README RDD).
lines = sc.parallelize(["pandas", "i like pandas"])

In [10]:
# Inspect the concrete class returned by parallelize().
type(lines)

pyspark.rdd.RDD

A **Resilient Distributed Dataset (RDD)**, the basic abstraction in Spark.

https://spark.apache.org/docs/1.5.1/api/python/pyspark.html#pyspark.RDD

In [11]:
# Number of elements in the RDD (one per item of the list passed to parallelize()).
lines.count()


2

In [12]:
# An RDD is not a Python sequence: len() is not supported and raises TypeError
# (use count() instead). The error is caught and printed so the demonstration
# stays visible without aborting a "Run All".
try:
    len(lines)
except TypeError as err:
    print("TypeError: %s" % err)

TypeError: ignored

**Transformations** are operations on RDDs that return a new RDD. As discussed in “Lazy Evaluation” on page 29, transformed RDDs are computed lazily, only when you use them in an action. Many transformations are element-wise; that is, they work on one element at a time; but this is not true for all transformations.

filter()

In [13]:
# Source RDD: one element per line of the README.
inputRDD = sc.textFile("sample_data/README.md")
# Derived RDD holding only the lines that contain "sample".
samplesRDD = inputRDD.filter(lambda row: "sample" in row)
# Show which class a transformed RDD has.
type(samplesRDD)

pyspark.rdd.PipelinedRDD

union(), distinct(), intersection(), subtract(), cartesian()

In [14]:
# Second filtered view of the README: lines containing the substring "es".
esRDD = inputRDD.filter(lambda row: "es" in row)
esRDD.count()

4

In [15]:
# union() concatenates the two RDDs without removing duplicates: the count below
# is the sum of both inputs' counts (4 + 2 = 6).
uRDD = esRDD.union(samplesRDD)
uRDD.count()

6

Element-wise **transformations**
The two most common transformations you will likely use are map() and filter(). Other frequently used ones include distinct() and sample(withReplacement, fraction).

In [20]:
nums = sc.parallelize([1, 2, 3, 4])
# map() applies the function to each element; collect() then brings the
# squared values back to the driver as a list.
squared = nums.map(lambda value: value * value).collect()
for num in squared:
    print("%i" % num)

1
4
9
16


Sometimes we want to produce multiple output elements for each input element. The operation to do this is called flatMap().

In [21]:
lines = sc.parallelize(["hello world", "hi"])
# flatMap() splits each line and flattens the resulting lists into a single
# RDD of words.
words = lines.flatMap(lambda sentence: sentence.split(" "))
words.first()  # returns "hello"

'hello'

## **Actions**
They are the operations that return a final value to the driver program or write data to an external storage system. Actions force the evaluation of the transformations required for the RDD they were called on, since they need to actually produce output

- collect()
- count()
- countByValue()
- take(num)
- top(num)
- takeOrdered(num, key=None)
- takeSample(...)
- reduce()
- fold()
- aggregate()
- foreach()


In [16]:
# count() and take() are both actions; take(3) returns only three elements to
# the driver instead of the whole RDD.
print("Total Input: %i " % uRDD.count())
for row in uRDD.take(3):
    print(row)

Total Input: 6 
This directory includes a few sample datasets to get you started.
    https://developers.google.com/machine-learning/crash-course/california-housing-data-description
    described at: http://yann.lecun.com/exdb/mnist/


In [32]:
nums = sc.parallelize([1, 2, 2, 2])
# countByValue() is an action: it returns a mapping from each distinct element
# to the number of times it occurs.
nums.countByValue()

defaultdict(int, {1: 1, 2: 3})

**reduce()**, which takes a function that operates on two elements of the type in your RDD and returns a new element of the same type.

In [25]:
nums = sc.parallelize([1, 2, 3, 4])
# Named `total` rather than `sum` so the built-in sum() is not shadowed for the
# rest of the kernel session.
total = nums.reduce(lambda x, y: x + y)
print(total)

10


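**fold()** works like reduce(), but also takes a “zero value” to be used for the initial call on each partition. The zero value should be the identity element of the operation (0 for +, 1 for *). With a non-identity zero value such as 1, the extra value is added once per partition and once more when the partition results are merged, which is why the example below returns 12 instead of 10.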

In [28]:
# Named `fold_total` rather than `sum` so the built-in sum() is not shadowed.
# The zero value 1 is NOT the identity for addition: fold() applies it inside
# every partition and once more when merging partition results, so the result
# exceeds the true sum (10) and depends on the number of partitions.
# Use 0 to obtain the plain sum.
fold_total = nums.fold(1, lambda x, y: x + y)
print(fold_total)

12


The aggregate() function frees us from the constraint of having the return value be the same type as the elements of the RDD we are working on.

In [31]:
def seqOp(acc, value):
    """Fold one RDD element into a (running_sum, running_count) accumulator."""
    running_sum, running_count = acc[0], acc[1]
    return (running_sum + value, running_count + 1)


def combOp(acc1, acc2):
    """Merge two (sum, count) accumulators into one."""
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])


# Start from (0, 0) and compute the sum and the element count in a single pass.
sumCount = nums.aggregate((0, 0), seqOp, combOp)

print(sumCount[0])
print(sumCount[1])

10
4


# **Pair RDDs**

Pair RDDs are a useful building block in many programs, as they expose operations that allow you to act on each key in parallel or regroup data across the network. For example, pair RDDs have a reduceByKey() method that can aggregate data separately for each key, and a join() method that can merge two RDDs together by grouping elements with the same key.

In [35]:
lines = sc.textFile("sample_data/README.md")
# Turn each line into a (key, value) pair keyed by its first
# space-separated token; the value is the full line.
pairs = lines.map(lambda text: (text.split(" ")[0], text))
pairs.take(1)

[('This', 'This directory includes a few sample datasets to get you started.')]

- reduceByKey(func)
- groupByKey()
- combineByKey(...)
- mapValues(func)
- flatMapValues(func)
- keys()
- values()
- sortByKey()
- countByKey()
- collectAsMap()
- lookup(key)



In [37]:
# Total number of keys, duplicates included (one key per pair).
pairs.keys().count()

19

In [39]:
# Number of different keys once duplicates are removed.
pairs.keys().distinct().count()

3

In [40]:
# Classic word count: split every line into tokens, pair each token with 1,
# then add up the ones per token.
words = lines.flatMap(lambda text: text.split(" "))
result = (
    words
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [48]:
# Sort the (word, count) pairs by word and print every pair on the driver.
for pair in result.sortByKey().collect():
    print(pair)

('', 51)
("'Graphs", 1)
('(1):', 1)
('(1973).', 1)
('*', 3)
('17-21.', 1)
('1990', 1)
('2682899.', 1)
('27', 1)
('American', 1)
("Analysis'.", 1)
('Anscombe,', 1)
('California', 1)
('Census;', 1)
('F.', 1)
('J.', 1)
('JSTOR', 1)
('Statistical', 1)
('Statistician.', 1)
('This', 1)
('US', 1)
("[Anscombe's", 1)
('[MNIST', 1)
('[vega_datasets', 1)
('`anscombe.json`', 1)
('`california_housing_data*.csv`', 1)
('`mnist_*.csv`', 1)
('a', 3)
('and', 1)
('at:', 2)
('available', 1)
('by', 1)
('contains', 1)
('copy', 2)
('data', 1)
('database](https://en.wikipedia.org/wiki/MNIST_database),', 1)
('datasets', 1)
('described', 2)
('directory', 1)
('few', 1)
('from', 1)
('get', 1)
('housing', 1)
('http://yann.lecun.com/exdb/mnist/', 1)
('https://developers.google.com/machine-learning/crash-course/california-housing-data-description', 1)
('in', 2)
('includes', 1)
('information', 1)
('is', 4)
('it', 1)
('library](https://github.com/altair-viz/vega_datasets/blob/4f67bdaad10f45e3549984e17e1b3088c731503d/v

The simple **join** operator is an inner join.

In [54]:
# Two small key/value datasets: key "a" appears twice in data1 and key "c"
# appears only in data2.
data1 = [("a", 3), ("b", 4), ("a", 1)]
data2 = [("a", 5), ("b", 1), ("c", 1)]
d1, d2 = sc.parallelize(data1), sc.parallelize(data2)

In [55]:
# Inner join: only keys present in both RDDs are emitted, one output pair per
# matching combination of values.
for pair in d1.join(d2).collect():
    print(pair)

('b', (4, 1))
('a', (3, 5))
('a', (1, 5))


leftOuterJoin(other), rightOuterJoin(other)

In [58]:
# Every key of d1 also exists in d2, so the left outer join prints the same
# pairs as the inner join.
for pair in d1.leftOuterJoin(d2).collect():
    print(pair)

('b', (4, 1))
('a', (3, 5))
('a', (1, 5))


In [59]:
# Keys that exist only in d2 ("c") are kept, with None on the d1 side.
for pair in d1.rightOuterJoin(d2).collect():
    print(pair)

('b', (4, 1))
('c', (None, 1))
('a', (3, 5))
('a', (1, 5))


# Activity

$$PageRank(A) = \frac{(1 - d)}{N} + d * \sum_{B\in in(A)} \frac{PageRank(B)}{L(B)}$$


Donde:

- A y B son páginas
- `PageRank(A)` es el valor de PageRank para la página A.
- `d` es el factor de amortiguación (generalmente se establece en 0.85 en la práctica).
- `N` es el número total de páginas en la red.
- `Σ` representa la suma sobre todas las páginas B que enlazan a la página A.
- `in(A)` es el conjunto de páginas que enlazan a la página A.
- `PageRank(B)` es el valor de PageRank de la página B.
- `L(B)` es el número de enlaces salientes desde la página B.


Supongamos que tenemos cuatro páginas web (A, B, C y D) en una red y que inicialmente todas tienen un PageRank igual. El factor de amortiguación (d) es 0.85.

Relaciones:

- A <- B
- B <- A, C
- C <- B
- D <- B

Iteraciones:

* Iteración 0 (valores iniciales):



PageRank(A) = PageRank(B) = PageRank(C) = PageRank(D) = 0.25

* Iteración 1:


Según las relaciones anteriores, B enlaza a A, C y D, por lo que $L(B) = 3$; A y C enlazan únicamente a B, por lo que $L(A) = L(C) = 1$. Todos los valores de esta iteración se calculan con los PageRank de la iteración 0.

\begin{align*}
PageRank(A) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \frac{PageRank(B)}{3} \\
& = 0.0375 + 0.85 \cdot \frac{0.25}{3} \approx 0.1083
\end{align*}

\begin{align*}
PageRank(B) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \left(\frac{PageRank(A)}{1} + \frac{PageRank(C)}{1}\right) \\
& = 0.0375 + 0.85 \cdot (0.25 + 0.25) = 0.4625
\end{align*}

\begin{align*}
PageRank(C) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \frac{PageRank(B)}{3} \\
& = 0.0375 + 0.85 \cdot \frac{0.25}{3} \approx 0.1083
\end{align*}


\begin{align*}
PageRank(D) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \frac{PageRank(B)}{3} \\
& = 0.0375 + 0.85 \cdot \frac{0.25}{3} \approx 0.1083
\end{align*}



In [3]:
from pyspark.sql import SparkSession


# Reuse the active Spark session if there is one; otherwise start a new one
# named "PageRank".
spark = SparkSession.builder.appName("PageRank").getOrCreate()

# Read the edge list as a DataFrame of text rows, then keep field 0 of every
# Row so that `lines` is an RDD of the raw line values.
lines = spark.read.text("pageRank_data.txt").rdd.map(lambda row: row[0])
for i in lines.collect():
    print(i)

1 2
1 3
1 4
2 1
3 1
4 2
5 1
6 1
7 2
8 2


                                                                                