In [None]:
# Install the pyspark package from inside the notebook (IPython %pip magic).
%pip install pyspark

In [1]:
from pyspark import SparkConf, SparkContext
# Spark configuration for this notebook: master URL "local", application name "My App".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("My App")

In [2]:
# Reuse the already-running SparkContext when this cell is executed again:
# constructing SparkContext(...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". `conf` is only applied when a
# new context actually has to be created.
sc = SparkContext.getOrCreate(conf)

23/11/14 15:35:21 WARN Utils: Your hostname, MacBook-Pro-de-Isaac-2.local resolves to a loopback address: 127.0.0.1; using 10.162.15.104 instead (on interface en0)
23/11/14 15:35:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/14 15:35:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


23/11/14 15:35:35 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
23/11/15 19:44:57 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3848366 ms exceeds timeout 120000 ms
23/11/15 19:44:57 WARN SparkContext: Killing executors is not supported by current scheduler.
23/11/15 19:45:03 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.Rp

In [None]:
# Load the README as an RDD with one element per line of text.
lines = sc.textFile("sample_data/README.md")

In [None]:
# Transformation (lazy): keep only the lines that contain the substring "sample".
samplelines = lines.filter(lambda line: "sample" in line)

In [None]:
# Action: forces the filter to run and returns how many lines matched.
samplelines.count()

In [None]:
# collect() brings every matching line back to the driver as a single list.
matching = samplelines.collect()
for text in matching:
    print(text)

In [None]:
# toLocalIterator() hands the elements to the driver through an iterator
# instead of materializing them all in one list as collect() does.
row_iter = samplelines.toLocalIterator()
for text in row_iter:
    print(text)

In [None]:
# Build an RDD from an in-memory Python list (rebinds `lines`).
lines = sc.parallelize(["pandas", "i like pandas"])

In [None]:
# Show the Python class of the object returned by parallelize().
type(lines)

A **Resilient Distributed Dataset (RDD)**, the basic abstraction in Spark.

https://spark.apache.org/docs/1.5.1/api/python/pyspark.html#pyspark.RDD

In [None]:
# Action: number of elements in the RDD.
lines.count()


In [None]:
# len(lines) is intentionally left disabled: an RDD does not support len();
# use the count() action shown in the previous cell instead.

**Transformations** are operations on RDDs that return a new RDD. As discussed in “Lazy Evaluation” on page 29, transformed RDDs are computed lazily, only when you use them in an action. Many transformations are element-wise; that is, they work on one element at a time; but this is not true for all transformations.

Filter()

In [None]:
# Read the README again and derive an RDD holding only the lines that mention
# "sample"; the last expression displays the type of the filtered result.
inputRDD = sc.textFile("sample_data/README.md")
samplesRDD = inputRDD.filter(lambda text: "sample" in text)
type(samplesRDD)

union(), distinct(), intersection(), subtract(), cartesian()

In [None]:
# Lines of the README that contain the substring "es", and how many there are.
esRDD = inputRDD.filter(lambda text: "es" in text)
esRDD.count()

In [None]:
# Concatenate both filtered RDDs. union() does not remove duplicates, so a line
# containing both "es" and "sample" is counted twice.
uRDD = esRDD.union(samplesRDD)
uRDD.count()

Element-wise **transformations**
The two most common transformations you will likely be using are map() and filter(). Other frequently used transformations include distinct() and sample(withReplacement=Boolean, fraction=Double).

In [None]:
# map() squares each element; collect() returns the squares as a Python list.
nums = sc.parallelize([1, 2, 3, 4])
squared = nums.map(lambda value: value * value).collect()
for square in squared:
    print("%i" % square)

Sometimes we want to produce multiple output elements for each input element. The operation to do this is called flatMap().

In [None]:
# flatMap() flattens the per-line word lists into a single RDD of words.
lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda sentence: sentence.split(" "))
words.first()  # returns "hello"

## **Actions**
They are the operations that return a final value to the driver program or write data to an external storage system. Actions force the evaluation of the transformations required for the RDD they were called on, since they need to actually produce output

- collect()
- count()
- countByValue()
- take(num)
- top(num)
- takeOrdered(num, key=None)
- takeSample(...)
- reduce()
- fold
- aggregate
- foreach


In [None]:
# Report the size of the union, then show its first three lines.
total_lines = uRDD.count()
print("Total Input: %i " % total_lines)
for text in uRDD.take(3):
    print(text)

In [None]:
# countByValue() is an action: it returns to the driver a mapping from each
# distinct element to the number of times it occurs.
nums = sc.parallelize([1, 2, 2, 2])
nums.countByValue()

**reduce()**, which takes a function that operates on two elements of the type in your RDD and returns a new element of the same type.

In [None]:
nums = sc.parallelize([1, 2, 3, 4])
# Stored as `total` rather than `sum` so the built-in sum() is not shadowed for
# the rest of the notebook session.
total = nums.reduce(lambda x, y: x + y)
print(total)

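**fold()** works like reduce(), but also takes a “zero value” to be used for the initial call on each partition. The zero value should be the identity element of the operation (for example, 0 for +).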

In [None]:
# The zero value must be the identity of the operation (0 for addition): fold()
# applies it once per partition and once more when merging the partition
# results, so a non-neutral value such as 1 inflates the sum by
# (number of partitions + 1) and makes the result depend on the partitioning.
# Stored as `total` so the built-in sum() is not shadowed.
total = nums.fold(0, lambda x, y: x + y)
print(total)

The **aggregate()** function frees us from the constraint of having the return be the same type as the RDD we are working on.

In [None]:
def seqOp(acc, value):
    """Fold one element into a (running sum, running count) accumulator."""
    return (acc[0] + value, acc[1] + 1)


def combOp(acc1, acc2):
    """Merge two (sum, count) accumulators component-wise."""
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])


# Start from (0, 0) and produce the (sum, count) of `nums` in one pass.
sumCount = nums.aggregate((0, 0), seqOp, combOp)

print(sumCount[0])
print(sumCount[1])

# **Pair RDDs**

Pair RDDs are a useful building block in many programs, as they expose operations that allow you to act on each key in parallel or regroup data across the network. For example, pair RDDs have a reduceByKey() method that can aggregate data separately for each key, and a join() method that can merge two RDDs together by grouping elements with the same key.

In [None]:
# Build a pair RDD keyed by the text before the first space of each line,
# keeping the full line as the value.
lines = sc.textFile("sample_data/README.md")
pairs = lines.map(lambda text: (text.partition(" ")[0], text))
pairs.take(1)

- reduceByKey(func)
- groupByKey()
- combineByKey(...)
- mapValues(func)
- flatMapValues(func)
- keys()
- values()
- sortByKey()
- countByKey()
- collectAsMap()
- lookup(key)



In [None]:
# Number of keys, duplicates included (one key per element of `pairs`).
pairs.keys().count()

In [None]:
# Number of different keys once duplicates are removed.
pairs.keys().distinct().count()

In [None]:
# Word count: split every line on single spaces, pair each word with 1 and
# add up the ones per word.
words = lines.flatMap(lambda text: text.split(" "))
ones = words.map(lambda word: (word, 1))
result = ones.reduceByKey(lambda left, right: left + right)

In [None]:
# Print the (word, count) pairs ordered by word.
ordered = result.sortByKey().collect()
for pair in ordered:
    print(pair)

The simple **join** operator is an inner join.

In [None]:
# Two small key/value datasets used to demonstrate the join operators below.
data1 = [("a", 3), ("b", 4), ("a", 1)]
d1 = sc.parallelize(data1)

data2 = [("a", 5), ("b", 1), ("c", 1)]
d2 = sc.parallelize(data2)

In [None]:
# Inner join: only keys present in both RDDs appear in the output.
for key, values in d1.join(d2).collect():
    print((key, values))

leftOuterJoin(other) ,  rightOuterJoin(other)

In [None]:
# Left outer join: every key of d1 is kept, matched against d2.
for key, values in d1.leftOuterJoin(d2).collect():
    print((key, values))

In [None]:
# Right outer join: every key of d2 is kept, matched against d1.
for key, values in d1.rightOuterJoin(d2).collect():
    print((key, values))

# Activity

$$PageRank(A) = \frac{(1 - d)}{N} + d * \sum_{B\in in(A)} \frac{PageRank(B)}{L(B)}$$


Donde:

- A y B son páginas
- `PageRank(A)` es el valor de PageRank para la página A.
- `d` es el factor de amortiguación (generalmente se establece en 0.85 en la práctica).
- `N` es el número total de páginas en la red.
- `Σ` representa la suma sobre todas las páginas B que enlazan a la página A.
- in(A) es el conjunto de páginas que enlazan a la página A.
- `PageRank(B)` es el valor de PageRank de la página B.
- `L(B)` es el número de enlaces salientes desde la página B.


Supongamos que tenemos cuatro páginas web (A, B, C y D) en una red y que inicialmente todas tienen un PageRank igual. El factor de amortiguación (d) es 0.85.

Relaciones:

- A <- B
- B <- A, C
- C <- B
- D <- B

Iteraciones:

* Iteración 0 (valores iniciales):



PageRank(A) = PageRank(B) = PageRank(C) = PageRank(D) = 0.25

* Iteración 1:


Como B enlaza a A, C y D, $L(B) = 3$; A y C solo enlazan a B, por lo que $L(A) = L(C) = 1$. Todos los valores se calculan a partir de los PageRank de la iteración 0.

\begin{align*}
PageRank(A) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \frac{PageRank(B)}{3} \\
& = 0.0375 + 0.85 \cdot \frac{0.25}{3} \approx 0.1083
\end{align*}

\begin{align*}
PageRank(B) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \left(\frac{PageRank(A)}{1} + \frac{PageRank(C)}{1}\right) \\
& = 0.0375 + 0.85 \cdot (0.25 + 0.25) = 0.4625
\end{align*}

\begin{align*}
PageRank(C) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \frac{PageRank(B)}{3} \\
& = 0.0375 + 0.85 \cdot \frac{0.25}{3} \approx 0.1083
\end{align*}


\begin{align*}
PageRank(D) & = \frac{(1 - 0.85)}{4} + 0.85 \cdot \frac{PageRank(B)}{3} \\
& = 0.0375 + 0.85 \cdot \frac{0.25}{3} \approx 0.1083
\end{align*}



In [None]:
from pyspark.sql import SparkSession

# Obtain the active SparkSession, or create one named "PageRank".
spark = (
    SparkSession.builder
    .appName("PageRank")
    .getOrCreate()
)

# Read the input as a text DataFrame, then keep only the first column of each
# row so that `lines` ends up as an RDD of plain strings.
text_df = spark.read.text("pageRank_data.txt")
lines = text_df.rdd.map(lambda row: row[0])
for raw_line in lines.collect():
    print(raw_line)

In [None]:
# Rebuild `lines` from the input file rather than from the previous value of
# `lines`: once `lines` is already an RDD it has no `.rdd` attribute, so the
# original `lines.rdd.map(...)` raised AttributeError when this cell was run
# after the loading cell (or run twice). This form is safe to re-execute.
lines = spark.read.text("pageRank_data.txt").rdd.map(lambda r: r[0])

In [None]:
# Show every input line on the driver.
for raw_line in lines.collect():
    print(raw_line)

In [None]:

def parseNeighbors(urls):
    """Split one input line into a (source, target) URL pair.

    The helper was referenced by this cell but never defined in the notebook.
    Assumes each line holds two whitespace-separated URLs, as in the Apache
    Spark PageRank example -- TODO confirm against pageRank_data.txt.

    Raises:
        ValueError: if the line contains fewer than two fields.
    """
    source, target = urls.split()[:2]
    return source, target


# Load all URL pairs from the input, drop repeated pairs, and group the targets
# by source URL: `links` maps each URL to an iterable of its neighbors. The
# result is cached because it is reused by every later step.
links = lines.map(parseNeighbors).distinct().groupByKey().cache()


In [None]:
# Print each key of `links` followed by its grouped values, one per indented line.
for url, neighbors in links.collect():
    print(url)
    for neighbor in neighbors:
        print("\t", neighbor)

In [None]:

# Give every key of `links` an initial rank of 1.0.
ranks = links.keys().map(lambda url: (url, 1.0))

In [None]:
# Show the current rank of every URL.
for url, rank in ranks.collect():
    print(url, rank)


In [None]:
# Inspect the shape of the join: each element is (url, (neighbors, rank)).
t = links.join(ranks)
t.take(1)

In [None]:
def computeContribs(urls, rank):
    """Split a page's rank evenly among the pages it links to.

    The helper was referenced by this cell but never defined in the notebook.

    Args:
        urls: sized iterable with the neighbors of one page.
        rank: current rank of that page.

    Yields:
        (neighbor_url, rank / number_of_neighbors) for every neighbor.
    """
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)


# For every (url, (neighbors, rank)) element of the join, emit the share of
# `rank` that each neighbor receives.
contribs = links.join(ranks).flatMap(
    lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1])
)

In [None]:
# Exploratory variant of the contribs step: the lambda returns the pair
# (neighbors, rank) itself and flatMap flattens that pair, so `a` holds the
# neighbors iterable and the float rank as two separate consecutive elements.
a = links.join(ranks).flatMap(lambda url_urls_rank: (url_urls_rank[1][0], url_urls_rank[1][1]))


In [None]:
# `a` mixes two kinds of elements: float ranks and iterables of neighbor URLs.
# Print a rank as-is and an iterable one neighbor per line. isinstance() is the
# idiomatic type test (the original compared type(i) against float), and the
# indentation is now uniform.
for item in a.collect():
    if isinstance(item, float):
        print(item)
    else:
        for neighbor in item:
            print(neighbor)


In [None]:
# Peek at a single (url, contribution) element.
contribs.take(1)

In [None]:
# Show every (url, contribution) pair on the driver.
for contribution in contribs.collect():
    print(contribution)

In [None]:
# `add` was used by this cell without ever being imported (NameError).
from operator import add

# One PageRank update: sum the contributions received by each URL, then apply
# the damping factor 0.85 plus the constant term 0.15.
# NOTE: this is the un-normalized variant (ranks start at 1.0 and the constant
# is 0.15), not the (1 - d) / N form of the formula in the Activity text.
ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)
for i in ranks.collect():
    print(i)

In [None]:
# `add` was used by this cell without ever being imported (NameError).
from operator import add

# Run five PageRank iterations. Requires `links`, `ranks` and the helper
# computeContribs(urls, rank) to already exist in the notebook session.
for iteration in range(5):
    # For every (url, (neighbors, rank)) element of the join, emit the share of
    # `rank` that each neighbor receives.
    contribs = links.join(ranks).flatMap(lambda url_urls_rank: computeContribs(
        url_urls_rank[1][0], url_urls_rank[1][1]  # type: ignore[arg-type]
    ))

    # Recompute each URL's rank from the sum of the contributions it received:
    # damping factor 0.85 plus the constant term 0.15 (un-normalized variant).
    ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)

# Collect the final ranks on the driver and print them.
for (link, rank) in ranks.collect():
    print("%s has rank: %s." % (link, rank))

