![image](images/rdd.png)

![image](images/transformacao_acao.png)

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session for this notebook, then take its
# SparkContext, which is the entry point for the RDD API used below.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Evaluate the session object so the notebook displays it.
spark

In [5]:
# Evaluate the SparkContext so the notebook displays it.
sc

In [6]:
# Distribute the integers 1..10 as an RDD; this is the base dataset for the cells below.
numeros = sc.parallelize(list(range(1, 11)))

In [8]:
# Action: return the first 5 elements of the RDD.
numeros.take(5)

[1, 2, 3, 4, 5]

In [9]:
# Action: return the 5 largest elements, in descending order.
numeros.top(5)

[10, 9, 8, 7, 6]

In [11]:
# Not really recommended on large datasets, because collect() brings back all the data.
numeros.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [13]:
# Action: number of elements in the RDD.
numeros.count()

10

In [14]:
# Action: arithmetic mean of the elements.
numeros.mean()

5.5

In [15]:
# Action: sum of the elements.
numeros.sum()

55

In [16]:
# Action: largest element.
numeros.max()

10

In [17]:
# Action: smallest element.
numeros.min()

1

In [18]:
# Action: population standard deviation (divides by N; for 1..10 that is sqrt(8.25)).
numeros.stdev()

2.8722813232690143

In [19]:
# Applying a filter.
# filter() is a transformation, so the result is only produced once an
# action (collect, here) is applied.
filtro = numeros.filter(lambda n: n > 2)
filtro.collect()

[3, 4, 5, 6, 7, 8, 9, 10]

In [22]:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.sample.html
# Sample with replacement (an element may be drawn more than once), with an
# expected 0.5 draws per element; the fixed seed makes the sample repeatable.
amostra = numeros.sample(withReplacement=True, fraction=0.5, seed=1)
amostra.collect()

[3, 4, 4, 5, 6, 10]

In [24]:
# map() is a transformation: build a new RDD with every element doubled.
mapa = numeros.map(lambda n: n * 2)
mapa.collect()

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

In [25]:
# Re-collect the source RDD to show that map() left it unchanged.
numeros.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

### Operações entre RDDs

In [27]:
# Second RDD (6..10), overlapping the upper half of numeros, used by the set-style operations.
numeros2 = sc.parallelize(list(range(6, 11)))

In [28]:
# Union: all elements of numeros followed by all elements of numeros2
# (duplicates are kept).
uniao = numeros.union(numeros2)
uniao.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10]

In [29]:
# Intersection: find the elements the two RDDs have in common.
interseccao = numeros.intersection(numeros2)
interseccao.collect()

[6, 7, 8, 9, 10]

In [30]:
# Subtraction: elements that are in numeros but not in numeros2.
subtrai = numeros.subtract(numeros2)
subtrai.collect()

[1, 2, 3, 4, 5]

In [31]:
# Computing the cartesian product: every (a, b) pair with a from numeros
# and b from numeros2.
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.cartesian.html?highlight=cartesian
cartesiano = numeros.cartesian(numeros2)
cartesiano.collect()

[(1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (1, 10),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (2, 10),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (3, 10),
 (4, 6),
 (5, 6),
 (4, 7),
 (5, 7),
 (4, 8),
 (5, 8),
 (4, 9),
 (5, 9),
 (4, 10),
 (5, 10),
 (6, 6),
 (6, 7),
 (6, 8),
 (6, 9),
 (6, 10),
 (7, 6),
 (7, 7),
 (7, 8),
 (7, 9),
 (7, 10),
 (8, 6),
 (8, 7),
 (8, 8),
 (8, 9),
 (8, 10),
 (9, 6),
 (10, 6),
 (9, 7),
 (10, 7),
 (9, 8),
 (10, 8),
 (9, 9),
 (10, 9),
 (9, 10),
 (10, 10)]

In [32]:
# Count how many times each element of the cartesian product appears.
cartesiano.countByValue()

defaultdict(int,
            {(1, 6): 1,
             (1, 7): 1,
             (1, 8): 1,
             (1, 9): 1,
             (1, 10): 1,
             (2, 6): 1,
             (2, 7): 1,
             (2, 8): 1,
             (2, 9): 1,
             (2, 10): 1,
             (3, 6): 1,
             (3, 7): 1,
             (3, 8): 1,
             (3, 9): 1,
             (3, 10): 1,
             (4, 6): 1,
             (5, 6): 1,
             (4, 7): 1,
             (5, 7): 1,
             (4, 8): 1,
             (5, 8): 1,
             (4, 9): 1,
             (5, 9): 1,
             (4, 10): 1,
             (5, 10): 1,
             (6, 6): 1,
             (6, 7): 1,
             (6, 8): 1,
             (6, 9): 1,
             (6, 10): 1,
             (7, 6): 1,
             (7, 7): 1,
             (7, 8): 1,
             (7, 9): 1,
             (7, 10): 1,
             (8, 6): 1,
             (8, 7): 1,
             (8, 8): 1,
             (8, 9): 1,
             (8, 10): 1,
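             (9, 6): 1,
             (10, 6): 1,
             (9, 7): 1,
             (10, 7): 1,
             (9, 8): 1,
             (10, 8): 1,
             (9, 9): 1,
             (10, 9): 1,
             (9, 10): 1,
             (10, 10): 1})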

In [33]:
# Pair RDD of (key, value) tuples; the cells below treat the key as a customer id.
# NOTE(review): the values are presumably purchase amounts — confirm.
compras = sc.parallelize([
    (1, 200),
    (2, 300),
    (3, 120),
    (4, 250),
    (5, 78),
])

In [34]:
# List every customer in the RDD.
# Elements are key:value pairs; keys() keeps only the key of each pair.
chaves = compras.keys()
chaves.collect()

[1, 2, 3, 4, 5]

In [35]:
# values() keeps only the value of each key:value pair.
valores = compras.values()
valores.collect()

[200, 300, 120, 250, 78]

In [36]:
# Count the number of elements per key.
compras.countByKey()

defaultdict(int, {1: 1, 2: 1, 3: 1, 4: 1, 5: 1})

In [37]:
# Add 1 to every value; mapValues() leaves the keys untouched.
soma = compras.mapValues(lambda valor: valor + 1)
soma.collect()

[(1, 201), (2, 301), (3, 121), (4, 251), (5, 79)]

In [38]:
# Re-collect the source RDD to show that mapValues() left it unchanged.
compras.collect()

[(1, 200), (2, 300), (3, 120), (4, 250), (5, 78)]

In [39]:
# Pair RDD of debts; only keys 1 and 2 have an entry.
debitos = sc.parallelize([(1,20),(2,300)])

In [41]:
# Join between compras and debitos by key: only keys present in both RDDs
# are returned, each with a (compras value, debitos value) tuple.
resultados = compras.join(debitos)
resultados.collect()

[(1, (200, 20)), (2, (300, 300))]

In [42]:
# Show only the customers without debts: the pairs of compras whose key
# does not appear in debitos.
semdebito = compras.subtractByKey(debitos)
semdebito.collect()

[(3, 120), (4, 250), (5, 78)]