Seção PySpark: Tratamento de dados e Big Data

Módulo: Definição e Coleta de Dados

Aula 3: Transformações e Ações em RDDs

In [1]:
! pip install pyspark



In [2]:
import shutil

import pyspark
from pyspark import SparkContext, SparkConf

In [3]:
# Name the application and obtain the SparkContext used by every cell below.
# getOrCreate is called on the class (not on a freshly constructed instance)
# so that re-running this cell reuses the context that is already active;
# constructing SparkContext(conf=conf) a second time in the same kernel
# raises ValueError because only one context may run at once.
conf = SparkConf().setAppName("Leitura de Arquivo Texto")
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Load the README as an RDD with one element per line of text.
rdd = sc.textFile("sample_data/README.md")

In [5]:
# Action: count the elements of the RDD, i.e. the number of lines in the file.
rdd.count()

19

In [6]:
# Action: bring only the first 10 lines back to the driver for inspection.
rdd.take(10)

['This directory includes a few sample datasets to get you started.',
 '',
 '*   `california_housing_data*.csv` is California housing data from the 1990 US',
 '    Census; more information is available at:',
 '    https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub',
 '',
 '*   `mnist_*.csv` is a small sample of the',
 '    [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is',
 '    described at: http://yann.lecun.com/exdb/mnist/',
 '']

In [7]:
# Split every line on single spaces; flatMap flattens the per-line lists so
# the resulting RDD holds individual words instead of lists of words.
palavra = rdd.flatMap(lambda linha: linha.split(" "))
palavra.take(5)

['This', 'directory', 'includes', 'a', 'few']

In [8]:
# map produces exactly one output element per input element: each word is
# replaced by its lower-case form.
palavraMinuscula = palavra.map(lambda termo: termo.lower())
print('Map: ', palavraMinuscula.take(5))

# flatMap flattens whatever the function returns. The function returns a
# string, which is itself an iterable of characters, so every word is
# broken up into its individual upper-case letters.
palavraMaiusculaFlatMap = palavra.flatMap(lambda termo: termo.upper())
print('FlatMap: ', palavraMaiusculaFlatMap.take(5))

Map:  ['this', 'directory', 'includes', 'a', 'few']
FlatMap:  ['T', 'H', 'I', 'S', 'D']


In [11]:
# filter keeps only the elements for which the predicate is true.
# First predicate: lower-cased words whose first letter is 't'.
palavraComecaT = palavraMinuscula.filter(lambda termo: termo.startswith('t'))
print('Palavras que começam com a letra t: ', palavraComecaT.take(5))

# Second predicate: words longer than two characters, which also discards
# the empty strings produced when splitting on spaces.
palavraMin2 = palavraMinuscula.filter(lambda termo: len(termo) > 2)
print('Palavras com mais de 2 letras: ', palavraMin2.take(5))

Palavras que começam com a letra t:  ['this', 'to', 'the', 'the', 'the']
Palavras com mais de 2 letras:  ['this', 'directory', 'includes', 'few', 'sample']


In [12]:
# Turn each word into a (word, 1) pair so occurrences can be summed per key.
palavraChaveValor = palavraMin2.map(lambda termo: (termo, 1))
palavraChaveValor.take(5)

[('this', 1), ('directory', 1), ('includes', 1), ('few', 1), ('sample', 1)]

In [14]:
# Sum the 1s of every (word, 1) pair to get the occurrence count per word.
palavraContar = palavraChaveValor.reduceByKey(lambda a, b: a + b)

# Sort the (word, count) pairs by the word itself, in ascending order.
# `ascending` is a boolean flag: any truthy value (including -1) sorts
# ascending, so it is spelled out as True; pass False for descending order.
palavraContarOrd = palavraContar.sortByKey(ascending=True)
palavraContarOrd.take(20)

[("'graphs", 1),
 ('(1):', 1),
 ('(1973).', 1),
 ('17-21.', 1),
 ('1990', 1),
 ('2682899.', 1),
 ("[anscombe's", 1),
 ('[mnist', 1),
 ('[vega_datasets', 1),
 ('`anscombe.json`', 1),
 ('`california_housing_data*.csv`', 1),
 ('`mnist_*.csv`', 1),
 ('american', 1),
 ("analysis'.", 1),
 ('and', 1),
 ('anscombe,', 1),
 ('at:', 2),
 ('available', 1),
 ('california', 1),
 ('census;', 1)]

In [15]:
# saveAsTextFile refuses to write into a directory that already exists, so
# remove any output left by a previous run first; this keeps the cell
# re-runnable under Restart & Run All.
# NOTE(review): assumes Spark runs in local mode, so the relative path
# resolves to the notebook's working directory — confirm before using this
# on a cluster or a non-local filesystem.
shutil.rmtree('contar_palavras_out', ignore_errors=True)
palavraContar.saveAsTextFile('contar_palavras_out')

In [16]:
# List the output location: it is a directory holding part-* files and a
# _SUCCESS marker rather than a single text file.
!ls contar_palavras_out/

part-00000  part-00001	_SUCCESS


In [17]:
# Read the saved output back by pointing textFile at the directory.
# Each element is now a plain string such as "('the', 3)", not a tuple:
# the pairs were written out as text.
rddContarPalavras = sc.textFile('contar_palavras_out')
rddContarPalavras.take(10)

["('directory', 1)",
 "('includes', 1)",
 "('few', 1)",
 "('get', 1)",
 "('california', 1)",
 "('housing', 1)",
 "('data', 1)",
 "('the', 3)",
 "('census;', 1)",
 "('at:', 2)"]

In [18]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()