# Taller 1 (Corregido)
Text taken from [Reuters](https://www.reuters.com/business/finance/banks-beware-outsiders-are-cracking-code-finance-2021-09-17/).
* Martin Hernández Encarnación 
* Cándido Méndez Baltazar

In [1]:
import nltk
from pyspark import SparkContext
import pandas as pd

In [2]:
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\candi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\candi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Entry point for working with RDD
sc = SparkContext(appName = "pyspark-pos-analysis")

In [4]:
# Loading a text file
rdd_reuters = sc.textFile("./data/reuters.txt")

In [5]:
# What is the data type?
type(rdd_reuters)

pyspark.rdd.RDD

In [6]:
rdd_reuters.count()

87

In [7]:
# Printing the first 5 records
rdd_reuters.take(5)

['Banks beware, Amazon and Walmart are cracking the code for finance',
 '',
 'LONDON, Sept 17 (Reuters) - Anyone can be a banker these days, you just need the right code.',
 '',
 'Global brands from Mercedes and Amazon (AMZN.O) to IKEA and Walmart (WMT.N) are cutting out the traditional financial middleman and plugging in software from tech startups to offer customers everything from banking and credit to insurance.']

### MapReduce over RDD

In [8]:
# Using a map for splitting words from text
# By default, the result is something like a list of lists
rdd_reuters.map(lambda x: x.split(" ")).take(3)

[['Banks',
  'beware,',
  'Amazon',
  'and',
  'Walmart',
  'are',
  'cracking',
  'the',
  'code',
  'for',
  'finance'],
 [''],
 ['LONDON,',
  'Sept',
  '17',
  '(Reuters)',
  '-',
  'Anyone',
  'can',
  'be',
  'a',
  'banker',
  'these',
  'days,',
  'you',
  'just',
  'need',
  'the',
  'right',
  'code.']]

In [9]:
# What if I want to get a flat structure => Use flatMap function
rdd_reuters.flatMap(lambda x: x.split(" ")).take(15)

['Banks',
 'beware,',
 'Amazon',
 'and',
 'Walmart',
 'are',
 'cracking',
 'the',
 'code',
 'for',
 'finance',
 '',
 'LONDON,',
 'Sept',
 '17']

### Building the wordcount in pySpark version

In [10]:
rdd_reuters.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda x: (x[1], x[0])) \
    .sortByKey(ascending = False) \
    .map(lambda x: (x[1], x[0])) \
    .take(10)

[('', 43),
 ('the', 41),
 ('to', 41),
 ('and', 34),
 ('of', 25),
 ('in', 18),
 ('for', 18),
 ('a', 16),
 ('is', 14),
 ('are', 12)]

# Count the words

In [11]:
count = rdd_reuters.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda x: (x[1], x[0])) \
    .sortByKey(ascending = False) \
    .map(lambda x: (x[1], x[0])) 

In [12]:
count.take(10)

[('', 43),
 ('the', 41),
 ('to', 41),
 ('and', 34),
 ('of', 25),
 ('in', 18),
 ('for', 18),
 ('a', 16),
 ('is', 14),
 ('are', 12)]

In [13]:
count1 = count.collect()

In [14]:
df1 = pd.DataFrame(count1)
df1

Unnamed: 0,0,1
0,,43
1,the,41
2,to,41
3,and,34
4,of,25
...,...,...
628,(AAPL.O),1
629,between,1
630,"Moinian,",1
631,wholesale,1


In [16]:
df1.set_axis(['palabra', 'veces'], axis=1, inplace=True)
df1

Unnamed: 0,palabra,veces
0,,43
1,the,41
2,to,41
3,and,34
4,of,25
...,...,...
628,(AAPL.O),1
629,between,1
630,"Moinian,",1
631,wholesale,1


# Type of words

In [17]:
types = rdd_reuters.flatMap(lambda x: x.split(" "))

In [18]:
types.take(10)

['Banks',
 'beware,',
 'Amazon',
 'and',
 'Walmart',
 'are',
 'cracking',
 'the',
 'code',
 'for']

In [19]:
types1 = types.collect()
types1

['Banks',
 'beware,',
 'Amazon',
 'and',
 'Walmart',
 'are',
 'cracking',
 'the',
 'code',
 'for',
 'finance',
 '',
 'LONDON,',
 'Sept',
 '17',
 '(Reuters)',
 '-',
 'Anyone',
 'can',
 'be',
 'a',
 'banker',
 'these',
 'days,',
 'you',
 'just',
 'need',
 'the',
 'right',
 'code.',
 '',
 'Global',
 'brands',
 'from',
 'Mercedes',
 'and',
 'Amazon',
 '(AMZN.O)',
 'to',
 'IKEA',
 'and',
 'Walmart',
 '(WMT.N)',
 'are',
 'cutting',
 'out',
 'the',
 'traditional',
 'financial',
 'middleman',
 'and',
 'plugging',
 'in',
 'software',
 'from',
 'tech',
 'startups',
 'to',
 'offer',
 'customers',
 'everything',
 'from',
 'banking',
 'and',
 'credit',
 'to',
 'insurance.',
 '',
 'For',
 'established',
 'financial',
 'institutions,',
 'the',
 'signs',
 'are',
 'flashing.',
 '',
 'So-called',
 'embedded',
 'finance',
 '-',
 'a',
 'fancy',
 'term',
 'for',
 'companies',
 'integrating',
 'software',
 'to',
 'offer',
 'financial',
 'services',
 '-',
 'means',
 'Amazon',
 'can',
 'let',
 'customers',
 '

In [20]:
pos = nltk.pos_tag(types1)
pos

[('Banks', 'NNS'),
 ('beware,', 'VBP'),
 ('Amazon', 'NNP'),
 ('and', 'CC'),
 ('Walmart', 'NNP'),
 ('are', 'VBP'),
 ('cracking', 'VBG'),
 ('the', 'DT'),
 ('code', 'NN'),
 ('for', 'IN'),
 ('finance', 'NN'),
 ('', 'NNP'),
 ('LONDON,', 'NNP'),
 ('Sept', 'NNP'),
 ('17', 'CD'),
 ('(Reuters)', 'NNP'),
 ('-', ':'),
 ('Anyone', 'NN'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('banker', 'NN'),
 ('these', 'DT'),
 ('days,', 'IN'),
 ('you', 'PRP'),
 ('just', 'RB'),
 ('need', 'VB'),
 ('the', 'DT'),
 ('right', 'JJ'),
 ('code.', 'NN'),
 ('', 'NNP'),
 ('Global', 'NNP'),
 ('brands', 'NNS'),
 ('from', 'IN'),
 ('Mercedes', 'NNP'),
 ('and', 'CC'),
 ('Amazon', 'NNP'),
 ('(AMZN.O)', 'NNP'),
 ('to', 'TO'),
 ('IKEA', 'NNP'),
 ('and', 'CC'),
 ('Walmart', 'NNP'),
 ('(WMT.N)', 'NNP'),
 ('are', 'VBP'),
 ('cutting', 'VBG'),
 ('out', 'RP'),
 ('the', 'DT'),
 ('traditional', 'JJ'),
 ('financial', 'JJ'),
 ('middleman', 'NN'),
 ('and', 'CC'),
 ('plugging', 'NN'),
 ('in', 'IN'),
 ('software', 'NN'),
 ('from', 'IN'),

In [25]:
sc.stop()