In [1]:
sc

In [2]:
spark

### 1. Create Spark Session

In [12]:
# because a cluster is already running
sc.stop() # we wont run it any other system

In [13]:
from pyspark import SparkConf, SparkContext

# here 2 is number of cores of cpu
# setMaster is used to set spark conext manager which is local[cpu cores]
config = SparkConf().setMaster('local[2]').setAppName("RDDSession")
sc = SparkContext(conf=config)

In [14]:
sc

In [15]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SQLSession").getOrCreate()

In [16]:
spark

### 2. Create RDD
- create RDD using sc.parallelize() using collections like numpy array lists, tuples etc. 

In [19]:
# sc.parallelize created spark rdd object
rdd1 = sc.parallelize([10, 20, 30, 40, 50 , 60 , 70 , 80 , 90])

In [20]:
type(rdd1)

pyspark.rdd.RDD

In [21]:
rdd1.collect()

[10, 20, 30, 40, 50, 60, 70, 80, 90]

In [25]:
rdd1.take(3) # take number of values want to display

[10, 20, 30]

In [27]:
rdd1.count() # nubmer of values (total count of values present in RDD)

9

In [29]:
rdd1.getNumPartitions() # Number of RDD partitions = num of workers

2

### 3. Narrow Transformation

In [31]:
# Map Transformations applies to all individual values
rdd2 = rdd1.map(lambda val: val * 3)

In [32]:
# call action() method - collect(), take()
rdd2.collect()

[30, 60, 90, 120, 150, 180, 210, 240, 270]

In [33]:
rdd3 = rdd1.map(lambda val: str(val) + " Number")

In [34]:
rdd3.collect()

['10 Number',
 '20 Number',
 '30 Number',
 '40 Number',
 '50 Number',
 '60 Number',
 '70 Number',
 '80 Number',
 '90 Number']

In [43]:
# narrow transformation that which applies condition to each element individually.
# all transformation methods like map and filter expect a function as an input.
rdd4 = rdd1.filter(lambda val: val < 70)
rdd4.collect()

[10, 20, 30, 40, 50, 60]

In [46]:
# create RDD using range
rdd5 = sc.parallelize(range(0, 30))
print(rdd5.take(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [51]:
rdd6 = rdd5.filter(lambda x: x % 2 == 0).map(lambda x: str(x) + " is Even")
print(rdd6.collect())

['0 is Even', '2 is Even', '4 is Even', '6 is Even', '8 is Even', '10 is Even', '12 is Even', '14 is Even', '16 is Even', '18 is Even', '20 is Even', '22 is Even', '24 is Even', '26 is Even', '28 is Even']


In [54]:
city_rdd = sc.parallelize([
    "Delhi,Kolkata,Kochi,Vizag,Varkala,Chennai,Bangalore,Pune",
    "Dubai,New York,Berlin,Noida,Bangalore,Vizag",
    "Venice,Dehradun,Munnar,Mumbai,Kochi,Kottayam"])

In [65]:
city_names = city_rdd.map(lambda line: line.split(','))
city_names.collect()

[['Delhi',
  'Kolkata',
  'Kochi',
  'Vizag',
  'Varkala',
  'Chennai',
  'Bangalore',
  'Pune'],
 ['Dubai', 'New York', 'Berlin', 'Noida', 'Bangalore', 'Vizag'],
 ['Venice', 'Dehradun', 'Munnar', 'Mumbai', 'Kochi', 'Kottayam']]

In [66]:
city_names2 = city_rdd.flatMap(lambda line: line.split(','))
city_names2.collect()

['Delhi',
 'Kolkata',
 'Kochi',
 'Vizag',
 'Varkala',
 'Chennai',
 'Bangalore',
 'Pune',
 'Dubai',
 'New York',
 'Berlin',
 'Noida',
 'Bangalore',
 'Vizag',
 'Venice',
 'Dehradun',
 'Munnar',
 'Mumbai',
 'Kochi',
 'Kottayam']

In [73]:
# Union: union of two RDD
rdd7 = rdd1.filter(lambda val: val > 20 and val <= 70)

unionRDD = rdd1.union(rdd7)
unionRDD.collect()

[10, 20, 30, 40, 50, 60, 70, 80, 90, 30, 40, 50, 60, 70]

In [75]:
# create a rdd to select city names starts with b or k
city_names3 = city_names2.filter(lambda val: val[0].upper() == 'B' or val[0].upper() == 'K')
city_names3.collect()

['Kolkata', 'Kochi', 'Bangalore', 'Berlin', 'Bangalore', 'Kochi', 'Kottayam']

In [77]:
city_names3.distinct().collect()

['Kochi', 'Kottayam', 'Kolkata', 'Bangalore', 'Berlin']

In [109]:
# sample(WithReplacement, fraction, seed)
# sample a fraction of data from a RDD with replacement

sampled_rdd = city_names2.sample(False, 0.5, seed=42)
sampled_rdd.collect()

['Delhi',
 'Kolkata',
 'Vizag',
 'Varkala',
 'Pune',
 'Vizag',
 'Venice',
 'Dehradun',
 'Munnar',
 'Mumbai',
 'Kochi']

In [110]:
# with replacement
sampled_rdd = city_names2.sample(True, 0.5, seed=42)
sampled_rdd.collect()

['Chennai', 'New York', 'Berlin', 'Berlin', 'Munnar', 'Kochi']

### Wide Transformation

In [111]:
city_names2.getNumPartitions()

2

In [113]:
pairs = [('Trivandrum', 24), ('Chennai', 30), ('Mumbai', 21), ('Kochi', 12), ('Chennai', 20), ('Trivandrum', 15), ('Delhi', 25), ('Delhi', 12), ('Kottayam', 60)]

In [114]:
pairs_rdd = sc.parallelize(pairs)
pairs_rdd.collect()

[('Trivandrum', 24),
 ('Chennai', 30),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Chennai', 20),
 ('Trivandrum', 15),
 ('Delhi', 25),
 ('Delhi', 12),
 ('Kottayam', 60)]

In [115]:
pairs_rdd.getNumPartitions()

2

In [117]:
# repartition will be used to create more number of partitions to the given RDD
new_pair_rdd = pairs_rdd.repartition(4)
new_pair_rdd.getNumPartitions()

4

In [120]:
# coalease() reduce the number of partitions from default
# existing RDD partitions into less number of partitions
new_pair_rdd1 = new_pair_rdd.coalesce(2)
new_pair_rdd1.getNumPartitions()

2

In [140]:
# reduceByKey(): aggregate each value of RDD
# it includes shuffling of values to ensure that all values with same keys are bought together
pairs_rdd_reduced = pairs_rdd.reduceByKey(lambda x, y: x + y)
pairs_rdd_reduced.collect()

[('Chennai', 50),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Kottayam', 60),
 ('Trivandrum', 39),
 ('Delhi', 37)]

In [126]:
orders = [('Fries', 2), ('Nuggets', 3), ('PaniPuri', 5), ('Chole Bhature', 4), ('Dosa', 1), ('Dosa', 1), ('Fries', 2), ('PaniPuri', 5), ('Vada', 2), ('Chole Bhature', 2)]
orders_rdd = sc.parallelize(orders)
orders_rdd.reduceByKey(lambda x, y: x + y).collect()

[('Nuggets', 3),
 ('Chole Bhature', 6),
 ('Dosa', 2),
 ('Vada', 2),
 ('Fries', 4),
 ('PaniPuri', 10)]

In [127]:
# Intersection: method to scan all rdd and find common among all partitions of rdd
rdd8 = sc.parallelize(['Delhi', 'Kolkata', 'Kochi', 'Vizag', 'Varkala', 'Chennai'])
rdd9 = sc.parallelize(['Banglore', 'Pune', 'Delhi', 'Vizag', 'Mumbai', 'Trivandrum'])

rdd8.intersection(rdd9).collect()

['Delhi', 'Vizag']

In [136]:
# we can pass number of partitions as an argument to the sc.parallelize function
# sc.parallelize(collection, number_of_partitions)
rdd10 = sc.parallelize(range(1, 50), 4)
print("Partitions: ", rdd10.getNumPartitions())
print(rdd10.collect())

Partitions:  4
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [139]:
def partition_func(iterator):
    return [sum(iterator)]

partitioned_rdd = rdd10.mapPartitions(partition_func)  # sum of each partition
partitioned_rdd.collect()

[78, 222, 366, 559]

In [141]:
pairs = [('Trivandrum', 24), ('Chennai', 30), ('Mumbai', 21), ('Kochi', 12), ('Chennai', 20), ('Trivandrum', 15), ('Delhi', 25), ('Delhi', 12), ('Kottayam', 60)]
pairs_rdd = sc.parallelize(pairs)

In [148]:
# group by key groups the data by id which invloves shuffling of values for each key
pairs_rdd.groupByKey().collect()

[('Chennai', <pyspark.resultiterable.ResultIterable at 0x7fc9442bd0b8>),
 ('Mumbai', <pyspark.resultiterable.ResultIterable at 0x7fc9442bd438>),
 ('Kochi', <pyspark.resultiterable.ResultIterable at 0x7fc9442bd2e8>),
 ('Kottayam', <pyspark.resultiterable.ResultIterable at 0x7fc9442bd5f8>),
 ('Trivandrum', <pyspark.resultiterable.ResultIterable at 0x7fc9442bd1d0>),
 ('Delhi', <pyspark.resultiterable.ResultIterable at 0x7fc9448a8898>)]

In [149]:
grouped_rdd = pairs_rdd.groupByKey().mapValues(tuple)
grouped_rdd.collect()

[('Chennai', (30, 20)),
 ('Mumbai', (21,)),
 ('Kochi', (12,)),
 ('Kottayam', (60,)),
 ('Trivandrum', (24, 15)),
 ('Delhi', (25, 12))]

In [155]:
# len function returns count of city names
pairs_rdd.groupByKey().mapValues(len).collect()

[('Chennai', 2),
 ('Mumbai', 1),
 ('Kochi', 1),
 ('Kottayam', 1),
 ('Trivandrum', 2),
 ('Delhi', 2)]

In [157]:
# we can pass any array reducing function
pairs_rdd.groupByKey().mapValues(sum).collect()

[('Chennai', 50),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Kottayam', 60),
 ('Trivandrum', 39),
 ('Delhi', 37)]

In [191]:
# join() joins to rdds based on their keys which requires shuffling of the data
# to ensure all keys are correctly paired
# default it performs inner join
order_one = sc.parallelize([('Fries', 2), ('Fries', 5), ('Nuggets', 3), ('PaniPuri', 5), ('Dosa', 1), ('Vada', 2)])
order_two = sc.parallelize([('Idly', 2), ('Juice', 3), ('Juice', 6), ('PaniPuri', 5), ('Dosa', 1), ('BIriyani', 2)])

joined_rdd = order_one.join(order_two)
joined_rdd.collect()

[('Dosa', (1, 1)), ('PaniPuri', (5, 5))]

In [192]:
order_one.leftOuterJoin(order_two).collect()

[('Dosa', (1, 1)),
 ('Vada', (2, None)),
 ('PaniPuri', (5, 5)),
 ('Nuggets', (3, None)),
 ('Fries', (2, None)),
 ('Fries', (5, None))]

In [193]:
order_one.rightOuterJoin(order_two).collect()

[('Dosa', (1, 1)),
 ('Idly', (None, 2)),
 ('PaniPuri', (5, 5)),
 ('BIriyani', (None, 2)),
 ('Juice', (None, 3)),
 ('Juice', (None, 6))]

In [194]:
order_one.fullOuterJoin(order_two).collect()

[('Dosa', (1, 1)),
 ('Vada', (2, None)),
 ('Idly', (None, 2)),
 ('PaniPuri', (5, 5)),
 ('BIriyani', (None, 2)),
 ('Nuggets', (3, None)),
 ('Fries', (2, None)),
 ('Fries', (5, None)),
 ('Juice', (None, 3)),
 ('Juice', (None, 6))]

In [195]:
# co groups
# groups two rdds by keys, resulting into new rdd with keys and values grouped from both rdd
# reduction happens in co group

result = order_one.cogroup(order_two).mapValues(tuple).collect()
for key, (value1, value2) in result:
    print(key, list(value1), list(value2))

Dosa [1] [1]
Vada [2] []
Idly [] [2]
PaniPuri [5] [5]
BIriyani [] [2]
Nuggets [3] []
Fries [2, 5] []
Juice [] [3, 6]


### Create RDD using File Name

In [234]:
# load localFS use 'file://path/to/file/name'
emp_rdd = sc.textFile('file:///home/hadoop/Downloads/Employee_Advance.csv')
emp_rdd1 = emp_rdd.map(lambda x: x.split(','))
print(emp_rdd1.first())

['1', 'Binnie', 'Hovee', 'bhovee0@hubpages.com', 'Genderqueer', 'Support', '1419125', 'Lithuanian', 'Paris 11', 'France']


#### 1. show all employee working in department = "Business Development"

In [235]:
emp_bd = emp_rdd1.filter(lambda row: row[5] == 'Business Development')
print(emp_bd.take(3))

[['3', 'Siward', 'Struijs', 'sstruijs2@wikia.com', 'Male', 'Business Development', '432657', 'Armenian', 'London', 'United States'], ['6', 'Tova', 'Manclark', 'tmanclark5@army.mil', 'Female', 'Business Development', '659039', 'Hiri Motu', 'Le Mans', 'France'], ['9', 'Warde', 'Stenett', 'wstenett8@altervista.org', 'Genderqueer', 'Business Development', '807769', 'Moldovan', 'Bastia', 'France']]


#### 2. find average income of each department

In [281]:
def getavg(iterable):
    return sum(iterable) / len(iterable)

income = emp_rdd1.map(lambda x: (x[5], float(x[6])))

grouped = income.groupByKey().mapValues(list).collect()

for department, incomes in grouped:
    print(department, sum(incomes) / len(incomes))

Accounting 814474.3333333334
Human Resources 772824.3829787234
Legal 754482.0
Services 765440.4444444445
Sales 745501.5921052631
Product Management 696304.2985074627
Support 825695.0947368421
Business Development 783081.2710280374
Research and Development 789849.1486486486
Training 757752.380952381
Marketing 744170.7831325302
Engineering 757094.5466666666


In [285]:
income = emp_rdd1.map(lambda x: (x[5], float(x[6])))

grouped = income.groupByKey().mapValues(list).map(lambda x: x[0] + ": " + str(sum(x[1]) / len(x[1])))

grouped.collect()

['Accounting: 814474.3333333334',
 'Human Resources: 772824.3829787234',
 'Legal: 754482.0',
 'Services: 765440.4444444445',
 'Sales: 745501.5921052631',
 'Product Management: 696304.2985074627',
 'Support: 825695.0947368421',
 'Business Development: 783081.2710280374',
 'Research and Development: 789849.1486486486',
 'Training: 757752.380952381',
 'Marketing: 744170.7831325302',
 'Engineering: 757094.5466666666']

In [300]:
# alternative approach
counts = emp_rdd1.map(lambda x: (x[5], 1))
emp_income1 = income.reduceByKey(lambda x, y: x + y)
emp_income2 = counts.reduceByKey(lambda x, y: x + y)

emp_income1.join(emp_income2).mapValues(lambda x: x[0]/x[1]).collect()

[('Accounting', 814474.3333333334),
 ('Human Resources', 772824.3829787234),
 ('Legal', 754482.0),
 ('Services', 765440.4444444445),
 ('Sales', 745501.5921052631),
 ('Product Management', 696304.2985074627),
 ('Support', 825695.0947368421),
 ('Business Development', 783081.2710280374),
 ('Research and Development', 789849.1486486486),
 ('Training', 757752.380952381),
 ('Marketing', 744170.7831325302),
 ('Engineering', 757094.5466666666)]

#### 4. Count the number of employees working in each department

In [264]:
emps = emp_rdd1.map(lambda x: (x[5], 1))
grouped = emps.groupByKey().mapValues(len)
grouped.collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

In [262]:
# another way
emps = emp_rdd1.map(lambda x: (x[5], 1))
grouped = emps.reduceByKey(lambda x, y: x + y)
grouped.collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

#### 5. list all unique job titles in dataset

In [286]:
job_titles = emp_rdd1.map(lambda x: x[5])
print(job_titles.distinct().collect())

['Accounting', 'Human Resources', 'Legal', 'Services', 'Sales', 'Product Management', 'Support', 'Business Development', 'Research and Development', 'Training', 'Marketing', 'Engineering']


#### 6. Count all employees whose first letter should starts with A

In [297]:
emps = emp_rdd1.filter(lambda x: x[1][0].lower() == 'a')
emps.count()

81

#### 7. Word count example using spark RDD

In [302]:
!hdfs dfs -ls /

Found 3 items
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:25 /hbase
drwxrwxrwx   - hadoop supergroup          0 2022-11-21 15:12 /tmp
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:11 /user


In [303]:
!hdfs dfs -mkdir /wordcount

In [305]:
!hdfs dfs -put /home/hadoop/Downloads/Harry_Potter_and_the_Deathly_Hallows.txt /wordcount

In [306]:
hp_rdd = sc.textFile('/wordcount/Harry_Potter_and_the_Deathly_Hallows.txt')

In [316]:
rdd_word1 = hp_rdd.flatMap(lambda line: line.split(" "))
rdd_word2 = rdd_word1.map(lambda word: (word.lower(),1))
rdd_word3 = rdd_word2.reduceByKey(lambda x , y: x + y)
rdd_word4 = rdd_word3.filter(lambda x: x[1] > 10).sortBy(lambda x: x[1], ascending=False)

rdd_word4.collect()

[('the', 10280),
 ('and', 5283),
 ('to', 4843),
 ('of', 4121),
 ('he', 3845),
 ('a', 3523),
 ('was', 2681),
 ('his', 2602),
 ('in', 2159),
 ('had', 1990),
 ('harry', 1867),
 ('it', 1859),
 ('that', 1813),
 ('said', 1676),
 ('you', 1485),
 ('as', 1409),
 ('at', 1401),
 ('i', 1275),
 ('with', 1127),
 ('they', 1091),
 ('not', 1083),
 ('on', 984),
 ('for', 980),
 ('but', 975),
 ('she', 918),
 ('her', 864),
 ('—', 841),
 ('from', 836),
 ('have', 784),
 ('…', 756),
 ('were', 738),
 ('be', 737),
 ('him', 666),
 ('into', 655),
 ('hermione', 653),
 ('out', 650),
 ('could', 627),
 ('all', 583),
 ('ron', 578),
 ('what', 571),
 ('been', 567),
 ('“i', 535),
 ('up', 510),
 ('we', 498),
 ('there', 493),
 ('who', 490),
 ('is', 484),
 ('—”', 472),
 ('their', 465),
 ('did', 451),
 ('would', 443),
 ('them', 442),
 ('like', 424),
 ('if', 424),
 ('looked', 419),
 ('by', 419),
 ('harry,', 414),
 ('back', 409),
 ('so', 403),
 ('over', 383),
 ('this', 382),
 ('an', 381),
 ('then', 365),
 ('one', 364),
 ('wand

#### 8. Count the number of employees working each city and sorrt by employee count in descending order

In [323]:
emp_map = emp_rdd1.map(lambda x: (x[8], 1))
emp_count = emp_map.reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[1], ascending=False)
emp_count.collect()

[('Washington', 13),
 ('Berlin', 13),
 ('Lyon', 13),
 ('Cincinnati', 9),
 ('Dallas', 9),
 ('Cergy-Pontoise', 9),
 ('Sacramento', 8),
 ('Strasbourg', 8),
 ('Lille', 7),
 ('Dijon', 7),
 ('Los Angeles', 7),
 ('Pittsburgh', 7),
 ('Bordeaux', 7),
 ('San Jose', 7),
 ('Denver', 7),
 ('El Paso', 7),
 ('Montpellier', 7),
 ('Chicago', 6),
 ('Tulsa', 6),
 ('Quimper', 6),
 ('Marseille', 6),
 ('New York City', 6),
 ('Pau', 6),
 ('Poitiers', 6),
 ('Montgomery', 6),
 ('Le Mans', 5),
 ('Avignon', 5),
 ('Seattle', 5),
 ('Marne-la-Vallée', 5),
 ('Limoges', 5),
 ('Oakland', 5),
 ('Fort Worth', 5),
 ('Houston', 5),
 ('San Diego', 5),
 ('Philadelphia', 5),
 ('Paris La Défense', 5),
 ('Rungis', 5),
 ('Perpignan', 5),
 ('Rouen', 5),
 ('Caen', 5),
 ('Minneapolis', 5),
 ('Nantes', 5),
 ('Orléans', 5),
 ('Nürnberg', 4),
 ('Mobile', 4),
 ('Aix-en-Provence', 4),
 ('Angoulême', 4),
 ('Charlotte', 4),
 ('Düsseldorf', 4),
 ('Topeka', 4),
 ('Kansas City', 4),
 ('Roissy Charles-de-Gaulle', 4),
 ('Saint Petersburg', 4)

### Saving Data using RDD

In [324]:
# Saving RDD data to local file system
rdd_word4.saveAsTextFile('file:///home/hadoop/Downloads/output/')

In [325]:
# Saving RDD data to hdfs
rdd_word4.saveAsTextFile('/wordcount/output')

In [327]:
!hdfs dfs -cat /wordcount/output/*

('the', 10280)
('and', 5283)
('to', 4843)
('of', 4121)
('he', 3845)
('a', 3523)
('was', 2681)
('his', 2602)
('in', 2159)
('had', 1990)
('harry', 1867)
('it', 1859)
('that', 1813)
('said', 1676)
('you', 1485)
('as', 1409)
('at', 1401)
('i', 1275)
('with', 1127)
('they', 1091)
('not', 1083)
('on', 984)
('for', 980)
('but', 975)
('she', 918)
('her', 864)
('—', 841)
('from', 836)
('have', 784)
('…', 756)
('were', 738)
('be', 737)
('him', 666)
('into', 655)
('hermione', 653)
('out', 650)
('could', 627)
('all', 583)
('ron', 578)
('what', 571)
('been', 567)
('“i', 535)
('up', 510)
('we', 498)
('there', 493)
('who', 490)
('is', 484)
('—”', 472)
('their', 465)
('did', 451)
('would', 443)
('them', 442)
('like', 424)
('if', 424)
('looked', 419)
('by', 419)
('harry,', 414)
('back', 409)
('so', 403)
('over', 383)
('this', 382)
('an', 381)
('then', 365)
('one', 364)
('wand', 352)
('do', 342)
('about', 339)
('know', 338)
('my', 334)

('apart', 24)
('pushed', 24)
('marble', 24)
('change', 24)
('you?', 24)
('finger', 24)
('mention', 24)
('together,', 24)
('ariana', 24)
('book', 24)
('may', 24)
('safe', 24)
('asked,', 24)
('mad-eye', 24)
('fred,', 24)
('thick', 24)
('harry!”', 24)
('ceiling', 24)
('“…', 24)
('lived', 24)
('darkness.', 24)
('goblin,', 24)
('silent', 24)
('directly', 24)
('us,', 24)
('once.', 24)
('aunt', 24)
('met', 24)
('do.', 24)
('miss', 24)
('all.', 24)
('skeeter', 24)
('telling', 24)
('loud', 24)
('pulling', 24)
('dementors', 24)
('sorry', 24)
('odd', 24)
('scream', 24)
('return', 24)
('mum', 24)
('meant', 24)
('around,', 24)
('snitch', 24)
('below', 23)
('means', 23)
('speak', 23)
('nor', 23)
('become', 23)
('cut', 23)
('although', 23)
('him:', 23)
('looks', 23)
('ready', 23)
('sorry,', 23)
('covered', 23)
('asked.', 23)
('hollow', 23)
('screamed', 23)
('terrible', 23)
('table,', 23)
('robes,', 23)
('happy', 23)
('bellatrix’s', 23)
(

#### RDD Statistical Functions

In [None]:
rdd_city = emp_rdd1.map(lambda row: (row[5], int(row[6])))

In [333]:
rdd_city.min(lambda x: x[1])

('Training', 12727)

In [334]:
rdd_city.max(lambda x: x[1])

('Support', 1496924)

In [341]:
rdd11 = sc.parallelize(range(1, 100)).sample(False, 0.3, seed=2)
print(rdd11.collect())
print("min: ", rdd11.min())
print("max: ", rdd11.max())
print("variance: ", rdd11.variance())
print("standard deviation: ", rdd11.stdev())

[16, 17, 18, 19, 20, 39, 40, 55, 61, 65, 77, 78, 79, 95]
min:  16
max:  95
variance:  719.2500000000002
standard deviation:  26.818836663807776


In [362]:
incomes_rdd = rdd_city.map(lambda row: row[1])

In [363]:
print(f"Standard Deviation:\t{incomes_rdd.stdev() : .2f}")

Standard Deviation:	 424405.95


In [364]:
print(f"Variance:\t{incomes_rdd.variance() : .2f}")

Variance:	 180120411989.86


In [365]:
print(f"Median:\t{incomes_rdd.mean() : .2f}")

Median:	 770051.42


In [372]:
print(f"Sum:{incomes_rdd.sum() : .2f}")

Sum: 770051419.00


In [368]:
print(f"Minimum: {incomes_rdd.min() : .2f}")
print(f"Maximum: {incomes_rdd.max() : .2f}")

Minimum:  12727.00
Maximum:  1496924.00


In [371]:
print(f"Stats: {incomes_rdd.stats()}")

Stats: (count: 1000, mean: 770051.4190000002, stdev: 424405.95187845733, max: 1496924, min: 12727)


In [376]:
emp_rdd.cache()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[535] at textFile at NativeMethodAccessorImpl.java:0

In [395]:
from pyspark.storagelevel import StorageLevel

# persist the RDD in memory with default storage level (MEMORY_ONLY)
emp_rdd.unpersist()
emp_rdd = emp_rdd.persist(StorageLevel.MEMORY_ONLY)

In [397]:
emp_rdd.unpersist()
emp_rdd = emp_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

In [398]:
emp_rdd1 = emp_rdd.map(lambda x: x.split(','))
emp_map = emp_rdd1.map(lambda x: (x[8], 1))
emp_map.reduceByKey(lambda x, y: x + y).collect()

[('Paris 11', 2),
 ('Stockton', 2),
 ('Toulouse', 3),
 ('Le Mans', 5),
 ('Bastia', 2),
 ('Nürnberg', 4),
 ('Lille', 7),
 ('Clermont-Ferrand', 2),
 ('Mobile', 4),
 ('Chicago', 6),
 ('Lees Summit', 3),
 ('Avignon', 5),
 ('Taverny', 1),
 ('Columbus', 3),
 ('Aix-en-Provence', 4),
 ('Plérin', 1),
 ('Cedar Rapids', 1),
 ('Landivisiau', 2),
 ('Saint-Pierre-des-Corps', 2),
 ('Villeneuve-lès-Avignon', 1),
 ('Decatur', 1),
 ('Bagnères-de-Bigorre', 1),
 ('Bloomington', 1),
 ('Seattle', 5),
 ('Chantepie', 1),
 ('La Gacilly', 1),
 ('Wissous', 2),
 ('Tulsa', 6),
 ('Sacramento', 8),
 ('Reims', 2),
 ('Marne-la-Vallée', 5),
 ('Cincinnati', 9),
 ('Bourges', 2),
 ('Amiens', 2),
 ('Fécamp', 2),
 ('Port-de-Bouc', 1),
 ('Beaufort', 1),
 ('Saint-Ouen', 2),
 ('Angoulême', 4),
 ('Strasbourg', 8),
 ('Béziers', 1),
 ('Grenoble', 1),
 ('Orange', 1),
 ('Charlotte', 4),
 ('Champagnole', 1),
 ('Paris 13', 3),
 ('Düsseldorf', 4),
 ('Topeka', 4),
 ('Quimper', 6),
 ('Brooklyn', 2),
 ('Kansas City', 4),
 ('Limoges', 5),