In [1]:
sc

In [2]:
spark

#### 1. Create New Spark Session.

In [5]:
sc.stop()

In [6]:
from pyspark import SparkConf, SparkContext
# setMaster() - Set Spark Content Manager which is local[cpu_cores] 
config = SparkConf().setMaster("local[2]").setAppName("RDDSession")
sc = SparkContext(conf=config)

In [7]:
sc

In [11]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SQlSession").getOrCreate()

In [12]:
spark

#### 2. Create RDD
    * Create RDD using sc.paralleilize() using collections like numpy array, lists, tuples tc,

In [13]:
rdd1 = sc.parallelize([10, 20, 30, 40, 50, 60, 70, 80, 90])

In [14]:
# sc.parllelize() created spark rdd object
print(rdd1)

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195


In [15]:
type(rdd1)

pyspark.rdd.RDD

In [17]:
print(rdd1.collect())

[10, 20, 30, 40, 50, 60, 70, 80, 90]


In [20]:
print(rdd1.take(5))    # take(num of values as per the input count)

[10, 20, 30, 40, 50]


In [22]:
print("Total Count of RDD: ", rdd1.count())  # Return total count of values present in RDD

Total Count of RDD:  9


In [23]:
rdd1.getNumPartitions()   # Number of RDD partitions = Num of Workers

2

#### 3. Narrow Transformation

In [24]:
# Map Transformation - Applies to all individual values
rdd2 = rdd1.map(lambda val : val * 3)

In [25]:
# call action() method - collect() or take()
rdd2.collect()

[30, 60, 90, 120, 150, 180, 210, 240, 270]

In [27]:
rdd3 = rdd1.map(lambda val : str(val) + "Number")
rdd3.collect()

['10Number',
 '20Number',
 '30Number',
 '40Number',
 '50Number',
 '60Number',
 '70Number',
 '80Number',
 '90Number']

In [28]:
# Filter method - Narrow Transformation which applies condition to each element of RDD
# ALl transformation method like map(), filter() expect a function as an Input.
rdd4 = rdd1.filter(lambda x : x <= 70)
rdd4.collect()

[10, 20, 30, 40, 50, 60, 70]

In [30]:
# Create an RDD using range()
rdd5 = sc.parallelize(range(1,30))
print(rdd5.take(10))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [33]:
rdd6 = rdd5.filter(lambda x : x % 2 == 0).map(lambda x : str(x) + 'Even')
print(rdd6.collect())

['2Even', '4Even', '6Even', '8Even', '10Even', '12Even', '14Even', '16Even', '18Even', '20Even', '22Even', '24Even', '26Even', '28Even']


In [38]:
city_rdd = sc.parallelize(["Delhi,Kolkata,Kochi,Vizag,Varkala,Chennai,Bangalore,Pune",
                          "Dubai,New York,Berlin,Noida,Bangalore,Vizag",
                          "Venice,Dehradun,Munnar,Mumbai,Kochi,Kottayam",
                          "London,Paris,Melbourne,Bali,Abu Dhabi"])
city_rdd.collect()

['Delhi,Kolkata,Kochi,Vizag,Varkala,Chennai,Bangalore,Pune',
 'Dubai,New York,Berlin,Noida,Bangalore,Vizag',
 'Venice,Dehradun,Munnar,Mumbai,Kochi,Kottayam',
 'London,Paris,Melbourne,Bali,Abu Dhabi']

In [40]:
city_names = city_rdd.map(lambda val : val.split(','))
print(city_names.collect())

[['Delhi', 'Kolkata', 'Kochi', 'Vizag', 'Varkala', 'Chennai', 'Bangalore', 'Pune'], ['Dubai', 'New York', 'Berlin', 'Noida', 'Bangalore', 'Vizag'], ['Venice', 'Dehradun', 'Munnar', 'Mumbai', 'Kochi', 'Kottayam'], ['London', 'Paris', 'Melbourne', 'Bali', 'Abu Dhabi']]


In [44]:
city_names_2 = city_names.flatMap(lambda val : val)
city_names_2.collect()

['Delhi',
 'Kolkata',
 'Kochi',
 'Vizag',
 'Varkala',
 'Chennai',
 'Bangalore',
 'Pune',
 'Dubai',
 'New York',
 'Berlin',
 'Noida',
 'Bangalore',
 'Vizag',
 'Venice',
 'Dehradun',
 'Munnar',
 'Mumbai',
 'Kochi',
 'Kottayam',
 'London',
 'Paris',
 'Melbourne',
 'Bali',
 'Abu Dhabi']

In [45]:
# Union - Union of Two RDDs
rdd7 = rdd1.filter(lambda val : val > 20 and val <= 70)
unionRDD = rdd1.union(rdd7)
unionRDD.collect()

[10, 20, 30, 40, 50, 60, 70, 80, 90, 30, 40, 50, 60, 70]

#### Task: Create RDD using city_names and generate City Name starting with 'B' or 'K'

In [46]:
city_names_3 = city_names_2.filter(lambda val : val[0].lower() == 'k' or
                                   val[0].lower() == 'b')

In [47]:
city_names_3.collect()

['Kolkata',
 'Kochi',
 'Bangalore',
 'Berlin',
 'Bangalore',
 'Kochi',
 'Kottayam',
 'Bali']

In [52]:
city_names_3 = city_names_2.filter(lambda val : val.startswith('K') or val.startswith('B'))

In [53]:
city_names_3.collect()

['Kolkata',
 'Kochi',
 'Bangalore',
 'Berlin',
 'Bangalore',
 'Kochi',
 'Kottayam',
 'Bali']

In [56]:
# distinct = Returns new RDD with distinct elements
distinct_city = city_names_2.distinct()
distinct_city.collect()

['Kochi',
 'Chennai',
 'Dubai',
 'Dehradun',
 'Mumbai',
 'Kottayam',
 'Bali',
 'Delhi',
 'Kolkata',
 'Vizag',
 'Varkala',
 'Bangalore',
 'Pune',
 'New York',
 'Berlin',
 'Noida',
 'Venice',
 'Munnar',
 'London',
 'Paris',
 'Melbourne',
 'Abu Dhabi']

In [68]:
# sample(withReplacement, fraction, seed)
# samples a fraction of data from a RDD with replacement
sampled_rdd = city_names_2.sample(False, 0.5, seed = 42)
sampled_rdd.collect()

['Delhi',
 'Kolkata',
 'Vizag',
 'Varkala',
 'Pune',
 'Berlin',
 'Vizag',
 'Kottayam',
 'London',
 'Paris',
 'Melbourne',
 'Bali',
 'Abu Dhabi']

In [69]:
# with Replacement
sampled_rdd = city_names_2.sample(True, 0.5, seed = 42)
sampled_rdd.collect()

['Chennai',
 'Dubai',
 'New York',
 'Bangalore',
 'Dehradun',
 'Munnar',
 'Munnar',
 'Melbourne',
 'Abu Dhabi']

In [88]:
# sc.parallelize(collections, number of partitions)
rdd_10 = sc.parallelize(range(1,50), 4)
rdd_10.getNumPartitions()

4

In [95]:
print(rdd_10.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [89]:
def partition_func(iterator):
    return [sum(iterator)]

In [93]:
partitioned_rdd = rdd_10.mapPartitions(partition_func)
partitioned_rdd.collect()

[78, 222, 366, 559]

#### Wide Transformation

In [70]:
city_names_3.getNumPartitions()

2

In [71]:
pairs = [('Trivandrum',24),('Chennai',30),('Mumbai',21),('Kochi',12),('Chennai',20),
         ('Trivandrum',15),('Delhi',25),('Delhi',12),('Kottayam', 60)]
pairs_rdd = sc.parallelize(pairs)
pairs_rdd.collect()

[('Trivandrum', 24),
 ('Chennai', 30),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Chennai', 20),
 ('Trivandrum', 15),
 ('Delhi', 25),
 ('Delhi', 12),
 ('Kottayam', 60)]

In [73]:
pairs_rdd.getNumPartitions()

2

In [74]:
# repartition() - To create more number of partition
new_pair_rdd = pairs_rdd.repartition(4)

In [76]:
new_pair_rdd.getNumPartitions()

4

In [77]:
# coalesce() - Combines existing RDD Partitions into less number of Partitions
new_pair_rdd1 = new_pair_rdd.coalesce(2)
new_pair_rdd1.getNumPartitions()

2

In [81]:
# ReduceByKey() - Aggregate each value of RDD by Keys. It does involves shuffling of values to ensure that all 
# values with same key are brought together.
pairs_rdd2 = pairs_rdd.reduceByKey(lambda x, y : x + y)
pairs_rdd2.collect()

[('Chennai', 50),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Kottayam', 60),
 ('Trivandrum', 39),
 ('Delhi', 37)]

In [83]:
orders = [('Fries',2),('Nuggets',3),('PaniPuri',5),('Chole Bhature',2),('Dosa',1),
         ('Dosa',1),('Fries',2),('PaniPuri',5),('Vada',2),('Chole Bhature',2)]
orders_rdd = sc.parallelize(orders)
orders_rdd.reduceByKey(lambda x, y : x + y).collect()

[('Nuggets', 3),
 ('Chole Bhature', 4),
 ('Dosa', 2),
 ('Vada', 2),
 ('Fries', 4),
 ('PaniPuri', 10)]

In [84]:
# Intersection : Method to scan all rdd and find common among all partitions of RDD
rdd8 = sc.parallelize(['Delhi','Kolkata','Kochi','Vizag','Varkala','Chennai'])
rdd9 = sc.parallelize(['Bangalore','Pune','Delhi','Vizag','Mumbai','Trivandrum'])

In [86]:
rdd8.intersection(rdd9).collect()

['Delhi', 'Vizag']

In [96]:
pairs = [('Trivandrum',24),('Chennai',30),('Mumbai',21),('Kochi',12),('Chennai',20),
         ('Trivandrum',15),('Delhi',25),('Delhi',12),('Kottayam', 60)]
pairs_rdd = sc.parallelize(pairs)

In [97]:
pairs_rdd.groupByKey().collect()

[('Chennai', <pyspark.resultiterable.ResultIterable at 0x7f0a9b6d0cc0>),
 ('Mumbai', <pyspark.resultiterable.ResultIterable at 0x7f0a9b6d0d68>),
 ('Kochi', <pyspark.resultiterable.ResultIterable at 0x7f0a9b6d0208>),
 ('Kottayam', <pyspark.resultiterable.ResultIterable at 0x7f0a9b6d0be0>),
 ('Trivandrum', <pyspark.resultiterable.ResultIterable at 0x7f0a9b6d0ba8>),
 ('Delhi', <pyspark.resultiterable.ResultIterable at 0x7f0a9b6d0cf8>)]

In [101]:
# groupByKey() - Groups data by key, which involves shuffling all values for each key.
grouped_rdd = pairs_rdd.groupByKey().mapValues(tuple)
grouped_rdd.collect()

[('Chennai', (30, 20)),
 ('Mumbai', (21,)),
 ('Kochi', (12,)),
 ('Kottayam', (60,)),
 ('Trivandrum', (24, 15)),
 ('Delhi', (25, 12))]

In [105]:
# len() function returns count of city names.
pairs_rdd.groupByKey().mapValues(len).collect()

[('Chennai', 2),
 ('Mumbai', 1),
 ('Kochi', 1),
 ('Kottayam', 1),
 ('Trivandrum', 2),
 ('Delhi', 2)]

In [125]:
# Join() - Joins two RDD based on their keys which requires shuffling the data to ensure all keys are correctly
# paired
order1 = sc.parallelize([('Fries',2),('Nuggets',3),('PaniPuri',5),('Dosa',1),('Vada',2),('Fries',2),('PaniPuri',5)])
order2 = sc.parallelize([('Juice',3),('Idly',3),('PaniPuri',6),('Dosa',4),('Sweets',5),('Biryani',1)])

In [110]:
# Join will scan all partitioned rdd to find common keys
joined_rdd = order1.join(order2)
joined_rdd.collect()

[('Dosa', (1, 4)), ('PaniPuri', (5, 6))]

In [111]:
order1.leftOuterJoin(order2).collect()

[('Dosa', (1, 4)),
 ('Vada', (2, None)),
 ('PaniPuri', (5, 6)),
 ('Nuggets', (3, None)),
 ('Fries', (2, None))]

In [112]:
order1.rightOuterJoin(order2).collect()

[('Dosa', (1, 4)),
 ('Idly', (None, 3)),
 ('PaniPuri', (5, 6)),
 ('Biryani', (None, 1)),
 ('Juice', (None, 3)),
 ('Sweets', (None, 5))]

In [129]:
order1.fullOuterJoin(order2).collect()

[('Dosa', (1, 4)),
 ('Vada', (2, None)),
 ('Idly', (None, 3)),
 ('PaniPuri', (5, 6)),
 ('PaniPuri', (5, 6)),
 ('Nuggets', (3, None)),
 ('Biryani', (None, 1)),
 ('Fries', (2, None)),
 ('Fries', (2, None)),
 ('Juice', (None, 3)),
 ('Sweets', (None, 5))]

In [126]:
# Cogroup: Groups data from two rdd by key resulting into new RDD with keys and values grouped from both RDD.
results = order1.cogroup(order2).collect()
results

[('Dosa',
  (<pyspark.resultiterable.ResultIterable at 0x7f0a9b937390>,
   <pyspark.resultiterable.ResultIterable at 0x7f0a9bdfa0b8>)),
 ('Vada',
  (<pyspark.resultiterable.ResultIterable at 0x7f0a9bdfa3c8>,
   <pyspark.resultiterable.ResultIterable at 0x7f0a9bdfab38>)),
 ('Idly',
  (<pyspark.resultiterable.ResultIterable at 0x7f0a9b7a6438>,
   <pyspark.resultiterable.ResultIterable at 0x7f0a9b7a62e8>)),
 ('PaniPuri',
  (<pyspark.resultiterable.ResultIterable at 0x7f0a9bdfa6d8>,
   <pyspark.resultiterable.ResultIterable at 0x7f0a9b7a67b8>)),
 ('Nuggets',
  (<pyspark.resultiterable.ResultIterable at 0x7f0a9bdfa320>,
   <pyspark.resultiterable.ResultIterable at 0x7f0a9b7bdbe0>)),
 ('Biryani',
  (<pyspark.resultiterable.ResultIterable at 0x7f0a9b7bde80>,
   <pyspark.resultiterable.ResultIterable at 0x7f0a9b7bd9e8>)),
 ('Fries',
  (<pyspark.resultiterable.ResultIterable at 0x7f0a9b7bdcc0>,
   <pyspark.resultiterable.ResultIterable at 0x7f0a9b7bdf60>)),
 ('Juice',
  (<pyspark.resultiterable

In [128]:
for key, (values1, values2) in results:
    print(f"{key} : {list(values1)},{list(values2)}")

Dosa : [1],[4]
Vada : [2],[]
Idly : [],[3]
PaniPuri : [5, 5],[6]
Nuggets : [3],[]
Biryani : [],[1]
Fries : [2, 2],[]
Juice : [],[3]
Sweets : [],[5]


#### RDD using FileName()

In [131]:
# load localFS use "file://path_to_filename"
emp_RDD = sc.textFile("file:///home/hadoop/Downloads/Employee_Advance.csv")
emp_RDD.take(6)

['1,Binnie,Hovee,bhovee0@hubpages.com,Genderqueer,Support,1419125,Lithuanian,Paris 11,France',
 '2,Devondra,Rosingdall,drosingdall1@yellowbook.com,Non-binary,Support,1365025,Tsonga,Seminole,United States',
 '3,Siward,Struijs,sstruijs2@wikia.com,Male,Business Development,432657,Armenian,London,United States',
 '4,Jaime,Camoys,jcamoys3@wsj.com,Bigender,Accounting,1122884,Armenian,Stockton,United States',
 '5,Nahum,Macieiczyk,nmacieiczyk4@reference.com,Bigender,Human Resources,1026666,Burmese,Toulouse,France',
 '6,Tova,Manclark,tmanclark5@army.mil,Female,Business Development,659039,Hiri Motu,Le Mans,France']

In [134]:
emp_RDD1 = emp_RDD.map(lambda row : row.split(','))
emp_RDD1.collect()

[['1',
  'Binnie',
  'Hovee',
  'bhovee0@hubpages.com',
  'Genderqueer',
  'Support',
  '1419125',
  'Lithuanian',
  'Paris 11',
  'France'],
 ['2',
  'Devondra',
  'Rosingdall',
  'drosingdall1@yellowbook.com',
  'Non-binary',
  'Support',
  '1365025',
  'Tsonga',
  'Seminole',
  'United States'],
 ['3',
  'Siward',
  'Struijs',
  'sstruijs2@wikia.com',
  'Male',
  'Business Development',
  '432657',
  'Armenian',
  'London',
  'United States'],
 ['4',
  'Jaime',
  'Camoys',
  'jcamoys3@wsj.com',
  'Bigender',
  'Accounting',
  '1122884',
  'Armenian',
  'Stockton',
  'United States'],
 ['5',
  'Nahum',
  'Macieiczyk',
  'nmacieiczyk4@reference.com',
  'Bigender',
  'Human Resources',
  '1026666',
  'Burmese',
  'Toulouse',
  'France'],
 ['6',
  'Tova',
  'Manclark',
  'tmanclark5@army.mil',
  'Female',
  'Business Development',
  '659039',
  'Hiri Motu',
  'Le Mans',
  'France'],
 ['7',
  'Terrill',
  'Feander',
  'tfeander6@unc.edu',
  'Genderqueer',
  'Legal',
  '226156',
  'Greek'

1. Show all employee working in Department = "Business Development"

In [136]:
emp_BD = emp_RDD1.filter(lambda val : val[5] == "Business Development")

In [137]:
# Return first 5 Rows for List of Employees working in Business Development.
emp_BD.take(5)

[['3',
  'Siward',
  'Struijs',
  'sstruijs2@wikia.com',
  'Male',
  'Business Development',
  '432657',
  'Armenian',
  'London',
  'United States'],
 ['6',
  'Tova',
  'Manclark',
  'tmanclark5@army.mil',
  'Female',
  'Business Development',
  '659039',
  'Hiri Motu',
  'Le Mans',
  'France'],
 ['9',
  'Warde',
  'Stenett',
  'wstenett8@altervista.org',
  'Genderqueer',
  'Business Development',
  '807769',
  'Moldovan',
  'Bastia',
  'France'],
 ['17',
  'Bern',
  'Lafond',
  'blafondg@amazon.de',
  'Non-binary',
  'Business Development',
  '933737',
  'Greek',
  'San Antonio',
  'United States'],
 ['19',
  'Hillery',
  'Costall',
  'hcostalli@rakuten.co.jp',
  'Genderfluid',
  'Business Development',
  '736463',
  'Norwegian',
  'Clermont-Ferrand',
  'France']]

2. Count Total Number of Employees Working in Organisation.

In [146]:
emp_RDD1.count()

1000

3. Find Average Income of Each Department. 

In [153]:
emp_Income = emp_RDD1.map(lambda row : (row[5],int(row[6])))
emp_Income1 = emp_Income.reduceByKey(lambda x, y : x + y)
emp_Income2 = emp_Map.reduceByKey(lambda x, y : x + y)
emp_Income1.join(emp_Income2).collect()

In [164]:
emp_Income1.join(emp_Income2).mapValues(lambda x : x[0]/x[1]).collect()

[('Accounting', 814474.3333333334),
 ('Human Resources', 772824.3829787234),
 ('Legal', 754482.0),
 ('Services', 765440.4444444445),
 ('Sales', 745501.5921052631),
 ('Product Management', 696304.2985074627),
 ('Support', 825695.0947368421),
 ('Business Development', 783081.2710280374),
 ('Research and Development', 789849.1486486486),
 ('Training', 757752.380952381),
 ('Marketing', 744170.7831325302),
 ('Engineering', 757094.5466666666)]

In [166]:
# Alternative Approach
emp_income = emp_RDD1.map(lambda val: (val[5],int(val[6])))
emp_income1 = emp_income.groupByKey().mapValues(list).collect()
for i in emp_income1:
    print(i[0],sum(i[1])/len(i[1]))

Accounting 814474.3333333334
Human Resources 772824.3829787234
Legal 754482.0
Services 765440.4444444445
Sales 745501.5921052631
Product Management 696304.2985074627
Support 825695.0947368421
Business Development 783081.2710280374
Research and Development 789849.1486486486
Training 757752.380952381
Marketing 744170.7831325302
Engineering 757094.5466666666


4. Count the Total Number of Employee Working in Each Department.

In [143]:
emp_Map = emp_RDD1.map(lambda row : (row[5],1))
emp_Map.reduceByKey(lambda x, y : x + y).collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

In [145]:
# alternative
emp_RDD1.groupBy(lambda row : row[5]).mapValues(len).collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

5. List all unique Job Titles in Dataset.

In [172]:
emp_RDD1.map(lambda  x : x[5]).distinct().collect()

['Accounting',
 'Human Resources',
 'Legal',
 'Services',
 'Sales',
 'Product Management',
 'Support',
 'Business Development',
 'Research and Development',
 'Training',
 'Marketing',
 'Engineering']

6. Count all employees whose first letter starts with a specific letter = 'A'.

In [174]:
emp_RDD1.filter(lambda row : row[1].startswith('A')).count()

81

#### Task 7. WordCount Example using Spark RDD

In [175]:
!hdfs dfs -ls /

Found 3 items
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:25 /hbase
drwxrwxrwx   - hadoop supergroup          0 2022-11-21 15:12 /tmp
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:11 /user


In [176]:
!hdfs dfs -mkdir /wordcount

In [188]:
!hdfs dfs -put /home/hadoop/Downloads/Harry_Potter_and_the_Deathly_Hallows.txt /wordcount

In [194]:
rdd = sc.textFile('/wordcount/Harry_Potter_and_the_Deathly_Hallows.txt')
rdd_word1 = rdd.flatMap(lambda line: line.split(" "))
rdd_word2 = rdd_word1.map(lambda word : (word.lower(),1))
rdd_word3 = rdd_word2.reduceByKey(lambda x , y : x + y)
rdd_word4 = rdd_word3.filter(lambda val : val[1] > 10).sortBy(lambda val : val[1],
                                                             ascending=False)
rdd_word4.collect()

[('the', 10280),
 ('and', 5283),
 ('to', 4843),
 ('of', 4121),
 ('he', 3845),
 ('a', 3523),
 ('was', 2681),
 ('his', 2602),
 ('in', 2159),
 ('had', 1990),
 ('harry', 1867),
 ('it', 1859),
 ('that', 1813),
 ('said', 1676),
 ('you', 1485),
 ('as', 1409),
 ('at', 1401),
 ('i', 1275),
 ('with', 1127),
 ('they', 1091),
 ('not', 1083),
 ('on', 984),
 ('for', 980),
 ('but', 975),
 ('she', 918),
 ('her', 864),
 ('—', 841),
 ('from', 836),
 ('have', 784),
 ('…', 756),
 ('were', 738),
 ('be', 737),
 ('him', 666),
 ('into', 655),
 ('hermione', 653),
 ('out', 650),
 ('could', 627),
 ('all', 583),
 ('ron', 578),
 ('what', 571),
 ('been', 567),
 ('“i', 535),
 ('up', 510),
 ('we', 498),
 ('there', 493),
 ('who', 490),
 ('is', 484),
 ('—”', 472),
 ('their', 465),
 ('did', 451),
 ('would', 443),
 ('them', 442),
 ('like', 424),
 ('if', 424),
 ('looked', 419),
 ('by', 419),
 ('harry,', 414),
 ('back', 409),
 ('so', 403),
 ('over', 383),
 ('this', 382),
 ('an', 381),
 ('then', 365),
 ('one', 364),
 ('wand

#### Task 8. Count the Number of Employees Working in Each City and Sort City by Employees Count in Desc Order.

#### Saving Data using RDD

In [196]:
rdd_word4.getNumPartitions()

2

In [197]:
# Saving RDD Data to local filesystem
rdd_word4.saveAsTextFile('file:///home/hadoop/Downloads/output/')

In [198]:
# Saving RDD Data to local filesystem
rdd_word4.saveAsTextFile('/wordcount/output/')

#### RDD - Statistical Function

In [206]:
rdd_city = emp_RDD1.map(lambda row : (row[5], int(row[6])))

In [207]:
rdd_city.min(lambda x : x [1])

('Training', 12727)

In [208]:
rdd_city.max(lambda x : x [1])

('Support', 1496924)

In [216]:
rdd_11 = sc.parallelize(range(1,100)).sample(False, 0.3, seed = 2)
rdd_11.min()

16

In [217]:
rdd_11.max()

95

In [227]:
# Mesaures distribution of samples data , and sample deviation from mean value
rdd_city.map(lambda x : x[1]).stdev()

424405.95187845733

In [228]:
rdd_city.map(lambda x : x[1]).variance()

180120411989.85944

In [229]:
rdd_city.map(lambda x : x[1]).mean()

770051.4190000002

In [231]:
# Total addition of sample
rdd_city.map(lambda x : x[1]).sum()

770051419

In [232]:
rdd_city.map(lambda x : x[1]).stats()

(count: 1000, mean: 770051.4190000002, stdev: 424405.95187845733, max: 1496924, min: 12727)

#### RDD Persistence and Its Storage Levels

In [233]:
emp_RDD.cache()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[291] at textFile at NativeMethodAccessorImpl.java:0

In [238]:
from pyspark.storagelevel import StorageLevel
# Persist the RDD in memory with default storage level (MEMORY_ONLY)
emp_RDD = emp_RDD.persist(StorageLevel.MEMORY_ONLY)

In [239]:
emp_RDD1 = emp_RDD.map(lambda row : row.split(','))
emp_Map = emp_RDD1.map(lambda row : (row[5],1))
emp_Map.reduceByKey(lambda x, y : x + y).collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

In [241]:
emp_RDD.unpersist()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[291] at textFile at NativeMethodAccessorImpl.java:0

In [242]:
emp_RDD = emp_RDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

In [243]:
emp_RDD.unpersist()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[291] at textFile at NativeMethodAccessorImpl.java:0