In [1]:
import findspark
# init() runs before pyspark is imported — presumably it is what makes
# pyspark importable in this environment, so keep this order.
findspark.init()
# NOTE(review): the value returned by find() is discarded (it is not the last
# expression of the cell), so this line displays nothing — confirm it is needed.
findspark.find()

import pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [3]:
# Build the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("ScalaToPySpark")
    .getOrCreate()
)
# The RDD examples below are all created through the session's SparkContext.
sc = spark.sparkContext

In [4]:
# RDD of plain strings (no key/value pairs).
# NOTE(review): "Cobot" may be a typo for "Cobol", which the later datasets use — confirm.
data1 = sc.parallelize(
    [
        "Spark",
        "Scala",
        "Java",
        "C++",
        "Python",
        "Cobot",
    ]
)

In [5]:
# Pairs every element with 1. The mapped RDD is only displayed, never
# assigned, so data1 itself still holds plain strings afterwards.
data1.map(lambda x: (x, 1))

PythonRDD[1] at RDD at PythonRDD.scala:53

In [6]:
# Second RDD of plain strings; shares "Scala" and "Java" with data1.
data2 = sc.parallelize(
    [
        "JavaScript",
        "Hadoop",
        "Scala",
        "Java",
        "Informatica",
        "DataStage",
    ]
)

In [7]:
# Pairs every element with 1. The mapped RDD is only displayed, never
# assigned, so data2 itself still holds plain strings afterwards.
data2.map(lambda x: (x, 1))

PythonRDD[3] at RDD at PythonRDD.scala:53

In [8]:
# join() is called on data1/data2 directly. Both were built from plain
# strings, so neither RDD holds (key, value) pairs at this point.
result = data1.join(data2) 

In [9]:
# result.collect()  # Raised the expected error: data1 and data2 hold plain strings, not (key, value) pairs.

In [10]:
# Pair RDD of (language, number) tuples; this is the left side of the joins below.
data4 = sc.parallelize(
    [
        ("Spark", 40),
        ("Scala", 50),
        ("Java", 20),
        ("C++", 30),
        ("Python", 50),
        ("Cobol", 30),
    ]
)

In [11]:
# Pair RDD of (name, number) tuples; this is the right side of the joins below.
# It shares only the keys "Scala" and "Java" with data4.
data5 = sc.parallelize(
    [
        ("JavaScript", 20),
        ("Hadoop", 80),
        ("Scala", 90),
        ("Java", 80),
        ("Informatica", 40),
        ("DataStage", 90),
    ]
)

In [12]:
# Inner join on the key: only keys present in both data4 and data5 are kept.
result2 = data4.join(data5)

In [13]:
# Each joined element is (key, (value from data4, value from data5)).
result2.collect()

[('Java', (20, 80)), ('Scala', (50, 90))]

In [14]:
# Print the inner-join rows one per line.
for row in result2.collect():
    print(row)

('Java', (20, 80))
('Scala', (50, 90))


In [15]:
# Left outer join: every key of data4 is kept; the right-hand value is None
# when data5 has no entry for that key.
result3 = data4.leftOuterJoin(data5)

In [16]:
# Display the left-outer-join rows as a list.
result3.collect()

[('Java', (20, 80)),
 ('Spark', (40, None)),
 ('Python', (50, None)),
 ('C++', (30, None)),
 ('Scala', (50, 90)),
 ('Cobol', (30, None))]

In [17]:
# Print the left-outer-join rows one per line.
for row in result3.collect():
    print(row)

('Java', (20, 80))
('Spark', (40, None))
('Python', (50, None))
('C++', (30, None))
('Scala', (50, 90))
('Cobol', (30, None))


In [18]:
# Right outer join: every key of data5 is kept; the left-hand value is None
# when data4 has no entry for that key.
result4 = data4.rightOuterJoin(data5)

In [19]:
# Print the right-outer-join rows one per line.
for row in result4.collect():
    print(row)

('Java', (20, 80))
('Hadoop', (None, 80))
('JavaScript', (None, 20))
('DataStage', (None, 90))
('Scala', (50, 90))
('Informatica', (None, 40))


In [20]:
# Full outer join: keys from either RDD are kept; the side with no entry
# for a key is filled with None.
result5 = data4.fullOuterJoin(data5)

In [21]:
# Print the full-outer-join rows one per line.
for row in result5.collect():
    print(row)

('Java', (20, 80))
('Hadoop', (None, 80))
('Spark', (40, None))
('Python', (50, None))
('C++', (30, None))
('JavaScript', (None, 20))
('DataStage', (None, 90))
('Scala', (50, 90))
('Cobol', (30, None))
('Informatica', (None, 40))


In [22]:
# Pair RDD in which the key "Java" occurs four times with different values.
data6 = sc.parallelize(
    [
        ("Spark", 40),
        ("Scala", 50),
        ("Java", 20),
        ("C++", 30),
        ("Python", 50),
        ("Cobol", 30),
        ("Java", 90),
        ("Java", 30),
        ("Java", 50),
    ]
)

In [23]:
# lookup() returns a plain Python list holding every value stored under "Java".
result6 = data6.lookup("Java")

In [24]:
# Show the values found for "Java".
result6

[20, 90, 30, 50]

In [25]:
# countByValue() counts whole elements. Each (key, value) tuple in data6 is
# distinct, so every count is 1 even though the key "Java" repeats.
data6.countByValue()

defaultdict(int,
            {('Spark', 40): 1,
             ('Scala', 50): 1,
             ('Java', 20): 1,
             ('C++', 30): 1,
             ('Python', 50): 1,
             ('Cobol', 30): 1,
             ('Java', 90): 1,
             ('Java', 30): 1,
             ('Java', 50): 1})

In [35]:
# Print each distinct element of data6 alongside how often it occurs.
for element, occurrences in data6.countByValue().items():
    print(f"{element}: {occurrences}")

('Spark', 40): 1
('Scala', 50): 1
('Java', 20): 1
('C++', 30): 1
('Python', 50): 1
('Cobol', 30): 1
('Java', 90): 1
('Java', 30): 1
('Java', 50): 1


In [27]:
# RDD of plain strings with repeated values, used to show countByValue() on non-pair data.
data7 = sc.parallelize(
    [
        "Spark",
        "Scala",
        "Java",
        "Hadoop",
        "Spark",
        "Spark",
        "Scala",
        "Spark",
        "Scala",
        "Spark",
        "Scala",
        "Java",
        "Scala",
    ]
)

In [31]:
# With plain strings, countByValue() gives the number of occurrences per string.
data7.countByValue()

defaultdict(int, {'Spark': 5, 'Scala': 5, 'Java': 2, 'Hadoop': 1})

In [34]:
# Print each distinct string of data7 alongside how often it occurs.
for language, occurrences in data7.countByValue().items():
    print(f"{language}: {occurrences}")

Spark: 5
Scala: 5
Java: 2
Hadoop: 1


In [44]:
# Load the log file as an RDD of text (presumably one element per line — confirm).
# NOTE(review): this is an absolute path on one user's machine, so the cell will
# not run elsewhere as written; consider a configurable data directory.
file = sc.textFile("file:///C://Users/aksha/Pyspark/testData1.log") 

In [45]:
# Split every line into words and flatten the result into one RDD of words.
# split() with no argument splits on any run of whitespace, so repeated
# spaces or tabs do not produce empty-string "words" (split(" ") would), and
# this step now agrees with the chained word-count pipeline further down.
flatfile = file.flatMap(lambda line: line.split())

In [46]:
# Turn each word into a (word, 1) pair so the occurrences can be summed per word.
mapfile = flatfile.map(lambda word: (word, 1))

In [47]:
# Add up the 1s of identical words to get one (word, count) pair per word.
redfile = mapfile.reduceByKey(lambda left, right: left + right)

In [48]:
# Order the (word, count) pairs by the word itself.
sortdata = redfile.sortBy(lambda pair: pair[0])

In [49]:
# Print the sorted word counts, one "word: count" line each.
for token, total in sortdata.collect():
    print(f"{token}: {total}")

be: 2
demo: 3
every: 1
hadoop: 1
have: 1
holiday: 1
is: 2
not: 2
on: 2
sunday: 4
there: 2
today: 2
we: 1
will: 2


In [50]:
# The same word count as the step-by-step cells, written as a single chain.
wordcount = (
    file
    .flatMap(lambda line: line.split())             # lines -> words
    .map(lambda word: (word, 1))                    # word -> (word, 1)
    .reduceByKey(lambda left, right: left + right)  # sum the 1s per word
    .sortBy(lambda pair: pair[0])                   # order by word
)


In [51]:
# Print the chained pipeline's word counts, one "word: count" line each.
for token, total in wordcount.collect():
    print(f"{token}: {total}")


be: 2
demo: 3
every: 1
hadoop: 1
have: 1
holiday: 1
is: 2
not: 2
on: 2
sunday: 4
there: 2
today: 2
we: 1
will: 2
