In [None]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install PySpark
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=559aaf0ce9f6e72cace141dbbbe37b50102cfc33d3403f6749f02e89b1d62a9e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
import numpy as np

# Create a SparkSession
spark = SparkSession.builder \
    .appName("RDD Task 1") \
    .getOrCreate()

# Task 1: Generate 100 random numbers in range 0 to 10 using numpy.
# randint's upper bound is exclusive, hence 11 to make 10 a possible value.
# The fixed seed makes the sample reproducible.
np.random.seed(10)
random_numbers = np.random.randint(0, 11, 100)
print(random_numbers)

# Create RDD using the parallelize function.
# .tolist() hands Spark plain Python ints instead of numpy scalars.
rdd = spark.sparkContext.parallelize(random_numbers.tolist())

# Task 1: Calculate the frequency of each number (0 - 10)
# Classic count pattern: emit (number, 1) pairs, then sum the ones per key.
frequency = rdd.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)

# Bring the (at most 11) pairs to the driver as a lookup table. The order in
# which collect() returns keys follows the shuffle partitioning, not the key
# values, so the report below walks 0..10 explicitly instead.
frequency_by_number = dict(frequency.collect())

# Print the frequency of each number, in ascending order; a number that was
# never drawn is reported with a count of 0 rather than being left out.
print("Frequency of each number (0 - 10):")
for num in range(11):
    freq = frequency_by_number.get(num, 0)
    print(f"Number {num}: {freq} times")

# Stop the SparkSession
spark.stop()


[ 9  4  0  1  9  0  1 10  8  9  0 10  8  6  4  3  0  4  6  8 10  1  8  4
  1  3  6  5  3  9  6  9  1  9  4  2  6  7  8 10  8  9  2  0  6  7  8  1
  7  1  4 10  0  8  5  4  7  8  8  2  6  2  8  8  6  6  5 10  6  0  0  6
  9  1  8 10  9  1  2  8  9  9  5  0  2  7  3  0  4  2  0  3  3  1  2  5
  9  0 10  1]
Frequency of each number (0 - 10):
Number 4: 8 times
Number 0: 12 times
Number 10: 8 times
Number 8: 14 times
Number 6: 11 times
Number 2: 8 times
Number 9: 12 times
Number 1: 11 times
Number 3: 6 times
Number 5: 5 times
Number 7: 5 times


In [None]:
# Start (or reuse) the Spark session for the second RDD task
spark = (
    SparkSession.builder
    .appName("RDD Task 2")
    .getOrCreate()
)

In [None]:
# Load the text8 dataset
text_rdd = spark.sparkContext.textFile(r"/content/text8")

# Split each line into words and flatten the result
words_rdd = text_rdd.flatMap(lambda line: line.split())

# Map each word to a tuple (word, 1) for counting
word_count_rdd = words_rdd.map(lambda word: (word, 1))

# Reduce by key to get the count of each word
word_frequencies = word_count_rdd.reduceByKey(lambda a, b: a + b)

# Keep only the (word, count) pairs whose word contains the letter 'a'.
# Cached because two actions (count and takeOrdered) run against it below.
words_with_a = word_frequencies.filter(lambda x: 'a' in x[0]).cache()

# Printing one line per matching word produces far more output than a
# notebook can display, and collect() pulls every pair onto the driver.
# Report the total instead and show only the most frequent words; the
# complete result remains available in `words_with_a`.
TOP_N = 20
total_words_with_a = words_with_a.count()
# Sort by descending count; ties are broken alphabetically so the listing
# is deterministic across runs.
top_words_with_a = words_with_a.takeOrdered(TOP_N, key=lambda x: (-x[1], x[0]))

# Print the frequencies of the most common words containing the letter 'a'
print("Frequencies of words containing the letter 'a':")
print(f"{total_words_with_a} distinct words found; showing the {len(top_words_with_a)} most frequent")
for word, freq in top_words_with_a:
    print(f"Word: {word}, Frequency: {freq}")

# Stop the SparkSession
spark.stop()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Word: srimad, Frequency: 1
Word: jagannath, Frequency: 1
Word: chinmayananda, Frequency: 1
Word: kashinath, Frequency: 1
Word: nirmalananda, Frequency: 2
Word: tapasyananda, Frequency: 1
Word: baladeva, Frequency: 1
Word: vidyabhusana, Frequency: 1
Word: kisari, Frequency: 1
Word: ganguly, Frequency: 2
Word: maharshi, Frequency: 2
Word: ramana, Frequency: 1
Word: eknath, Frequency: 1
Word: kavya, Frequency: 1
Word: madhuri, Frequency: 1
Word: saral, Frequency: 1
Word: alongwith, Frequency: 1
Word: shaggy, Frequency: 3
Word: squealing, Frequency: 2
Word: primatologist, Frequency: 3
Word: unaccountable, Frequency: 3
Word: hoaxed, Frequency: 4
Word: sesqac, Frequency: 1
Word: wakemap, Frequency: 1
Word: facemasks, Frequency: 1
Word: nisga, Frequency: 1
Word: barbes, Frequency: 1
Word: anthropolgist, Frequency: 1
Word: macphee, Frequency: 1
Word: samish, Frequency: 1
Word: klallam, Frequency: 1
Word: farenbach, Frequency: 1
W

In [None]:
from pyspark.sql import SparkSession

# Session for the DataFrame part of the notebook
spark = (
    SparkSession.builder
    .appName("DataFrame Task")
    .getOrCreate()
)

# Read the iris records from the JSON file into a DataFrame
iris_df = spark.read.json("/content/iris.json")

# Pearson correlation of the two petal measurements
correlation = iris_df.corr("petalLength", "petalWidth")
print("Pearson Correlation between petalLength and petalWidth:", correlation)

# Rows with petalLength of at least 1.4, reduced to the sepal columns and species
filtered_df = (
    iris_df
    .where(iris_df.petalLength >= 1.4)
    .select("sepalLength", "sepalWidth", "species")
)

# Display the leading rows of the filtered result
filtered_df.show()

# Shut the session down
spark.stop()


Pearson Correlation between petalLength and petalWidth: 0.9626417223780231
+-----------+----------+-------+
|sepalLength|sepalWidth|species|
+-----------+----------+-------+
|        5.1|       3.5| setosa|
|        4.9|       3.0| setosa|
|        4.6|       3.1| setosa|
|        5.0|       3.6| setosa|
|        5.4|       3.9| setosa|
|        4.6|       3.4| setosa|
|        5.0|       3.4| setosa|
|        4.4|       2.9| setosa|
|        4.9|       3.1| setosa|
|        5.4|       3.7| setosa|
|        4.8|       3.4| setosa|
|        4.8|       3.0| setosa|
|        5.7|       4.4| setosa|
|        5.1|       3.5| setosa|
|        5.7|       3.8| setosa|
|        5.1|       3.8| setosa|
|        5.4|       3.4| setosa|
|        5.1|       3.7| setosa|
|        5.1|       3.3| setosa|
|        4.8|       3.4| setosa|
+-----------+----------+-------+
only showing top 20 rows

