In [2]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

# Step 1: Create a Spark session and context
spark = SparkSession.builder.appName("LoadTextFile").getOrCreate()
sc = spark.sparkContext  # Get Spark Context

# Step 2: Load the text file as an RDD (one element per line of the file)
rdd = sc.textFile("register_no.txt")  # Replace with the actual file path

# Step 3: Convert values to integers
# Blank / whitespace-only lines (e.g. an empty line at the end of the file)
# are dropped first: int("") raises ValueError and would fail the whole job.
# Lines that are non-empty but not numeric still raise, on purpose, so bad
# input is not silently ignored.
num_rdd = rdd.filter(lambda x: x.strip()).map(lambda x: int(x))

# Step 4: Filter numbers greater than 50
filtered_rdd = num_rdd.filter(lambda x: x > 50)

# Step 5: Compute the sum of filtered numbers
# sum() is used instead of reduce() because reduce() raises ValueError on an
# empty RDD (nothing above 50), whereas sum() yields 0 in that case.
total_sum = filtered_rdd.sum()

# Step 6: Display results
print(f"Filtered numbers greater than 50: {filtered_rdd.collect()}")
print(f"Sum of filtered numbers: {total_sum}")

Filtered numbers greater than 50: [356, 80, 123]
Sum of filtered numbers: 559


In [3]:
from pyspark.sql import SparkSession

# Step 1: Obtain the Spark session for this example
spark = SparkSession.builder.appName("OddNumbersFilter").getOrCreate()

# Step 2: Distribute the integers 1 through 10 as an RDD
rdd = spark.sparkContext.parallelize(list(range(1, 11)))


# Step 3: Keep only the odd values
def _is_odd(value):
    """Return True when `value` leaves a non-zero remainder modulo 2."""
    return value % 2 != 0


odd_numbers_rdd = rdd.filter(_is_odd)

# Step 4: Count how many odd values remain
odd_count = odd_numbers_rdd.count()

# Step 5: Show the odd values followed by their count
odd_values = odd_numbers_rdd.collect()
print(f"Odd Numbers: {odd_values}")
print(f"Total Odd Numbers Count: {odd_count}")

Odd Numbers: [1, 3, 5, 7, 9]
Total Odd Numbers Count: 5


In [4]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

# Step 1: Create Spark session and context
spark = SparkSession.builder.appName("WordCount").getOrCreate()
sc = spark.sparkContext  # Get Spark Context

# Step 2: Load the text file into an RDD (one element per line of the file)
rdd = sc.textFile("Sample.txt")

# Step 3: Split lines into words
# split() with no argument splits on any run of whitespace (spaces, tabs) and
# never yields empty strings. split(" ") would emit "" for blank lines and for
# consecutive, leading or trailing spaces, and "" would then be counted as a
# word in the result.
words_rdd = rdd.flatMap(lambda line: line.split())

# Step 4: Count occurrences of each word
# Each word becomes a (word, 1) pair; reduceByKey adds up the 1s per word.
word_counts_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Step 5: Sort by frequency in descending order
# Only the count is used as the sort key, so the relative order of words with
# the same count is not specified by this cell.
sorted_word_counts = word_counts_rdd.sortBy(lambda x: x[1], ascending=False)

# Step 6: Display results
# Bare last expression so the notebook renders the list of (word, count) pairs.
sorted_word_counts.collect()

[('PySpark', 4),
 ('Hello', 2),
 ('world', 1),
 ('Welcome', 1),
 ('to', 1),
 ('fun', 1),
 ('Learn', 1),
 ('with', 1),
 ('examples', 1),
 ('is', 1)]