In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('RDD Exampl').getOrCreate()

In [39]:
# 1. Create RDDs in three different ways. 

In [80]:
# (a)Create an RDD from a list
data = [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(data)
rdd.collect()


[1, 2, 3, 4, 5]

In [81]:
# (b)Create an RDD from a text file
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
rdd.collect()

['this is a sample text file',
 'this is a test file',
 'is a file',
 'is sample',
 'text file',
 'sample text',
 'sample text file']

In [82]:
#(c)Create an RDD from files in a directory
rdd = spark.sparkContext.wholeTextFiles("D:/sparkPrograms/RDD/rddfiles/*")
rdd.collect()

[('file:/D:/sparkPrograms/RDD/rddfiles/test.txt',
  'this is a sample text file\r\nthis is a test file\r\nis a file\r\nis sample\r\ntext file\r\nsample text\r\nsample text file'),
 ('file:/D:/sparkPrograms/RDD/rddfiles/test2.txt',
  'this is a sample text file\r\nthis is a test file\r\nis a file\r\nis sample\r\ntext file\r\nsample text\r\nsample text file\r\nthis is a sample text file\r\nthis is a test file\r\nis a file\r\nis sample\r\ntext file\r\nsample text\r\nsample text file')]

In [44]:
# 2. Read a text file and count the number of words in the file using RDD operations. 

In [45]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
words_rdd = rdd.flatMap(lambda line: line.split())
words_rdd.count()

23

In [46]:
# 3. Write a program to find the word frequency in a given file

In [53]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
words_rdd = rdd.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
words_rdd.collect()

[('this', 2),
 ('sample', 4),
 ('text', 4),
 ('file', 5),
 ('is', 4),
 ('a', 3),
 ('test', 1)]

In [54]:
# 4. Write a program to convert all words in a file to uppercase. 

In [55]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
uppercase_rdd = rdd.flatMap(lambda line: line.split()).map(lambda word: word.upper())
uppercase_rdd.collect()

['THIS',
 'IS',
 'A',
 'SAMPLE',
 'TEXT',
 'FILE',
 'THIS',
 'IS',
 'A',
 'TEST',
 'FILE',
 'IS',
 'A',
 'FILE',
 'IS',
 'SAMPLE',
 'TEXT',
 'FILE',
 'SAMPLE',
 'TEXT',
 'SAMPLE',
 'TEXT',
 'FILE']

In [56]:
# 5. Write a program to convert all words in a file to lowercase. 

In [59]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
lowercase_rdd = rdd.flatMap(lambda line: line.split()).map(lambda word: word.lower())
lowercase_rdd.collect()



['this',
 'is',
 'a',
 'sample',
 'text',
 'file',
 'this',
 'is',
 'a',
 'test',
 'file',
 'is',
 'a',
 'file',
 'is',
 'sample',
 'text',
 'file',
 'sample',
 'text',
 'sample',
 'text',
 'file']

In [None]:
# 6. Write a program to capitalize first letter of each words in file (use string capitalize()  method). 

In [60]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
capitalized_rdd = rdd.flatMap(lambda line: line.split()).map(lambda word: word.capitalize())
capitalized_rdd.collect()

['This',
 'Is',
 'A',
 'Sample',
 'Text',
 'File',
 'This',
 'Is',
 'A',
 'Test',
 'File',
 'Is',
 'A',
 'File',
 'Is',
 'Sample',
 'Text',
 'File',
 'Sample',
 'Text',
 'Sample',
 'Text',
 'File']

In [63]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
capitalized_rdd = rdd.map(lambda line: ' '.join([word.capitalize() for word in line.split()]))
capitalized_rdd.collect()

['This Is A Sample Text File',
 'This Is A Test File',
 'Is A File',
 'Is Sample',
 'Text File',
 'Sample Text',
 'Sample Text File']

In [64]:
# 7. Find the number of occurrence of a word in a given file. 

In [65]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
words_rdd = rdd.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
words_rdd.collect()

[('this', 2),
 ('sample', 4),
 ('text', 4),
 ('file', 5),
 ('is', 4),
 ('a', 3),
 ('test', 1)]

In [66]:
# 8. Select only the sentences containing given word from a text file. 

In [88]:
word_to_find = "sample"
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
filtered_sentences = rdd.filter(lambda sentence: word_to_find.lower() in sentence.lower())
filtered_sentences.collect()

['this is a sample text file', 'is sample', 'sample text', 'sample text file']

In [68]:
# 9. Find the longest length of word from given set of words. 

In [77]:
words = ["apple", "banana", "cherry", "kiwi", "watermelon"]
rdd = spark.sparkContext.parallelize(words)
longest_word = rdd.reduce(lambda x, y: x if len(x) > len(y) else y)
print(longest_word)

watermelon


In [87]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/test.txt")
longest_word = rdd.flatMap(lambda line: line.split()).reduce(lambda x, y: x if len(x) > len(y) else y)
print(longest_word)

sample


In [89]:
# 10. Map the Registration numbers to corresponding branch. 
# 58000 series BDA, 57000 series AIML, 38000 series VLSI, 39000 series ES, and 47000 series CDC. 
# Given registration number, generate a  key-value pair of Registration Number and Corresponding Branch. 

In [90]:
registration_numbers = [58001, 57023, 38045, 39012, 47056, 58099, 57088]

def map_registration_to_branch(reg_num):
    if 58000 <= reg_num < 59000:
        return "BDA"
    elif 57000 <= reg_num < 58000:
        return "AIML"
    elif 38000 <= reg_num < 39000:
        return "VLSI"
    elif 39000 <= reg_num < 40000:
        return "ES"
    elif 47000 <= reg_num < 48000:
        return "CDC"
    else:
        return "Unknown"

rdd = spark.sparkContext.parallelize(registration_numbers)
registration_branch_rdd = rdd.map(lambda reg_num: (reg_num, map_registration_to_branch(reg_num)))
for reg_num, branch in registration_branch_rdd.collect():
    print(f"Registration Number: {reg_num}, Branch: {branch}")


Registration Number: 58001, Branch: BDA
Registration Number: 57023, Branch: AIML
Registration Number: 38045, Branch: VLSI
Registration Number: 39012, Branch: ES
Registration Number: 47056, Branch: CDC
Registration Number: 58099, Branch: BDA
Registration Number: 57088, Branch: AIML


In [91]:
#11. Text file contain numbers. 
# Numbers are separated by one white space. 
# There is no order to store the numbers. 
# One line may contain one or more numbers.
# Find the maximum, minimum, sum and mean of numbers. 

In [92]:
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/numbers.txt")

numbers_rdd = rdd.flatMap(lambda line: line.split()).map(lambda x: int(x))

max_value = numbers_rdd.max()
min_value = numbers_rdd.min()
sum_value = numbers_rdd.sum()
count_value = numbers_rdd.count()
mean_value = sum_value / count_value if count_value > 0 else 0  # Avoid division by zero

print(f"Maximum: {max_value}")
print(f"Minimum: {min_value}")
print(f"Sum: {sum_value}")
print(f"Mean: {mean_value}")


Maximum: 50
Minimum: 1
Sum: 280
Mean: 18.666666666666668


In [93]:
# 12. A text file (citizen.txt) contains data about citizens of country. Fields(information in file) are Name, dob, Phone, email and state name. 
# Another file contains mapping of state names to state code like Karnataka is codes as KA, TamilNadu as TN, Kerala KL etc. 
# Compress the citizen.txt file by changing full state name to state code. 

In [113]:
import shutil
import os
from pyspark.sql import SparkSession

# Step 1: Initialize SparkSession
spark = SparkSession.builder.appName("StateCodeReplacement").getOrCreate()

# Step 2: Read the state codes mapping file and create a dictionary
state_mapping = {}
with open("D:/sparkPrograms/RDD/states.txt", "r") as state_file:
    for line in state_file:
        line = line.strip()
        if line:
            parts = line.split(" ")
            if len(parts) == 2:
                state_name, state_code = parts
                state_mapping[state_name] = state_code
            else:
                print(f"Skipping malformed line: {line}")

# Step 3: Read the citizen data from citizen.txt
rdd = spark.sparkContext.textFile("D:/sparkPrograms/RDD/citizen.txt")

# Step 4: Function to replace the state name with state code
def replace_state_name_with_code(line):
    fields = line.split(" ")
    if len(fields) > 4:
        state_name = fields[-1]
        if state_name in state_mapping:
            fields[-1] = state_mapping[state_name]
    return " ".join(fields)

# Step 5: Apply the transformation to replace state names with state codes
compressed_rdd = rdd.map(replace_state_name_with_code)

# Step 6: Remove the output directory if it exists
output_dir = "D:/sparkPrograms/RDD/compressed_citizen.txt"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

# Step 7: Save the compressed data to a new file
compressed_rdd.saveAsTextFile(output_dir)

# Step 8: If you want to view the output in the console (for testing)
for line in compressed_rdd.collect():
    print(line)

# Stop the Spark session when done
spark.stop()


Rama 23 761351431 rama@ayodhya.com Ayodhya UP
Krishna 20 987124865 krishna@dwarka.com Dwarka Gujart
Krish 50 Dwarka Kerala
Rama 23 761351431 rama@ayodhya.com Ayodhya MadhyaPradesh
Krishna 20 987124865 krishna@dwarka.com Dwarka Rajastan
Krish 50 Dwarka TamilNadu
Rama 23 761351431 rama@ayodhya.com Ayodhya UP
Krishna 20 987124865 krishna@dwarka.com Dwarka Gujart
Krish 50 Dwarka Kerala
Rama 23 761351431 rama@ayodhya.com Ayodhya UP
Krishna 20 987124865 krishna@dwarka.com Dwarka MH
Krish 50 Dwarka Karnataka
Rama 23 761351431 rama@ayodhya.com Ayodhya Delhi
Krishna 20 987124865 krishna@dwarka.com Dwarka Telangana
Krish 50 Dwarka AndraPradesh
Krishna 20 987124865 krishna@dwarka.com Dwarka KA
Gopal 23 761351431 rama@ayodhya.com Mysore KA
Gopi 34 gopi@gmail.com Udupi KA
Rama 23 761351431 rama@ayodhya.com Ayodhya Delhi
Krishna 20 987124865 krishna@dwarka.com Dwarka Telangana
Krish 50 Dwarka AndraPradesh
Rama 23 761351431 rama@ayodhya.com Ayodhya Delhi
Krishna 20 987124865 krishna@dwarka.com Dwarka

In [99]:
# Create dataset (text file) with fields like ‘Student Name’, ‘Institute’, ‘Program Name’, and 
# ‘Gender’ and solve following questions.
# 1. Compute number of students from each Institute.
# 2. Number of students enrolled to any program.
# 3. Number of ‘boy’ and ‘girl’ students.
# 4. Number of ‘boy’ and ‘girl’ students from selected Institute

In [101]:
from pyspark import SparkContext, SparkConf

# Reuse the existing SparkContext or create a new one if none exists
sc = SparkContext.getOrCreate()

# Step 1: Read the students dataset
rdd = sc.textFile("D:/sparkPrograms/RDD/students.txt")

# Step 2: Split each line by commas and create an RDD of tuples
students_rdd = rdd.map(lambda line: line.split(", "))

# 1. Compute the number of students from each Institute
institute_count = students_rdd.map(lambda student: (student[1], 1)) \
                              .reduceByKey(lambda x, y: x + y)
print("Number of students from each Institute:")
for institute, count in institute_count.collect():
    print(f"{institute}: {count}")

# 2. Number of students enrolled in any program
program_count = students_rdd.map(lambda student: (student[2], 1)) \
                            .reduceByKey(lambda x, y: x + y)
print("\nNumber of students enrolled in each program:")
for program, count in program_count.collect():
    print(f"{program}: {count}")

# 3. Number of ‘boy’ and ‘girl’ students
gender_count = students_rdd.map(lambda student: (student[3], 1)) \
                           .reduceByKey(lambda x, y: x + y)
print("\nNumber of 'boy' and 'girl' students:")
for gender, count in gender_count.collect():
    print(f"{gender}: {count}")

# 4. Number of ‘boy’ and ‘girl’ students from a selected Institute (e.g., MIT)
selected_institute = "MIT"
gender_count_institute = students_rdd.filter(lambda student: student[1] == selected_institute) \
                                      .map(lambda student: (student[3], 1)) \
                                      .reduceByKey(lambda x, y: x + y)
print(f"\nNumber of 'boy' and 'girl' students from {selected_institute}:")
for gender, count in gender_count_institute.collect():
    print(f"{gender}: {count}")


Number of students from each Institute:
Harvard: 2
MIT: 4
Stanford: 4

Number of students enrolled in each program:
Computer Science: 4
Electrical Engineering: 2
Civil Engineering: 2
Mechanical Engineering: 2

Number of 'boy' and 'girl' students:
boy: 5
girl: 5

Number of 'boy' and 'girl' students from MIT:
boy: 4


In [102]:
# Dataset: Temperature of Indian Cities. Fields of dataset are Date, Average Temperature, City, 
# Country, Latitude and Longitude (Use dataset attached to MapReduce assignment). Solve 
# following questions
# 1. Find maximum and minimum temperature of all cities from the given dataset
# 2. Count number of data point for each city.
# 3. Find the maximum and minimum temperature for city Bangalore from the given dataset.
# 4. Find the maximum and minimum temperature for any given city from the given dataset. 
# City name should be passed through command line argument

In [104]:
# Stop any existing SparkContext if it exists (to avoid multiple SparkContexts)
from pyspark.sql import SparkSession

# Stop the existing SparkContext (if any)
try:
    sc.stop()
except:
    pass

# Create a new SparkSession
spark = SparkSession.builder.appName("Temperature Data Analysis").getOrCreate()

# Read the dataset into a DataFrame
df = spark.read.csv("D:/sparkPrograms/RDD/temperature_data.csv", header=True, inferSchema=True)

# Show the first few rows of the dataset
df.show(5)

# 1. Find maximum and minimum temperature of all cities from the given dataset
max_temp = df.groupBy("City").agg({"Average Temperature": "max"})
min_temp = df.groupBy("City").agg({"Average Temperature": "min"})

print("Maximum Temperature of each City:")
max_temp.show()

print("Minimum Temperature of each City:")
min_temp.show()

# 2. Count number of data points for each city
city_count = df.groupBy("City").count()
print("Number of data points for each City:")
city_count.show()

# 3. Find the maximum and minimum temperature for the city Bangalore
bangalore_data = df.filter(df["City"] == "Bangalore")
max_bangalore_temp = bangalore_data.agg({"Average Temperature": "max"})
min_bangalore_temp = bangalore_data.agg({"Average Temperature": "min"})

print("Maximum Temperature in Bangalore:")
max_bangalore_temp.show()

print("Minimum Temperature in Bangalore:")
min_bangalore_temp.show()

# 4. Find the maximum and minimum temperature for any given city from the dataset (using a dynamic city input)
from pyspark.sql.functions import col

# Function to get max and min temperature for any given city
def get_city_temperature(city_name):
    city_data = df.filter(df["City"] == city_name)
    max_temp = city_data.agg({"Average Temperature": "max"}).collect()[0][0]
    min_temp = city_data.agg({"Average Temperature": "min"}).collect()[0][0]
    print(f"Maximum Temperature in {city_name}: {max_temp}")
    print(f"Minimum Temperature in {city_name}: {min_temp}")

# Example usage: Get max and min temperatures for the city "Chennai"
get_city_temperature("Chennai")

# Stop the SparkSession when done
spark.stop()


+----------+-------------------+---------+-------+--------+---------+
|      Date|Average Temperature|     City|Country|Latitude|Longitude|
+----------+-------------------+---------+-------+--------+---------+
|2023-01-01|                 24|Bangalore|  India| 12.9716|  77.5946|
|2023-01-02|                 25|Bangalore|  India| 12.9716|  77.5946|
|2023-01-01|                 32|   Mumbai|  India|  19.076|  72.8777|
|2023-01-02|                 33|   Mumbai|  India|  19.076|  72.8777|
|2023-01-01|                 28|    Delhi|  India| 28.6139|   77.209|
+----------+-------------------+---------+-------+--------+---------+
only showing top 5 rows

Maximum Temperature of each City:
+---------+------------------------+
|     City|max(Average Temperature)|
+---------+------------------------+
|Bangalore|                      25|
|  Chennai|                      36|
|   Mumbai|                      33|
|  Kolkata|                      28|
|    Delhi|                      30|
+---------+----

In [105]:
# Create dataset (text file) of bank transactions. Fields in file are ‘Bank ID’, ‘Account Number’, 
# ‘Transaction Date’, ‘Transaction Type’ (credit or debit), ‘Transaction Amount’. Date format is 
# dd-mm-yyyy.
# 1. Count unique number of customers
# 2. Count unique number of Bank ID
# 3. Count unique number of customers per Bank ID
# 4. Number of transactions for given Account Number
# 5. Number of credit transactions for given Account Number in a given year

In [107]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, countDistinct

# Create a Spark session
spark = SparkSession.builder.appName("Bank Transactions Analysis").getOrCreate()

# Step 1: Load the data from the text file into a DataFrame
df = spark.read.csv("D:/sparkPrograms/RDD/bank_transactions.txt", header=False, inferSchema=True)

# Rename the columns for clarity
df = df.withColumnRenamed("_c0", "Bank ID") \
       .withColumnRenamed("_c1", "Account Number") \
       .withColumnRenamed("_c2", "Transaction Date") \
       .withColumnRenamed("_c3", "Transaction Type") \
       .withColumnRenamed("_c4", "Transaction Amount")

# Show a sample of the data
df.show(5)

# Step 2: Count unique number of customers (unique Account Numbers)
unique_customers = df.select("Account Number").distinct().count()
print(f"Unique number of customers: {unique_customers}")

# Step 3: Count unique number of Bank IDs
unique_banks = df.select("Bank ID").distinct().count()
print(f"Unique number of Bank IDs: {unique_banks}")

# Step 4: Count unique number of customers per Bank ID using countDistinct
unique_customers_per_bank = df.groupBy("Bank ID").agg(countDistinct("Account Number").alias("Unique Customers"))
unique_customers_per_bank.show()

# Step 5: Number of transactions for a given Account Number (e.g., Account Number 1001)
account_number = 1001
transactions_for_account = df.filter(df["Account Number"] == account_number).count()
print(f"Number of transactions for Account Number {account_number}: {transactions_for_account}")

# Step 6: Number of credit transactions for a given Account Number in a given year (e.g., 2023, Account 1001)
account_number = 1001
year_value = 2023
credit_transactions_for_account = df.filter(
    (df["Account Number"] == account_number) & 
    (df["Transaction Type"] == "credit") & 
    (year(col("Transaction Date")) == year_value)
).count()

print(f"Number of credit transactions for Account Number {account_number} in year {year_value}: {credit_transactions_for_account}")

# Stop the Spark session when done
spark.stop()


+-------+--------------+----------------+----------------+------------------+
|Bank ID|Account Number|Transaction Date|Transaction Type|Transaction Amount|
+-------+--------------+----------------+----------------+------------------+
|   B001|        1001.0|      12-01-2023|          credit|            1500.0|
|   B001|        1002.0|      13-01-2023|           debit|             500.0|
|   B002|        1001.0|      14-01-2023|           debit|             200.0|
|   B001|        1003.0|      15-01-2023|          credit|            1000.0|
|   B002|        1004.0|      16-01-2023|          credit|            1200.0|
+-------+--------------+----------------+----------------+------------------+
only showing top 5 rows

Unique number of customers: 5
Unique number of Bank IDs: 3
+-------+----------------+
|Bank ID|Unique Customers|
+-------+----------------+
|   B002|               4|
|   B003|               2|
|   B001|               3|
+-------+----------------+

Number of transactions f