In [13]:
import shutil

from pyspark import SparkContext, SparkConf

# Configuration used only when no SparkContext is active yet.
conf = SparkConf().setAppName("BigDataICP").setMaster("local")

# Remember whatever `sc` is already bound in this kernel so reuse can be reported.
previous_sc = globals().get("sc")

# getOrCreate returns the active SparkContext if there is one and otherwise
# starts a new one from `conf`. Unlike a bare `'sc' in globals()` test, this
# also copes with an `sc` that has been stopped, and with a context that is
# active under a different variable name (where SparkContext(conf=conf) raises).
sc = SparkContext.getOrCreate(conf=conf)

if sc is previous_sc:
    # Reuse the existing SparkContext
    print("Using existing SparkContext")



Using existing SparkContext


In [14]:
# Task 1: turn the integers 1 through 15 into an RDD.
one_to_fifteen = range(1, 16)
rdd1 = sc.parallelize(one_to_fifteen)

# collect() brings every element back to the driver; fine for 15 values.
task1_values = rdd1.collect()
print("Task 1:", task1_values)


Task 1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [15]:
# Show the RDD's contents and how many partitions it is split into.
print("Elements in RDD:", rdd1.collect())
# Query rdd1, the RDD printed on the line above. The original referenced a
# bare `rdd`, which no visible cell defines, so the cell depended on leftover
# kernel state and fails with NameError on a fresh kernel.
print("Number of Partitions:", rdd1.getNumPartitions())


Elements in RDD: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Number of Partitions: 2


In [16]:
# Fetch just the leading element instead of collecting the whole RDD.
first_element = rdd1.first()
print("First Element in RDD:", first_element)


First Element in RDD: 1


In [17]:
# Keep only the values that leave no remainder when divided by two.
even_rdd = rdd1.filter(lambda value: value % 2 == 0)

even_values = even_rdd.collect()
print("Even Elements:", even_values)


Even Elements: [2, 4, 6, 8, 10, 12, 14]


In [18]:
# Map every element of rdd1 to its square.
squared_rdd = rdd1.map(lambda value: value ** 2)

squared_values = squared_rdd.collect()
print("Squared Elements:", squared_values)


Squared Elements: [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225]


In [19]:
# Fold the RDD down to a single total by adding elements pairwise.
sum_elements = rdd1.reduce(
    lambda running_total, value: running_total + value
)
print("Sum of Elements:", sum_elements)


Sum of Elements: 120


In [20]:
# Persist the RDDs as text files so the later textFile() cell can read them back.
#
# Two reproducibility fixes over a bare rdd1.saveAsTextFile("rdd_output"):
#   * saveAsTextFile refuses to write into an existing directory, so output
#     left by a previous run is removed first and the cell can be re-run.
#   * squared_rdd_output is read by a later cell, but no visible cell writes it;
#     it is written here from squared_rdd.
# NOTE(review): shutil.rmtree assumes local mode, where these relative paths
# resolve against the kernel's working directory. Confirm before pointing the
# notebook at HDFS or another remote filesystem.
outputs_to_write = (
    ("rdd_output", rdd1),
    ("squared_rdd_output", squared_rdd),
)
for output_dir, rdd_to_save in outputs_to_write:
    shutil.rmtree(output_dir, ignore_errors=True)
    rdd_to_save.saveAsTextFile(output_dir)


In [21]:
# Task 8: build a second RDD holding 16 through 20 and append it to rdd1.
sixteen_to_twenty = range(16, 21)
rdd2 = sc.parallelize(sixteen_to_twenty)

# union() concatenates the two RDDs.
rdd_union = rdd1.union(rdd2)

union_values = rdd_union.collect()
print("Task 8 - Union of RDDs:", union_values)


Task 8 - Union of RDDs: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [22]:
# Task 9: pair every element of rdd1 with every element of rdd2.
rdd_cartesian = rdd1.cartesian(rdd2)

cartesian_pairs = rdd_cartesian.collect()
print("Task 9 - Cartesian product:", cartesian_pairs)


Task 9 - Cartesian product: [(1, 16), (1, 17), (2, 16), (3, 16), (2, 17), (3, 17), (4, 16), (5, 16), (6, 16), (7, 16), (4, 17), (5, 17), (6, 17), (7, 17), (1, 18), (1, 19), (1, 20), (2, 18), (3, 18), (2, 19), (2, 20), (3, 19), (3, 20), (4, 18), (5, 18), (6, 18), (7, 18), (4, 19), (4, 20), (5, 19), (5, 20), (6, 19), (6, 20), (7, 19), (7, 20), (8, 16), (8, 17), (9, 16), (10, 16), (9, 17), (10, 17), (11, 16), (12, 16), (13, 16), (14, 16), (11, 17), (12, 17), (13, 17), (14, 17), (15, 16), (15, 17), (8, 18), (8, 19), (8, 20), (9, 18), (10, 18), (9, 19), (9, 20), (10, 19), (10, 20), (11, 18), (12, 18), (13, 18), (14, 18), (11, 19), (11, 20), (12, 19), (12, 20), (13, 19), (13, 20), (14, 19), (14, 20), (15, 18), (15, 19), (15, 20)]


In [23]:
# Task 10: build an RDD of (key, value) tuples from a plain dictionary.
dict_data = {
    'a': 1,
    'b': 2,
    'c': 3,
}
key_value_pairs = dict_data.items()
rdd_dict = sc.parallelize(key_value_pairs)

dict_rows = rdd_dict.collect()
print("Task 10 - RDD from dictionary:", dict_rows)


Task 10 - RDD from dictionary: [('a', 1), ('b', 2), ('c', 3)]


In [24]:
# Task 11: count how many times each distinct value occurs.
values_with_repeats = [1, 2, 2, 3, 3, 3]
rdd_with_duplicates = sc.parallelize(values_with_repeats)

# countByValue() returns the counts to the driver as a dict-like mapping.
unique_counts = rdd_with_duplicates.countByValue()
print("Task 11 - Unique value counts:", unique_counts)

Task 11 - Unique value counts: defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 3})


In [28]:
# Task 12: read several saved outputs back into a single RDD of text lines.
# textFile() accepts a comma-separated list of paths.
#
# The paths are relative so they match the directory that
# rdd1.saveAsTextFile("rdd_output") writes to; the original hard-coded
# /content/... prefix only worked when the kernel's working directory happened
# to be /content. A squared_rdd_output directory is expected alongside it.
rdd_text_files = sc.textFile("rdd_output,squared_rdd_output")
print("Task 12 - Combined text files RDD:", rdd_text_files.collect())

Task 12 - Combined text files RDD: ['8', '9', '10', '11', '12', '13', '14', '15', '1', '2', '3', '4', '5', '6', '7', '64', '81', '100', '121', '144', '169', '196', '225', '1', '4', '9', '16', '25', '36', '49']


In [31]:
# take(5) fetches only the first five lines rather than the whole RDD.
first_five_lines = rdd_text_files.take(5)
print("First 5 lines of RDD:", first_five_lines)

First 5 lines of RDD: ['8', '9', '10', '11', '12']


In [32]:
from pyspark.sql import SQLContext

# Wrap the running SparkContext in a SQLContext; it is used below to create DataFrames.
# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of
# SparkSession; consider migrating once all cells using sqlContext are known.
sqlContext = SQLContext(sparkContext=sc)



In [33]:
# Build a small two-column DataFrame from in-memory tuples and display it.
column_names = ["Name", "ID"]
data = [
    ("veda", 1),
    ("agastya", 2),
    ("darshana", 3),
]
df = sqlContext.createDataFrame(data, schema=column_names)
df.show()


+--------+---+
|    Name| ID|
+--------+---+
|    veda|  1|
| agastya|  2|
|darshana|  3|
+--------+---+



**RDDs** are the lowest-level API: they have no schema and receive no query optimization, which generally makes them slower and more memory-hungry. They are fault-tolerant and, in Scala and Java, type-safe at compile time, so errors are caught early, but they lack SQL support and are generally harder to use. **DataFrames**, on the other hand, are structured like tables with named columns and a schema, which lets Spark optimize them for faster performance. They support SQL queries and offer a high-level API that is easier to work with, though errors are only detected at runtime. **Datasets** combine the best of both, offering the structure and SQL support of DataFrames with added type safety and the highest performance due to advanced optimizations. They're also the most memory-efficient, making them ideal for complex applications. (Note that the typed Dataset API is available only in Scala and Java; in PySpark, the DataFrame is the structured API.)