In [18]:
from pyspark.sql import SparkSession

# Demo: compare sc.textFile() (one record per line of a file) with
# sc.wholeTextFiles() (one (path, content) pair per file in a directory).
spark = SparkSession.builder.appName("TextFileVsWholeTextFiles").getOrCreate()

try:
    # The SparkContext is the entry point for the RDD API used below.
    sc = spark.sparkContext

    # Example paths (adjust these to your own machine).
    # The explicit "file:" scheme makes Spark read from the local filesystem;
    # per the original note, a path without it is read from HDFS in this setup.
    text_file_path = "file:////home/lplab/Documents/220962432_bda lab/revision basics/f2.txt"  # Local file path for a single text file
    text_files_directory_path = "file:////home/lplab/Documents/220962432_bda lab/revision basics/"  # Local directory path for multiple text files

    # How many records to preview. The printed labels are built from these
    # values so a label can never disagree with the take() call next to it.
    preview_line_count = 5
    preview_file_count = 1

    # Using textFile(): each element of the RDD is one line of the file.
    text_rdd = sc.textFile(text_file_path)
    print("Using textFile():")
    print("Number of lines:", text_rdd.count())
    print(f"First {preview_line_count} lines:")
    print(text_rdd.take(preview_line_count))
    print('number of partitions: ', text_rdd.getNumPartitions())

    # Using wholeTextFiles(): each element is a (file path, full content) pair.
    whole_text_files_rdd = sc.wholeTextFiles(text_files_directory_path)
    print("\nUsing wholeTextFiles():")
    print("Number of files:", whole_text_files_rdd.count())
    print("First file:")
    file_name, file_content = whole_text_files_rdd.first()
    print("Filename:", file_name)
    print("Content preview (first 100 characters):", file_content[:100])
    print('number of partitions: ', whole_text_files_rdd.getNumPartitions())
    print(f"take({preview_file_count}) on wholeTextFiles: ", whole_text_files_rdd.take(preview_file_count))
finally:
    # Stop the SparkSession even if one of the actions above raised.
    spark.stop()




Using textFile():
Number of lines: 2
First 5 lines:
['this is another file lmao. im still learning.', 'hehe.']
number of partitions:  2

Using wholeTextFiles():
Number of files: 3
First file:
Filename: file:/home/lplab/Documents/220962432_bda lab/revision basics/f1.txt
Content preview (first 100 characters): this is a file hehe!

number of partitions:  2
take 2 on wholeTextFiles:  [('file:/home/lplab/Documents/220962432_bda lab/revision basics/f1.txt', 'this is a file hehe!\n')]


#### Creating a DataFrame from a JSON file

In [27]:
from pyspark.sql import SparkSession
# Load a JSON file into a DataFrame and preview its first rows.
spark = SparkSession.builder.appName("TextFileVsWholeTextFiles").getOrCreate()

try:
    sc = spark.sparkContext

    json_path = 'file:////home/lplab/Documents/220962432_bda lab/revision basics/movies 1.json'
    # NOTE(review): spark.read.json expects one JSON object per line by default;
    # if this file is a single multi-line JSON document, pass multiLine=True.
    # Confirm against the actual file.
    json_df = spark.read.json(json_path)

    # The original `json_df.head` (no parentheses) only referenced the method
    # and displayed nothing. show() runs the query and prints the rows.
    json_df.show(5)
finally:
    # Stop the SparkSession even if reading or showing the file failed.
    spark.stop()


#### Creating an RDD programmatically with sc.parallelize()

In [38]:
from pyspark.sql import SparkSession
# Build a small RDD from an in-memory list and try a few basic operations on it.
spark = SparkSession.builder.appName('parallelize').getOrCreate()
sc = spark.sparkContext

# Distribute the integers 0..8 across two partitions.
parallelRDD = sc.parallelize(list(range(9)), numSlices=2)
print(parallelRDD)
print("trying count: ", parallelRDD.count())  # number of elements
print("trying collect: ", parallelRDD.collect())

# glom() groups the elements of each partition into a list, so collect()
# returns one list per partition.
partitions = parallelRDD.glom().collect()

# Optional: show which elements ended up in which partition.
for idx in range(len(partitions)):
    print(f"Partition {idx}: {partitions[idx]}")

# Double every element with map() ...
mappedRDD = parallelRDD.map(lambda value: value * 2)

# ... then bring the results back to the driver and print them.
result = mappedRDD.collect()
print("Mapped RDD:", result)

print('stopping')
spark.stop()



ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287
trying count:  9
trying collect:  [0, 1, 2, 3, 4, 5, 6, 7, 8]
Partition 0: [0, 1, 2, 3]
Partition 1: [4, 5, 6, 7, 8]
Mapped RDD: [0, 2, 4, 6, 8, 10, 12, 14, 16]
stopping


#### sc.range(start, end=None, step=1, numSlices=None)

In [46]:
from pyspark.sql import SparkSession
# Create an RDD of consecutive integers with sc.range() and inspect it.
spark = SparkSession.builder.appName("range").getOrCreate()
sc = spark.sparkContext

# start is inclusive and end is exclusive, so this holds 10..19.
range_rdd = sc.range(start=10, end=20, step=1)
print(range_rdd)
print(range_rdd.collect())

# Run the two aggregate actions and print each with its label.
for label, action in (('min: ', range_rdd.min), ('max: ', range_rdd.max)):
    print(label, action())

print('take(5): ', range_rdd.take(5))

spark.stop()



PythonRDD[1] at RDD at PythonRDD.scala:53
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
min:  10
max:  19
take(5):  [10, 11, 12, 13, 14]
