In [7]:
# Print the Spark version banner by running the `pyspark` launcher as a shell command.
!pyspark --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.4.4
      /_/
                        
Using Scala version 2.12.17, Java HotSpot(TM) 64-Bit Server VM, 18.0.2.1
Branch HEAD
Compiled by user ubuntu on 2024-10-21T02:09:45Z
Revision 6729992c76fc59ab07f63f97a9858691274447d0
Url https://github.com/apache/spark
Type --help for more information.


In [1]:
import pyspark

# getOrCreate() reuses the SparkContext already running in this kernel, so
# re-executing the cell does not fail with "Cannot run multiple SparkContexts
# at once"; on a fresh kernel it creates the context with the app name below.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("maps_and_lazy_evaluation_example")
)

### Exemples de Manipulation avec RDD!

In [6]:
# Turn a local Python list into an RDD split across 4 partitions, then
# collect it back to the driver to check that the contents are unchanged.
data = [4, 3, 2, 4, 5]
rdd = sc.parallelize(data, numSlices=4)
print(rdd.collect())

[4, 3, 2, 4, 5]


In [9]:
# map() applies the function to every element: here each number is doubled.
rdd2 = sc.parallelize([1, 2, 3, 4, 5])
result_rdd2 = rdd2.map(lambda number: number * 2)
print(result_rdd2.collect())

[2, 4, 6, 8, 10]


In [11]:
# map() works on any element type: here each string is upper-cased.
rdd3 = sc.parallelize(["spark", "rdd", "example", "python"])
uppercase_rdd3 = rdd3.map(lambda word: word.upper())
print(uppercase_rdd3.collect())

['SPARK', 'RDD', 'EXAMPLE', 'PYTHON']


In [13]:
# Extract the key (first item) of every (key, value) pair with map().
rdd4 = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
keys_rdd4 = rdd4.map(lambda pair: pair[0])
print(keys_rdd4.collect())

['a', 'b', 'c']


In [16]:
# Alternative to map(): bring the (key, value) pairs back to the driver and
# extract the keys there with a plain Python comprehension.
rdd4 = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
# Every pair is kept, so no filter step is needed before collecting.
# NOTE: despite its name, keys_rdd4 holds a local list of pairs here, not an RDD.
keys_rdd4 = rdd4.collect()
keys = [key for key, _value in keys_rdd4]
print(keys)

['a', 'b', 'c']


### Exemple avec d'autres transformations, comme `filter`

In [None]:

# filter() keeps only the elements for which the predicate is true:
# here, the even numbers.
rdd5 = sc.parallelize([1, 2, 3, 4, 5])
even_rdd5 = rdd5.filter(lambda number: number % 2 == 0)
print(even_rdd5.collect())

[2, 4]


### Exercices

In [19]:
# Plain Python list of played song titles; it deliberately contains
# duplicates and one lowercase variant ("despacito").
log_of_songs = [
    "Despacito",
    "Nice for what",
    "No tears left to cry",
    "Despacito",
    "Havana",
    "In my feelings",
    "Nice for what",
    "despacito",
    "All the stars",
]

In [20]:
# Distribute the local Python list of song titles as an RDD.
log_of_songs_rdd = sc.parallelize(log_of_songs)

In [21]:
# Collect the RDD's elements back to the driver and print them to check the contents.
print(log_of_songs_rdd.collect())

['Despacito', 'Nice for what', 'No tears left to cry', 'Despacito', 'Havana', 'In my feelings', 'Nice for what', 'despacito', 'All the stars']


In [26]:
# Named helper that normalises a song title to lowercase
def convert_song_to_lowercase(song: str) -> str:
    """Return ``song`` with its cased characters converted to lowercase."""
    lowered = song.lower()
    return lowered

In [None]:
# Apply the named function convert_song_to_lowercase to every song title with
# map(), then collect the lowercased titles back to the driver and print them.
rdd_songs_lower = log_of_songs_rdd.map(convert_song_to_lowercase)
print(rdd_songs_lower.collect())

['despacito', 'nice for what', 'no tears left to cry', 'despacito', 'havana', 'in my feelings', 'nice for what', 'despacito', 'all the stars']


In [28]:
# map() produced a new RDD; the source RDD still holds the original
# mixed-case titles, as the printed output shows.
print(log_of_songs_rdd.collect())

['Despacito', 'Nice for what', 'No tears left to cry', 'Despacito', 'Havana', 'In my feelings', 'Nice for what', 'despacito', 'All the stars']


In [29]:
# Same lowercase mapping as with the named function, written inline as a lambda.
test_rdd = log_of_songs_rdd.map(lambda song: song.lower())
print(test_rdd.collect())

['despacito', 'nice for what', 'no tears left to cry', 'despacito', 'havana', 'in my feelings', 'nice for what', 'despacito', 'all the stars']
