In [None]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [None]:
# List the working directory in long format to confirm the Spark archive was downloaded.
ls -l # check the .tgz is there

total 1184108
-rw-r--r--  1 root root     17362 Jan 24 10:54 2021_population.csv
-rw-r--r--  1 root root     15541 Jan 24 10:54 2022_population.csv
-rw-r--r--  1 root root     15303 Jan 24 10:54 2023_population.csv
-rw-r--r--  1 root root     15286 Jan 24 10:54 2024_population.csv
drwxr-xr-x  2 root root      4096 Jan 24 10:49 dataset/
drwxr-xr-x  1 root root      4096 Jan 19 14:20 sample_data/
drwxr-xr-x 13 1000 1000      4096 Sep  9 02:08 spark-3.5.0-bin-hadoop3/
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz.1
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz.2
-rw-r--r--  1 root root  11245981 Jan 24 10:55 timeseries_population_count.csv


In [None]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [None]:
!pip install -q findspark

In [None]:
!pip install py4j

# For maps
!pip install folium
!pip install plotly



In [None]:
import os

# Environment consumed by the JVM / Spark launcher: where Java and Spark live,
# and the arguments handed to the PySpark shell (local master using all cores).
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.0-bin-hadoop3",
        "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
    }
)

In [None]:
import os

import findspark
from pyspark.sql import SparkSession

# Initialise findspark with the absolute Spark location instead of a path
# relative to the current working directory. The relative form only resolves
# when the notebook runs from /content and disagrees with the SPARK_HOME
# configured earlier in this notebook.
findspark.init(os.environ.get("SPARK_HOME", "/content/spark-3.5.0-bin-hadoop3"))

# Create (or reuse) the local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("Columns and Expressions")
    .master("local[*]")
    .getOrCreate()
)

# Show the Spark version as the cell result.
spark.version

'3.5.0'

In [None]:
# Display the SparkSession object as the cell result.
spark

In [None]:
# Enable Arrow-based columnar transfers to speed up Spark <-> pandas conversion.
# "spark.sql.execution.arrow.enabled" has been deprecated since Spark 3.0; the
# supported key for the Spark 3.5.0 used here is the ".pyspark." variant.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
# Import sql functions
# NOTE(review): the wildcard import rebinds names such as `sum` to the
# pyspark.sql.functions versions; later cells in this notebook call sum(...)
# and avg(...) on Spark columns and rely on that rebinding.
from pyspark.sql.functions import *

In [None]:
!mkdir -p dataset
!wget -q /content/artists.csv. -P /dataset


In [None]:
def _read_population_csv(path):
    """Load the CSV at ``path`` as a Spark DataFrame, using its first row as the header."""
    return spark.read.csv(path, header=True)


# Yearly population snapshots
pop21DF = _read_population_csv("/content/2021_population.csv")
pop21DF.show()

pop22DF = _read_population_csv("/content/2022_population.csv")
pop22DF.show()

pop23DF = _read_population_csv("/content/2023_population.csv")
pop23DF.show()

pop24DF = _read_population_csv("/content/2024_population.csv")
pop24DF.show()

# Population time series
timeseriespopDF = _read_population_csv("/content/timeseries_population_count.csv")
timeseriespopDF.show()

+--------+--------------+-----------------+---------------+----------------+-------------+-----------+-------+----+
|iso_code|       country|2021_last_updated|2020_population|            area|density_sq_km|growth_rate|world_%|rank|
+--------+--------------+-----------------+---------------+----------------+-------------+-----------+-------+----+
|     CHN|         China|    1,447,065,329|  1,439,323,776| 9,706,961 sq_km|    149/sq_km|      0.34%| 18.34%|   1|
|     IND|         India|    1,401,310,563|  1,380,004,385| 3,287,590 sq_km|    424/sq_km|      0.97%| 17.69%|   2|
|     USA| United States|      334,058,426|    331,002,651| 9,372,610 sq_km|     36/sq_km|      0.58%|  4.23%|   3|
|     IDN|     Indonesia|      278,037,263|    273,523,615| 1,904,569 sq_km|    145/sq_km|      1.04%|  3.51%|   4|
|     PAK|      Pakistan|      227,724,796|    220,892,340|   881,912 sq_km|    255/sq_km|      1.95%|  2.86%|   5|
|     BRA|        Brazil|      214,832,901|    212,559,417| 8,515,767 sq

In [None]:
# Three interchangeable ways of naming a column in select()
selected_cols = [
    pop21DF.country,            # attribute access on the DataFrame
    col("2021_last_updated"),   # col() helper
    "iso_code",                 # plain column-name string
]
pop21DF.select(*selected_cols).show()

+--------------+-----------------+--------+
|       country|2021_last_updated|iso_code|
+--------------+-----------------+--------+
|         China|    1,447,065,329|     CHN|
|         India|    1,401,310,563|     IND|
| United States|      334,058,426|     USA|
|     Indonesia|      278,037,263|     IDN|
|      Pakistan|      227,724,796|     PAK|
|        Brazil|      214,832,901|     BRA|
|       Nigeria|      214,507,696|     NGA|
|    Bangladesh|      167,247,491|     BGD|
|        Russia|      145,899,956|     RUS|
|        Mexico|      131,046,075|     MEX|
|         Japan|      125,802,521|     JPN|
|      Ethiopia|      119,590,501|     ETH|
|   Philippines|      111,913,102|     PHL|
|         Egypt|      105,390,688|     EGY|
|       Vietnam|       98,655,916|     VNM|
|        Turkey|       85,484,777|     TUR|
|          Iran|       85,627,052|     IRN|
|       Germany|       83,975,691|     DEU|
|      Thailand|       70,039,646|     THA|
|United Kingdom|       68,401,08

In [None]:
from pyspark.sql import functions as F

# The CSV is read without a schema, so "2022_population" is a string with
# thousands separators (e.g. "1,417,173,173"). Summing it directly turns every
# comma-formatted value into null and only adds up the few values below 1,000.
# Strip the commas and cast to long before aggregating.
population_2022 = F.regexp_replace(pop23DF['2022_population'], ",", "").cast("long")
pop23DF.select(F.sum(population_2022).alias("WW22population")).show()





+--------------+
|WW22population|
+--------------+
|         510.0|
+--------------+



In [None]:
from pyspark.sql import functions as F

# "density_/sq_km" is a string column; values of 1,000 or more carry a thousands
# separator and become null under the implicit numeric cast, so the densest
# countries were silently dropped by the filter. Parse the number explicitly
# before comparing.
density_per_sq_km = F.regexp_replace(pop23DF['density_/sq_km'], ",", "").cast("double")
pop23DF2 = pop23DF.filter(density_per_sq_km > 350)
pop23DF2.show()






+--------+-----------+-----------------+---------------+----------+---------------+--------------+-----------+-------+----+---------+
|iso_code|    country|2023_last_updated|2022_population|area_sq_km|land_area_sq_km|density_/sq_km|growth_rate|world_%|rank|un_member|
+--------+-----------+-----------------+---------------+----------+---------------+--------------+-----------+-------+----+---------+
|     IND|      India|    1,435,297,914|  1,417,173,173|      3.3M|             3M|           481|      0.81%| 17.85%|   1|      IND|
|     PHL|Philippines|      118,231,730|    115,559,009|    342.4K|         298.2K|           394|      1.54%|  1.47%|  13|      PHL|
|     KOR|South Korea|       51,764,928|     51,815,810|    100.2K|          97.6K|           531|     -0.06%|  0.65%|  29|      KOR|
|     TWN|     Taiwan|       23,937,624|     23,893,394|     36.2K|          36.2K|           661|      0.13%|   0.3%|  57|      TWN|
|     LKA|  Sri Lanka|       21,922,275|     21,832,143|     6

In [None]:
# Check the data types of the columns
pop22DF.printSchema()


root
 |-- iso_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- 2022_last_updated: string (nullable = true)
 |-- 2021_population: string (nullable = true)
 |-- area_sq_km: string (nullable = true)
 |-- land_area_sq_km: string (nullable = true)
 |-- density_/sq_km: string (nullable = true)
 |-- growth_rate: string (nullable = true)
 |-- world_%: string (nullable = true)
 |-- rank: string (nullable = true)



In [None]:
from pyspark.sql import functions as F

# Both population columns are comma-formatted strings ("1,447,065,329"). The
# original expression cast only the second operand (method call binds tighter
# than "-") and neither value could be parsed, so "difference" came out null.
# Strip the separators and cast BOTH operands to long before subtracting.
updated_2022 = F.regexp_replace(F.col("2022_last_updated"), ",", "").cast("long")
population_2021 = F.regexp_replace(F.col("2021_population"), ",", "").cast("long")
difference2122DF = pop22DF.withColumn("difference", updated_2022 - population_2021)


In [None]:
from pyspark.sql import functions as F

# Example query with groupBy and aggregation.
# "density_/sq_km" is a string column; values with a thousands separator cannot
# be cast implicitly and would be ignored by avg(), so parse the number first.
density_per_sq_km = F.regexp_replace(F.col('density_/sq_km'), ",", "").cast("double")
result = pop23DF.groupBy('iso_code').agg(F.avg(density_per_sq_km).alias('avg_density'))
result.show()


+--------+-----------+
|iso_code|avg_density|
+--------+-----------+
|     NIU|        7.0|
|     HTI|      425.0|
|     PSE|      892.0|
|     POL|      134.0|
|     LVA|       29.0|
|     BRB|      656.0|
|     ZMB|       28.0|
|     JAM|      261.0|
|     BRA|       26.0|
|     ARM|       98.0|
|     MOZ|       43.0|
|     JOR|      128.0|
|     CUB|      108.0|
|     FRA|      118.0|
|     SOM|       29.0|
|     ABW|      590.0|
|     BRN|       86.0|
|     FSM|      165.0|
|     BOL|       11.0|
|     URY|       20.0|
+--------+-----------+
only showing top 20 rows



In [None]:
# Equality predicate between the country columns of both frames
# (defined here but not passed to the join below).
joinCondition = pop23DF.country == pop24DF.country

# Inner join on the shared "country" column name, so it appears once in the result
joined2324DF = pop23DF.join(other=pop24DF, on='country', how='inner')
joined2324DF.show()

+--------------+--------+-----------------+---------------+----------+---------------+--------------+-----------+-------+----+---------+--------+-----------------+---------------+----------+---------------+--------------+-----------+-------+----+---------+
|       country|iso_code|2023_last_updated|2022_population|area_sq_km|land_area_sq_km|density_/sq_km|growth_rate|world_%|rank|un_member|iso_code|2024_last_updated|2023_population|area_sq_km|land_area_sq_km|density_/sq_km|growth_rate|world_%|rank|un_member|
+--------------+--------+-----------------+---------------+----------+---------------+--------------+-----------+-------+----+---------+--------+-----------------+---------------+----------+---------------+--------------+-----------+-------+----+---------+
|         India|     IND|    1,435,297,914|  1,417,173,173|      3.3M|             3M|           481|      0.81%| 17.85%|   1|      IND|     IND|    1,436,015,555|  1,428,627,663|      3.3M|             3M|           485|      0.