# Windows Partitioning

## Prerrequisites

Install Spark and Java in VM

In [2]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [3]:
ls -l # check the .tgz is there

total 391016
drwxr-xr-x 1 root root      4096 Jan 11 17:02 [0m[01;34msample_data[0m/
-rw-r--r-- 1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz


In [4]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [5]:
!pip install -q findspark

Defining the environment

In [6]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

Start Spark Session

---

In [7]:
import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("Entregable") \
        .master("local[*]") \
        .getOrCreate()

spark.version

'3.5.0'

In [8]:
spark

In [12]:
# For Pandas conversion optimization
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [13]:
# Import sql functions
from pyspark.sql.functions import *

In [16]:
temperatureDF = spark.read.option("header", "true").csv("temperatures.csv")
temperatureDF.show()

+----------+------------------+-----------------------------+-------+-------------+--------+---------+
|        dt|AverageTemperature|AverageTemperatureUncertainty|   City|      Country|Latitude|Longitude|
+----------+------------------+-----------------------------+-------+-------------+--------+---------+
|1849-01-01|            26.704|                        1.435|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-02-01|            27.434|                        1.362|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-03-01|            28.101|                        1.612|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-04-01|             26.14|           1.3869999999999998|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-05-01|            25.427|                          1.2|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-06-01|            24.844|                        1.402|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-07-01|24.058000000000003|                        1.254|Abidjan|Côte

In [53]:
from pyspark.sql.functions import year, mean, first, col, avg
from pyspark.sql import Window


temperatureDF = temperatureDF.withColumn("Year", year("dt"))
temperatureDF = temperatureDF.filter(temperatureDF["Year"] > 1850)
annualMeanDF = temperatureDF.groupBy("Country", "City", "Latitude", "Longitude", "Year").agg(mean("AverageTemperature").alias("AnnualMeanTemp"))


windowSpec = Window.partitionBy("Country", "City", "Latitude", "Longitude").orderBy("Year")
firstAnnualMeanDF = annualMeanDF.withColumn("FirstYearMeanTemp", first("AnnualMeanTemp").over(windowSpec))
firstAnnualMeanDF = firstAnnualMeanDF.withColumn("FirstYear", first("Year").over(windowSpec))


resultDF = firstAnnualMeanDF.withColumn("TempVariation",
                                        (col("AnnualMeanTemp") - col("FirstYearMeanTemp")) / col("FirstYearMeanTemp") * 100)


resultDF = resultDF.dropna(subset=["AnnualMeanTemp", "FirstYearMeanTemp", "TempVariation"])


resultDF.select("Country", "City", "Year", "AnnualMeanTemp", "FirstYearMeanTemp", "FirstYear", "TempVariation", "Latitude", "Longitude").show()


+-----------+-----+----+------------------+------------------+---------+--------------------+--------+---------+
|    Country| City|Year|    AnnualMeanTemp| FirstYearMeanTemp|FirstYear|       TempVariation|Latitude|Longitude|
+-----------+-----+----+------------------+------------------+---------+--------------------+--------+---------+
|Afghanistan|Kabul|1943|             13.07|             13.07|     1943|                 0.0|  34.56N|    70.05|
|Afghanistan|Kabul|1851|13.828916666666665|13.828916666666665|     1851|                 0.0|  34.56N|   70.05E|
|Afghanistan|Kabul|1852|          13.78325|13.828916666666665|     1851| -0.3302259155031223|  34.56N|   70.05E|
|Afghanistan|Kabul|1853|13.730166666666662|13.828916666666665|     1851| -0.7140834121737849|  34.56N|   70.05E|
|Afghanistan|Kabul|1854|13.994499999999997|13.828916666666665|     1851|  1.1973702447166794|  34.56N|   70.05E|
|Afghanistan|Kabul|1855|14.227000000000002|13.828916666666665|     1851|  2.8786299240119146|  3

In [54]:
resultDF.orderBy("Country", "City", "Year").where(resultDF["Country"] == "Spain").show(30)

+-------+------+--------+---------+----+------------------+------------------+---------+--------------------+
|Country|  City|Latitude|Longitude|Year|    AnnualMeanTemp| FirstYearMeanTemp|FirstYear|       TempVariation|
+-------+------+--------+---------+----+------------------+------------------+---------+--------------------+
|  Spain|Madrid|  40.99N|    4.26W|1851|11.014916666666666|11.014916666666666|     1851|                 0.0|
|  Spain|Madrid|  40.99N|    4.26W|1852|11.837083333333332|11.014916666666666|     1851|   7.464120624304915|
|  Spain|Madrid|  40.99N|    4.26W|1853|10.415000000000001|11.014916666666666|     1851|  -5.446402227282688|
|  Spain|Madrid|  40.99N|    4.26W|1854|11.189916666666667|11.014916666666666|     1851|   1.588754643324589|
|  Spain|Madrid|  40.99N|    4.26W|1855|             10.61|11.014916666666666|     1851|  -3.676075624721024|
|  Spain|Madrid|  40.99N|    4.26W|1856|11.868083333333331|11.014916666666666|     1851|  7.7455571611224014|
|  Spain|M

In [55]:
resultDF_avg = resultDF.groupBy("Country", "City").agg(avg("TempVariation"))
resultDF_avg.show()

+-------------+----------------+-------------------+
|      Country|            City| avg(TempVariation)|
+-------------+----------------+-------------------+
|        Egypt|      Alexandria| 1.1910732469400016|
|      Ukraine|            Kiev| -7.480695520019671|
|Côte D'Ivoire|         Abidjan| 1.9407124111953624|
|        India|       Ahmadabad| 2.2886399786041283|
|      Vietnam|Ho Chi Minh City| 1.4936165094433473|
|        India|         Lakhnau|  2.183648258566714|
|     Pakistan|      Faisalabad| 2.5569649278686937|
|     Pakistan|         Karachi|  2.052650421810301|
|        India|          Jaipur| 2.1483952466083363|
|        India|          Madras| 1.7910512641139211|
|        China|       Guangzhou| 1.5165098909364851|
|       Turkey|           Izmir|  1.545534330321618|
|        Spain|          Madrid|  4.941391291424531|
|       Turkey|        Istanbul|-0.7763388484512458|
|        China|          Dalian| 17.490177294210472|
|        Syria|          Aleppo|-0.52153780732

Ya tenemos el dataset filtrado, ordenado y con los datos que buscamos. Ahora pasamos a responder a preguntas

Cuanto ha aumentado la temperatura globalmente desde el comienzo del registro de este dataset? (1850)

In [56]:
globalAvgTempVariation = resultDF.agg(avg("TempVariation").alias("GlobalAvgTempVariation"))
globalAvgTempVariation.show()

+----------------------+
|GlobalAvgTempVariation|
+----------------------+
|     5.037011919370402|
+----------------------+



Desde 1850, la temperatura global a aumentado un 5%.

In [59]:
spainAvgTempVariation = resultDF.where(resultDF["Country"] == "Spain").agg(avg("TempVariation")).alias("SpanishVariation")
spainAvgTempVariation.show()

+------------------+
|avg(TempVariation)|
+------------------+
| 4.941391291424531|
+------------------+



Lo mismo para España.

In [65]:
result_minus = resultDF_avg.where(resultDF_avg["avg(TempVariation)"] < 0).orderBy("avg(TempVariation)")
result_minus.show()

+--------------------+----------+--------------------+
|             Country|      City|  avg(TempVariation)|
+--------------------+----------+--------------------+
|             Ukraine|      Kiev|  -7.480695520019671|
|              Angola|    Luanda| -3.2161431798758517|
|             Nigeria|    Ibadan|  -2.007116452293587|
|              Turkey|    Ankara| -1.5527979359053066|
|             Nigeria|      Kano| -0.7799475107423038|
|              Turkey|  Istanbul| -0.7763388484512458|
|             Morocco|Casablanca| -0.7432251546868214|
|               Syria|    Aleppo| -0.5215378073246277|
|Congo (Democratic...|  Kinshasa|-0.49118113468184943|
|         Philippines|    Manila| -0.1442126100487592|
|                Iraq|   Baghdad|-0.01291547648297...|
+--------------------+----------+--------------------+



In [66]:
result_minus.count()

11

Solo 11 ciudades en todo el mundo registran temperaturas medias en el país con una variación negativa.