Install Spark and Java in VM

In [56]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [57]:
ls -l # check the .tgz is there

total 795840
-rw-r--r--  1 root root  14138385 Jan 15 10:04 GlobalLandTemperatures_GlobalLandTemperaturesByMajorCity.csv
drwxr-xr-x  1 root root      4096 Jan 11 17:02 [0m[01;34msample_data[0m/
drwxr-xr-x 13 1000 1000      4096 Sep  9 02:08 [01;34mspark-3.5.0-bin-hadoop3[0m/
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz.1


In [58]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [59]:
!pip install -q findspark

In [60]:

!pip install py4j

# For maps
!pip install folium
!pip install plotly



Define the environment

In [61]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

Start Spark Session

---

In [62]:
import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("DataFrames Basics") \
        .master("local[*]") \
        .getOrCreate()

spark.version

'3.5.0'

In [63]:
spark

In [64]:
# For Pandas conversion optimization
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [65]:
# Import sql functions
from pyspark.sql.functions import *

# Global Temperature Records by Major City (1849-2022)


## Dataset Overview
This comprehensive dataset provides historical records of average temperatures by major city from 1849 to 2022. It's an invaluable resource for climate researchers, historians, and data scientists interested in analyzing long-term temperature trends and their impacts on different parts of the world.

## Data Description
The dataset contains the following columns:

dt: Date of the record.

AverageTemperature: Average temperature of the city for the given date.

AverageTemperatureUncertainty: The uncertainty of the average temperature.

City: Name of the city.

Country: Country in which the city is located.

Latitude: Latitude of the city.

Longitude: Longitude of the city.

## Download datasets

In [66]:
Temperatures_GlobalDF = spark.read.option("header", "true").option("delimiter", ",").csv("/content/GlobalLandTemperatures_GlobalLandTemperaturesByMajorCity.csv")


In [67]:
Temperatures_GlobalDF.printSchema()
Temperatures_GlobalDF.show(10)

root
 |-- dt: string (nullable = true)
 |-- AverageTemperature: string (nullable = true)
 |-- AverageTemperatureUncertainty: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)

+----------+------------------+-----------------------------+-------+-------------+--------+---------+
|        dt|AverageTemperature|AverageTemperatureUncertainty|   City|      Country|Latitude|Longitude|
+----------+------------------+-----------------------------+-------+-------------+--------+---------+
|1849-01-01|            26.704|                        1.435|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-02-01|            27.434|                        1.362|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-03-01|            28.101|                        1.612|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-04-01|             26.14|           1.3869999999999998|Abidjan|Côte D

### Distinct cuidades

In [68]:
unique_values_list = Temperatures_GlobalDF.select("City").distinct().rdd.flatMap(lambda x: x).collect()
print(unique_values_list)

['Bangalore', 'Cairo', 'Casablanca', 'Guangzhou', 'Fortaleza', 'Ho Chi Minh City', 'Lima', 'Madrid', 'Mashhad', 'Dalian', 'Jakarta', 'Jinan', 'Alexandria', 'Baghdad', 'Los Angeles', 'Moscow', 'Harare', 'Dar Es Salaam', 'Berlin', 'Chengdu', 'Ankara', 'Karachi', 'Luanda', 'Lagos', 'London', 'Addis Abeba', 'Montreal', 'Harbin', 'Jiddah', 'Durban', 'Cape Town', 'Mogadishu', 'Ibadan', 'Kano', 'Chongqing', 'Chicago', 'Istanbul', 'Belo Horizonte', 'Abidjan', 'Lahore', 'Kabul', 'Mexico', 'Manila', 'Cali', 'Faisalabad', 'Dakar', 'Delhi', 'Changchun', 'Aleppo', 'Bombay', 'Melbourne', 'Gizeh', 'Calcutta', 'Brasília', 'Kanpur', 'Kiev', 'Kinshasa', 'Bangkok', 'Hyderabad', 'Lakhnau', 'Madras', 'Ahmadabad', 'Jaipur', 'Dhaka', 'Bogotá', 'Izmir', 'Singapore', 'Tianjin', 'Wuhan', 'Tangshan', 'Santo Domingo', 'Nanjing', 'Taiyuan', 'Xian', 'Sydney', 'Tokyo', 'Nagoya', 'Shenyang', 'Paris', 'Riyadh', 'Toronto', 'Rangoon', 'Salvador', 'Surat', 'Peking', 'São Paulo', 'Pune', 'Taipei', 'Surabaya', 'Nagpur', 'R

In [69]:

filter_spb = Temperatures_GlobalDF.filter((col("City") == "Saint Petersburg")& (col("dt") > "1900-01-01"))
filter_spb.show()

+----------+--------------------+-----------------------------+----------------+-------+--------+---------+
|        dt|  AverageTemperature|AverageTemperatureUncertainty|            City|Country|Latitude|Longitude|
+----------+--------------------+-----------------------------+----------------+-------+--------+---------+
|1900-02-01|              -9.852|                        0.684|Saint Petersburg| Russia|  60.27N|   29.19E|
|1900-03-01|  -5.502000000000002|                        0.474|Saint Petersburg| Russia|  60.27N|   29.19E|
|1900-04-01|  0.4630000000000001|                        0.403|Saint Petersburg| Russia|  60.27N|   29.19E|
|1900-05-01|   6.448999999999999|                         0.31|Saint Petersburg| Russia|  60.27N|   29.19E|
|1900-06-01|              12.377|                        0.753|Saint Petersburg| Russia|  60.27N|   29.19E|
|1900-07-01|              15.317|                         0.73|Saint Petersburg| Russia|  60.27N|   29.19E|
|1900-08-01|              16

In [70]:
# Avg/Min/Max temperature

stat_temp = filter_spb.select(
    avg("AverageTemperature").alias("Avg_temperature"),
    min("AverageTemperature").alias("Min_temperature"),
    max("AverageTemperature").alias("Max_temperature")
    )
stat_temp.show()

+-----------------+--------------------+-----------------+
|  Avg_temperature|     Min_temperature|  Max_temperature|
+-----------------+--------------------+-----------------+
|4.344805575935429|-0.00100000000000...|9.966000000000001|
+-----------------+--------------------+-----------------+



In [71]:
# filter julio 1991 - 2012
filter_spb_1991_2012 = filter_spb.filter((col("dt") >= "1991-01-01") & (col("dt") <= "2012-12-01"))
filtered_spb_julio = filter_spb_1991_2012.filter(month("dt") == 7)
filtered_spb_julio.show(22)

+----------+------------------+-----------------------------+----------------+-------+--------+---------+
|        dt|AverageTemperature|AverageTemperatureUncertainty|            City|Country|Latitude|Longitude|
+----------+------------------+-----------------------------+----------------+-------+--------+---------+
|1991-07-01|            16.968|          0.32299999999999995|Saint Petersburg| Russia|  60.27N|   29.19E|
|1992-07-01|            16.348|                        0.262|Saint Petersburg| Russia|  60.27N|   29.19E|
|1993-07-01|            15.581|                        0.168|Saint Petersburg| Russia|  60.27N|   29.19E|
|1994-07-01|             18.48|          0.28800000000000003|Saint Petersburg| Russia|  60.27N|   29.19E|
|1995-07-01|            15.148|                        0.311|Saint Petersburg| Russia|  60.27N|   29.19E|
|1996-07-01|            14.755|                        0.279|Saint Petersburg| Russia|  60.27N|   29.19E|
|1997-07-01|            17.977|           0.49

### Statistics Avg Temperature en San-Petersburg en julio de 1991 a 2012

In [72]:
# falta solo columnas que me interesa
filtered_spb_julio = filtered_spb_julio.select("dt","AverageTemperature","City")
filtered_spb_julio.show(23)

+----------+------------------+----------------+
|        dt|AverageTemperature|            City|
+----------+------------------+----------------+
|1991-07-01|            16.968|Saint Petersburg|
|1992-07-01|            16.348|Saint Petersburg|
|1993-07-01|            15.581|Saint Petersburg|
|1994-07-01|             18.48|Saint Petersburg|
|1995-07-01|            15.148|Saint Petersburg|
|1996-07-01|            14.755|Saint Petersburg|
|1997-07-01|            17.977|Saint Petersburg|
|1998-07-01|            16.549|Saint Petersburg|
|1999-07-01|18.505000000000006|Saint Petersburg|
|2000-07-01|            16.887|Saint Petersburg|
|2001-07-01|            20.199|Saint Petersburg|
|2002-07-01|18.944000000000003|Saint Petersburg|
|2003-07-01|            19.782|Saint Petersburg|
|2004-07-01|16.845000000000002|Saint Petersburg|
|2005-07-01|            18.366|Saint Petersburg|
|2006-07-01|            17.581|Saint Petersburg|
|2007-07-01|16.868000000000002|Saint Petersburg|
|2008-07-01|        

### Min y Max avg temp por cuidad

In [73]:
min_max_temp_cuidad = Temperatures_GlobalDF.groupBy("City") \
  .agg(
      max("AverageTemperature").alias("max_temp"),
      min("AverageTemperature").alias("min_temp")
  ) \
  .show()

+--------------+------------------+--------------------+
|          City|          max_temp|            min_temp|
+--------------+------------------+--------------------+
|       Abidjan|29.923000000000002|  22.363000000000003|
|   Addis Abeba|21.223000000000003|              14.528|
|     Ahmadabad| 35.41900000000001|              16.792|
|        Aleppo|              9.98|  0.6699999999999999|
|    Alexandria|            28.806|              10.227|
|        Ankara| 9.988999999999999|-0.01400000000000...|
|       Baghdad|             9.992|              10.024|
|     Bangalore|            29.688|              20.257|
|       Bangkok|            31.115|  21.894000000000002|
|Belo Horizonte|25.226000000000006|               15.92|
|        Berlin|             9.971|-0.01700000000000...|
|        Bogotá|22.508000000000006|  17.932000000000002|
|        Bombay|            30.682|               21.01|
|      Brasília|25.933000000000003|  17.208000000000002|
|         Cairo|             9.

In [74]:
# import library
from pyspark.sql.window import Window

### Most hot date for each city

In [75]:
# now we apply the window partitioning
byAvgTemp = Window.partitionBy("City").orderBy(col("AverageTemperature").desc())

byAvgTempDF = Temperatures_GlobalDF.withColumn("rank_temp", row_number().over(byAvgTemp)).filter(col("rank_temp") <= 1)
byAvgTempDF = byAvgTempDF.select("dt","AverageTemperature","City", "Country")
byAvgTempDF.show()

+----------+------------------+--------------+-------------+
|        dt|AverageTemperature|          City|      Country|
+----------+------------------+--------------+-------------+
|1998-03-01|29.923000000000002|       Abidjan|Côte D'Ivoire|
|2011-04-01|21.223000000000003|   Addis Abeba|     Ethiopia|
|2010-05-01| 35.41900000000001|     Ahmadabad|        India|
|1846-11-01|              9.98|        Aleppo|        Syria|
|2010-08-01|            28.806|    Alexandria|        Egypt|
|1897-04-01| 9.988999999999999|        Ankara|       Turkey|
|1839-01-01|             9.992|       Baghdad|         Iraq|
|1998-04-01|            29.688|     Bangalore|        India|
|2010-05-01|            31.115|       Bangkok|     Thailand|
|1984-02-01|25.226000000000006|Belo Horizonte|       Brazil|
|1962-04-01|             9.971|        Berlin|      Germany|
|2010-02-01|22.508000000000006|        Bogotá|     Colombia|
|2010-05-01|            30.682|        Bombay|        India|
|2005-10-01|25.933000000