# Pruebas con las bases de datos



## Preamble 

In [None]:
import $file.sparksession
import sparksession._
import spark.implicits._
import org.apache.spark._
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._, func._

In [None]:
import $ivy.`org.vegas-viz::vegas:0.3.11`

In [None]:
// Handle to the Spark session; getOrCreate() returns the already-active session when one exists.
val sqlContext: SparkSession =
  SparkSession.builder().getOrCreate()

## DataFrame

In [None]:
/** Loads `data/<input>.csv` as a DataFrame.
  *
  * The first line of the file is used as the header and column types are inferred by Spark.
  *
  * @param input file name without the `data/` directory prefix and without the `.csv` extension
  * @return the contents of the file as a DataFrame
  */
def readCsv(input: String): DataFrame = {
  val path = s"data/$input.csv"
  spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(path)
}

In [None]:
// Temperature readings per country, loaded as-is.
val temperatureByCountry: DataFrame =
  readCsv("GlobalLandTemperaturesByCountry")

// Disaster counts and economic damage; the "Code" column is not used and is removed on load.
val disasterNumber: DataFrame =
  readCsv("number-of-natural-disaster-events").drop("Code")

val disasterEconomic: DataFrame =
  readCsv("economic-damage-from-natural-disasters").drop("Code")

In [None]:
// Deaths per entity and year, with the "Deaths" column cast to whole numbers (Long).
val deathByCountry: DataFrame = readCsv("deaths-natural-disasters-ihme")
  .drop($"Code")
  .withColumn("Deaths", $"Deaths".cast("Long"))

val disasterDeath: DataFrame =
  readCsv("number-of-deaths-from-natural-disasters").drop($"Code")

// Share of deaths, rounded to two decimals.
// The rounding is done numerically with round(): format_number() renders a display string
// that contains thousands separators (e.g. "1,234.57"), so casting its result back to Double
// yields null for any value >= 1000.
val deathPercentByCountry: DataFrame = readCsv("share-deaths-from-natural-disasters")
  .drop($"Code")
  .withColumn("Deaths(Percent) (%)", func.round($"Deaths(Percent) (%)", 2).cast("Double"))

In [None]:
// Significant earthquakes and volcanic eruptions; the "Code" column is removed on load.
val earthquakes: DataFrame =
  readCsv("significant-earthquakes").drop("Code")

val volcano: DataFrame =
  readCsv("significant-volcanic-eruptions").drop("Code")

## Algunas queries 

In [None]:
// Preview the three columns of the disaster-count table.
disasterNumber
  .select("Entity", "Year", "Number")
  .show()

In [None]:
// Yearly totals summed over every entity except the rows labelled "All natural disasters"
// (presumably the dataset's own aggregate), oldest year first.
disasterNumber
  .where($"Entity" =!= "All natural disasters")
  .groupBy("Year")
  .sum("Number")
  .orderBy("Year")
  .show()

// Only the rows labelled "All natural disasters", for comparison with the totals above.
disasterNumber
  .where($"Entity" === "All natural disasters")
  .show()

// Totals per (Year, Entity) pair, most recent year first.
disasterNumber
  .where($"Entity" =!= "All natural disasters")
  .groupBy("Year", "Entity")
  .sum("Number")
  .orderBy(desc("Year"))
  .show()

In [None]:
// Average temperature over all countries for each date, first 20 dates.
// The "> -1000000" predicate keeps every real reading; rows whose AverageTemperature is null
// do not satisfy the comparison and are therefore filtered out.
temperatureByCountry
  .where($"AverageTemperature" > -1000000)
  .groupBy("dt")
  .avg("AverageTemperature")
  .orderBy("dt")
  .limit(20)
  .show()

In [None]:
import almond.interpreter.api.DisplayData
// Sums "Number" per entity (excluding the "All natural disasters" rows), orders the result
// by entity, and hands the resulting DataFrame to display(...) wrapped in DisplayData.
// NOTE(review): a DataFrame is passed straight into DisplayData(...); confirm that this
// constructor accepts a DataFrame, otherwise the frame has to be rendered to text/HTML first.
display(DisplayData(disasterNumber.filter($"Entity" =!="All natural disasters")
         .groupBy("Entity")
         .sum("Number")
         .orderBy($"Entity")))

## Queries con explicaciones

In [None]:
// Relate the number of disasters to their economic damage:
// inner join of both tables on the (Entity, Year) pair.
val disasterNumberDamage: DataFrame =
  disasterNumber.join(disasterEconomic, Seq("Entity", "Year"), "inner")

disasterNumberDamage
  .where($"Entity" =!= "All natural disasters")
  .limit(10)
  .show()

In [None]:
// Join the disaster counts with their economic damage, this time also matching on "Code".
//
// `disasterNumber` and `disasterEconomic` have "Code" dropped when they are loaded, so joining
// them on Seq("Entity", "Code", "Year") fails because the column cannot be resolved.
// The raw files are therefore re-read here with "Code" kept.
//
// A plain equi-join never matches rows whose key is null, so "Code" is compared with the
// null-safe equality operator <=> (null <=> null is true).
// NOTE(review): assumes both CSV files contain a "Code" column that may be empty — confirm
// against the data.
val disasterNumberWithCode: DataFrame = readCsv("number-of-natural-disaster-events")
val disasterEconomicWithCode: DataFrame = readCsv("economic-damage-from-natural-disasters")

disasterNumberWithCode
  .join(
    disasterEconomicWithCode,
    (disasterNumberWithCode("Entity") === disasterEconomicWithCode("Entity")) &&
      (disasterNumberWithCode("Code") <=> disasterEconomicWithCode("Code")) &&
      (disasterNumberWithCode("Year") === disasterEconomicWithCode("Year")))
  // Keep a single copy of each key column in the output.
  .drop(disasterEconomicWithCode("Entity"))
  .drop(disasterEconomicWithCode("Code"))
  .drop(disasterEconomicWithCode("Year"))
  .limit(10)
  .show()

In [None]:
// Replace the `dt` column with just its four-digit year ("yyyy").
// date_format produces a string column, so `dt` holds the year as text afterwards.
val temperatureByCountryYear: DataFrame =
  temperatureByCountry.withColumn("dt", date_format($"dt", "yyyy"))

In [None]:
// Average temperature per year and country,
// sorted by year first and by country second.
val temperatureAVG = temperatureByCountryYear
  .groupBy("dt", "Country")
  .avg("AverageTemperature")
  .orderBy("dt", "Country")

temperatureAVG
  .limit(10)
  .show()

In [None]:
// Relate the total number of deaths to the number of disasters:
// inner join of both tables on the (Entity, Year) pair.
val disasterNumberDeath: DataFrame =
  disasterNumber.join(disasterDeath, Seq("Entity", "Year"), "inner")

disasterNumberDeath
  .limit(10)
  .show()


In [None]:
// Relate the number of deaths per country to the corresponding percentage:
// inner join of both tables on the (Entity, Year) pair.
val deathNumberPercent: DataFrame =
  deathByCountry.join(deathPercentByCountry, Seq("Entity", "Year"), "inner")

// Example: rows for Haiti after the year 2000.
deathNumberPercent
  .where($"Year" > 2000 && $"Entity" === "Haiti")
  .limit(10)
  .show()

## Some graphics

In [None]:
import $ivy.`org.plotly-scala::plotly-almond:0.7.2`

import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._


In [None]:



// Number of events per (Year, Entity), excluding the "All natural disasters" rows,
// ordered by year.
val consulta = disasterNumber
  .filter($"Entity" =!= "All natural disasters")
  .groupBy($"Year", $"Entity")
  .sum("Number")
  .orderBy($"Year")

// Collect both plotted columns in ONE Spark job so every x value stays paired with the y value
// of the same row (two separate collects give no such guarantee when several rows share a year).
// The values are kept as strings, as before; the s-interpolator renders a null cell as "null".
// An empty result now yields empty sequences instead of a single empty string.
val (x, y) = consulta
  .select($"Year", $"sum(Number)")
  .collect()
  .toSeq
  .map(row => (s"${row.get(0)}", s"${row.get(1)}"))
  .unzip

Bar(x, y).plot()

In [None]:
// Plot the `x` / `y` series defined in the previous cell as a scatter trace.
val trace1 = Scatter(x, y)
trace1.plot()

In [None]:
val consulta= disasterNumber.filter($"Entity" =!="All natural disasters")
         .groupBy($"Year",$"Entity")
         .sum("Number")
         .groupByKey(_.getAs("Entity"))