In [36]:
from pyspark.sql import SparkSession
from pyspark.files import SparkFiles

In [37]:
# Load Estadisticas.csv (the OIJ dataset): first row is the header, column types are inferred.
oij_csv = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Estadisticas.csv")
)

In [38]:
# Load reempleocenso2011-22.csv (the INEC dataset): first row is the header, column types are inferred.
inec_csv = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("reempleocenso2011-22.csv")
)

In [39]:
# Inspect the column names and the types Spark inferred for the OIJ data.
oij_csv.printSchema()

root
 |-- Delito: string (nullable = true)
 |-- SubDelito: string (nullable = true)
 |-- Fecha: string (nullable = true)
 |-- Hora: string (nullable = true)
 |-- Victima: string (nullable = true)
 |-- SubVictima: string (nullable = true)
 |-- Edad: string (nullable = true)
 |-- Genero: string (nullable = true)
 |-- Nacionalidad: string (nullable = true)
 |-- Provincia: string (nullable = true)
 |-- Canton: string (nullable = true)
 |-- Distrito: string (nullable = true)



In [40]:

# Inspect the column names and the types Spark inferred for the INEC data.
inec_csv.printSchema()

root
 |-- ProvinciaCantonDistrito: string (nullable = true)
 |-- PoblacionMayor15: integer (nullable = true)
 |-- TasaNetaParticipacion: double (nullable = true)
 |-- TasaOcupacion: double (nullable = true)
 |-- TasaDesempleoAbierto: string (nullable = true)
 |-- PorcentajeEconomicamenteInactivo: double (nullable = true)
 |-- RelacionDependenciaEconomica: double (nullable = true)



In [41]:
# Expose the raw OIJ DataFrame to SQL as the temp view "oij".
# createOrReplaceTempView is the DataFrame-level call that the legacy
# SQLContext.registerDataFrameAsTable wrapper delegates to.
oij_csv.createOrReplaceTempView("oij")

In [42]:
# Expose the raw INEC DataFrame to SQL as the temp view "inec".
# createOrReplaceTempView is the DataFrame-level call that the legacy
# SQLContext.registerDataFrameAsTable wrapper delegates to.
inec_csv.createOrReplaceTempView("inec")

In [43]:
def quitaEspacios(ind):
    """Return one of the registered views with its place-name column trimmed.

    Args:
        ind: 1 selects the ``oij`` view (TRIM applied to ``Distrito``);
            any other value selects the ``inec`` view (TRIM applied to
            ``ProvinciaCantonDistrito``).

    Returns:
        The result of ``sqlContext.sql`` for the selected query. All other
        columns are passed through unchanged.
    """
    consulta_oij = "SELECT Delito, SubDelito,Hora, Fecha, Victima, SubVictima, Edad, Genero, Nacionalidad, Provincia,Canton, TRIM(Distrito) as Distrito from oij"
    consulta_inec = "SELECT TRIM(ProvinciaCantonDistrito) AS ProvinciaCantonDistrito, PoblacionMayor15, TasaNetaParticipacion, TasaOcupacion, TasaDesempleoAbierto, PorcentajeEconomicamenteInactivo, RelacionDependenciaEconomica from inec"
    # Anything other than 1 falls through to the INEC query.
    consulta = consulta_oij if ind == 1 else consulta_inec
    return sqlContext.sql(consulta)

In [44]:
# Trim Distrito and re-point the "oij" view at the trimmed result so later SQL sees it.
# Registered via the DataFrame API instead of the legacy SQLContext wrapper.
oij = quitaEspacios(1)
oij.createOrReplaceTempView("oij")

In [45]:
# Trim ProvinciaCantonDistrito and re-point the "inec" view at the trimmed result.
# Registered via the DataFrame API instead of the legacy SQLContext wrapper.
inec = quitaEspacios(2)
inec.createOrReplaceTempView("inec")

In [46]:
def minusculas(ind):
    """Return one of the registered views with its place-name column lower-cased.

    Args:
        ind: 1 selects the ``oij`` view (LOWER applied to ``Distrito``);
            any other value selects the ``inec`` view (LOWER applied to
            ``ProvinciaCantonDistrito``).

    Returns:
        The result of ``sqlContext.sql`` for the selected query. All other
        columns are passed through unchanged.
    """
    consulta_oij = "SELECT Delito, SubDelito,Hora, Fecha, Victima, SubVictima, Edad, Genero, Nacionalidad, Provincia,Canton, LOWER(Distrito) as Distrito from oij"
    consulta_inec = "SELECT LOWER(ProvinciaCantonDistrito) AS ProvinciaCantonDistrito, PoblacionMayor15, TasaNetaParticipacion, TasaOcupacion, TasaDesempleoAbierto, PorcentajeEconomicamenteInactivo, RelacionDependenciaEconomica from inec"
    # Anything other than 1 falls through to the INEC query.
    consulta = consulta_oij if ind == 1 else consulta_inec
    return sqlContext.sql(consulta)

In [47]:
# Lower-case Distrito and re-point the "oij" view at the result so later SQL sees it.
# Registered via the DataFrame API instead of the legacy SQLContext wrapper.
oij = minusculas(1)
oij.createOrReplaceTempView("oij")

In [48]:
# Lower-case ProvinciaCantonDistrito and re-point the "inec" view at the result.
# Registered via the DataFrame API instead of the legacy SQLContext wrapper.
inec = minusculas(2)
inec.createOrReplaceTempView("inec")

In [72]:
def sacaNoExistentes():
    """Return the distinct ``Distrito`` values of ``oij`` with no match in ``inec``.

    A value counts as matched when some ``inec.ProvinciaCantonDistrito`` is
    equal to it. A NULL ``Distrito`` can never satisfy that equality, so NULL
    is reported as unmatched as well.

    Returns:
        The result of ``sqlContext.sql`` with a single ``Distrito`` column.
    """
    consulta = "SELECT DISTINCT(Distrito) FROM oij WHERE NOT EXISTS(SELECT 1 FROM inec WHERE inec.ProvinciaCantonDistrito = oij.Distrito)"
    return sqlContext.sql(consulta)

In [77]:
# Compute the unmatched district names and expose them to SQL as "noCoincidencias".
# Registered via the DataFrame API instead of the legacy SQLContext wrapper.
noCoinciden = sacaNoExistentes()
noCoinciden.createOrReplaceTempView("noCoincidencias")

In [78]:
# Preview the unmatched district names (show() prints only the first 20 rows by default).
noCoinciden.show()

+-------------------+
|           Distrito|
+-------------------+
|             pococi|
|            guacimo|
|           la union|
|              belen|
|             sarchí|
|               null|
|        leon cortes|
|            tarrazu|
|           san jose|
|          san ramon|
|             aserri|
|vasquez de coronado|
|            paraiso|
|            tilaran|
|              canas|
|              limon|
|      perez zeledon|
|              tibas|
|          sarapiqui|
|             escazu|
+-------------------+
only showing top 20 rows



In [82]:
def cuentaNoExistentes():
    """Count the rows of the ``noCoincidencias`` view.

    Returns:
        The result of ``sqlContext.sql``: one row with a single ``num`` column.
    """
    consulta = "SELECT COUNT(*) as num FROM noCoincidencias"
    return sqlContext.sql(consulta)

In [83]:
# Number of distinct unmatched Distrito values (row count of the "noCoincidencias" view).
numNoCoinciden = cuentaNoExistentes()

In [84]:
# Display the single-row count of unmatched values.
numNoCoinciden.show()

+---+
|num|
+---+
| 21|
+---+



In [59]:
# Preview the first rows of the `oij` DataFrame.
oij.show()

+------+-----------+-------------------+----------+--------+--------------------+-------------+------+------------+----------+-------------+--------+
|Delito|  SubDelito|               Hora|     Fecha| Victima|          SubVictima|         Edad|Genero|Nacionalidad| Provincia|       Canton|Distrito|
+------+-----------+-------------------+----------+--------+--------------------+-------------+------+------------+----------+-------------+--------+
|ASALTO|ARMA BLANCA|18:00:00 - 20:59:59|2011-01-10| PERSONA|    PEATON [PERSONA]|Mayor de edad|HOMBRE|   NICARAGUA|  SAN JOSE|   ALAJUELITA|    null|
|ASALTO|ARMA BLANCA|03:00:00 - 05:59:59|2011-02-02| PERSONA|    PEATON [PERSONA]|Mayor de edad|HOMBRE|  COSTA RICA|   CARTAGO|     OREAMUNO|    null|
|ASALTO|ARMA BLANCA|21:00:00 - 23:59:59|2011-10-23| PERSONA|    PEATON [PERSONA]|Mayor de edad|HOMBRE|  COSTA RICA|   HEREDIA|      HEREDIA|    null|
|ASALTO|ARMA BLANCA|18:00:00 - 20:59:59|2011-05-19| PERSONA|    PEATON [PERSONA]|Mayor de edad| MUJE

In [60]:
# Preview the first rows of the `inec` DataFrame.
inec.show()

+-----------------------+----------------+---------------------+-------------+--------------------+--------------------------------+----------------------------+
|ProvinciaCantonDistrito|PoblacionMayor15|TasaNetaParticipacion|TasaOcupacion|TasaDesempleoAbierto|PorcentajeEconomicamenteInactivo|RelacionDependenciaEconomica|
+-----------------------+----------------+---------------------+-------------+--------------------+--------------------------------+----------------------------+
|             costa rica|         3233882|                 53.5|         51.7|                 3.4|                            46.5|                         1.5|
|               san josé|         1087315|                 56.0|         54.1|                 3.5|                            44.0|                         1.3|
|               san josé|          225856|                 56.7|         54.5|                 3.9|                            43.3|                         1.2|
|                 carmen|   