In [31]:
# Import findspark 
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
import pandas as pd
pd.read_csv("datasources/caso_covid_full.csv")

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,is_last,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
0,2020-12-31,AP,,state,68201,925,283,False,845731.0,861773.0,16.0,7914.03305,0.0136
1,2020-12-30,AP,,state,67702,919,282,False,845731.0,861773.0,16.0,7856.12917,0.0136
2,2020-12-29,AP,,state,67405,913,281,False,845731.0,861773.0,16.0,7821.66533,0.0135
3,2020-12-28,AP,,state,67149,907,280,False,845731.0,861773.0,16.0,7791.95914,0.0135
4,2020-12-27,AP,,state,66724,901,279,False,845731.0,861773.0,16.0,7742.64220,0.0135
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332236,2021-01-08,SP,Óleo,city,6,0,181,False,2496.0,2471.0,3533809.0,242.81667,0.0000
1332237,2021-01-07,SP,Óleo,city,6,0,180,False,2496.0,2471.0,3533809.0,242.81667,0.0000
1332238,2021-01-05,SP,Óleo,city,6,0,179,False,2496.0,2471.0,3533809.0,242.81667,0.0000
1332239,2021-01-04,SP,Óleo,city,6,0,178,False,2496.0,2471.0,3533809.0,242.81667,0.0000


In [50]:
spark.conf.set("spark.sql.shuffle.partitions", 8)

df = spark \
        .read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .option("sep", ",") \
        .csv("datasources/caso_covid_full.csv")

In [51]:
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- place_type: string (nullable = true)
 |-- confirmed: integer (nullable = true)
 |-- deaths: integer (nullable = true)
 |-- order_for_place: integer (nullable = true)
 |-- is_last: boolean (nullable = true)
 |-- estimated_population_2019: integer (nullable = true)
 |-- estimated_population: integer (nullable = true)
 |-- city_ibge_code: integer (nullable = true)
 |-- confirmed_per_100k_inhabitants: double (nullable = true)
 |-- death_rate: double (nullable = true)



## Data Preprocessing

In [52]:
df = df \
        .filter(df.place_type == "city") \
        .drop("is_last", "estimated_population_2019", "order_for_place")

### Write transformed Data

In [53]:
df \
    .write \
    .mode("overwrite") \
    .option("compression", "snappy") \
    .parquet("datasources/casos_covid_preprocess.parquet")

In [54]:
test_df = spark.read.parquet("datasources/casos_covid_preprocess.parquet")
print(f"Total Records: {test_df.count():,}")

Total Records: 1,323,917


NameError: name 'dbutils' is not defined

## Create grouping

In [40]:
df \
  .filter(df.place_type == "city") \
  .groupBy("state", "city") \
  .agg({"confirmed": "sum", "deaths": "sum", "confirmed_per_100k_inhabitants": "avg"}) \
  .toPandas()

Unnamed: 0,state,city,avg(confirmed_per_100k_inhabitants),sum(confirmed),sum(deaths)
0,ES,Guaçuí,1159.101704,94152,4605
1,GO,Damianópolis,413.270474,2309,100
2,BA,Carinhanha,425.536772,27957,686
3,BA,Igaporã,235.536014,7446,396
4,BA,Itaberaba,2333.367218,411801,7531
...,...,...,...,...,...
5584,RS,Novo Cabrais,634.995828,4504,0
5585,TO,Pindorama do Tocantins,782.607853,8078,0
5586,SP,Guararema,997.742025,77876,4671
5587,SP,Palestina,738.914440,21042,753


In [41]:
df.toPandas()['place_type'].value_counts()

city    1323917
Name: place_type, dtype: int64

In [42]:
df \
  .where(col("city").isNull()).toPandas()["place_type"].value_counts()

Series([], Name: place_type, dtype: int64)

In [43]:
df \
  .where(col("city").isNotNull()).toPandas()["place_type"].value_counts()

city    1323917
Name: place_type, dtype: int64

In [44]:
df.where(col("place_type") == "city").where(col("city").isNull()).collect()

[]