# Data Inspection
Author(s): Tomaso Stefanizzi

Description:
The dataset used for this project contains historical data about Brazilian flights from 2000 to 2021.
You can find the source 
[here](https://www.kaggle.com/datasets/mayconfelipemota/brazilian-data-flights).

### Imports

In [83]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os as os

### read data

In [84]:
# Locations of the five dimension tables, relative to the notebook.
dest_path = 'dataset/DW_ARPT_DEST.csv'
orig_path = 'dataset/DW_ARPT_ORIGEM.csv'
company_path = 'dataset/DW_EMPRESA.csv'
equip_path = 'dataset/DW_EQPT.csv'
line_path = 'dataset/DW_TIPO_LINHA.csv'

# Load every table with latin1 decoding, in the order listed above.
# The flights table (voos) is read separately in the Flights section.
dest, orig, company, equip, line = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (dest_path, orig_path, company_path, equip_path, line_path)
)

# Tables inspection

## Airport Destination

In [85]:
# Preview the first rows of the destination-airport table.
dest.head()

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
0,2020-12-28T16:10:21,162,SBKP,VCP,VIRACOPOS,CAMPINAS,SP,SUDESTE,BRASIL,AMÉRICA DO SUL,
1,2020-12-28T16:10:21,230,GOOY,DKR,DACAR,DACAR,,,SENEGAL,ÁFRICA,
2,2020-12-28T16:10:21,275,EDDF,FRA,FRANKFURT INTERNATIONAL AIRPORT,FRANKFURT AM MAIN,,,ALEMANHA,EUROPA,
3,2020-12-28T16:10:21,467,KMIA,MIA,MIAMI INTERNATIONAL AIRPORT,"MIAMI, FLORIDA",,,ESTADOS UNIDOS DA AMÉRICA,AMÉRICA DO NORTE,
4,2020-12-28T16:10:21,626,SBGL,GIG,AEROPORTO INTERNACIONAL DO RIO DE JANEIRO (GAL...,RIO DE JANEIRO,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,


In [86]:
# Summarise column dtypes and non-null counts of the destination-airport table.
dest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1549 entries, 0 to 1548
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DT_CARGA_DW            1549 non-null   object 
 1   id_aerodromo_destino   1549 non-null   int64  
 2   sg_icao_destino        1549 non-null   object 
 3   sg_iata_destino        1253 non-null   object 
 4   nm_aerodromo_destino   1549 non-null   object 
 5   nm_municipio_destino   1549 non-null   object 
 6   sg_uf_destino          840 non-null    object 
 7   nm_regiao_destino      841 non-null    object 
 8   nm_pais_destino        1549 non-null   object 
 9   nm_continente_destino  1549 non-null   object 
 10  nr_escala_destino      0 non-null      float64
dtypes: float64(1), int64(1), object(9)
memory usage: 133.2+ KB


#### Let's see if there are duplicates in the id (which I assume to be ```id_aerodromo_destino```)

In [87]:
# Count the rows per airport id; any count above 1 means the id is duplicated.
dest['id_aerodromo_destino'].value_counts()


id_aerodromo_destino
34       5
21243    5
10       5
1292     5
162      4
        ..
129      1
77       1
392      1
158      1
3233     1
Name: count, Length: 1025, dtype: int64

and indeed there are :D let's keep just one row per id (preferably the one with the most complete information)

In [88]:
# Show every row recorded for airport id 34.
dest.loc[dest['id_aerodromo_destino'].eq(34)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
96,2020-12-28T16:10:21,34,SDAG,,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,
1023,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,
1166,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,
1288,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,
1452,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,


In [89]:
# Keep only the first occurrence of each fully identical row.
dest = dest[~dest.duplicated()]

In [90]:
# Re-check the row count and null counts after removing exact duplicate rows.
dest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1035 entries, 0 to 1484
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DT_CARGA_DW            1035 non-null   object 
 1   id_aerodromo_destino   1035 non-null   int64  
 2   sg_icao_destino        1035 non-null   object 
 3   sg_iata_destino        759 non-null    object 
 4   nm_aerodromo_destino   1035 non-null   object 
 5   nm_municipio_destino   1035 non-null   object 
 6   sg_uf_destino          497 non-null    object 
 7   nm_regiao_destino      498 non-null    object 
 8   nm_pais_destino        1035 non-null   object 
 9   nm_continente_destino  1035 non-null   object 
 10  nr_escala_destino      0 non-null      float64
dtypes: float64(1), int64(1), object(9)
memory usage: 97.0+ KB


In [91]:
# Count rows per airport id and list only the ids that still occur more than once.
dupl = dest['id_aerodromo_destino'].value_counts()
dupl.loc[dupl.gt(1)]

id_aerodromo_destino
802      3
17       2
34       2
10       2
1292     2
1479     2
208      2
21243    2
338      2
Name: count, dtype: int64

#### id 802:

In [92]:
# Show the conflicting rows for airport id 802.
dest.loc[dest['id_aerodromo_destino'].eq(802)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
111,2020-12-28T16:10:21,802,SBQV,,VITÓRIA DA CONQUISTA,VITÓRIA DA CONQUISTA,BA,NORDESTE,BRASIL,AMÉRICA DO SUL,
456,2020-12-28T16:10:21,802,SBQV,VDC,VITÓRIA DA CONQUISTA,VITÓRIA DA CONQUISTA,BA,NORDESTE,BRASIL,AMÉRICA DO SUL,
1244,2020-12-28T16:10:21,802,SBQV,VD*,VITÓRIA DA CONQUISTA,VITÓRIA DA CONQUISTA,BA,NORDESTE,BRASIL,AMÉRICA DO SUL,


In [93]:
# Keep row 456 (IATA 'VDC'); drop row 111 (missing IATA) and row 1244 (IATA 'VD*').
dest = dest.drop([111,1244])


#### id 17:


In [94]:
# Show the conflicting rows for airport id 17.
dest.loc[dest['id_aerodromo_destino'].eq(17)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
399,2020-12-28T16:10:21,17,SNYA,SYL,ALMEIRIM,ALMEIRIM,PA,NORTE,BRASIL,AMÉRICA DO SUL,
1155,2020-12-28T16:10:21,17,SNYA,GGF,ALMEIRIM,ALMEIRIM,PA,NORTE,BRASIL,AMÉRICA DO SUL,


I kept the correct code

In [95]:
# Row 1155 (IATA 'GGF') stays; row 399 (IATA 'SYL') is discarded.
dest = dest.drop(index=399)

#### id 34:

In [96]:
# Show the rows still recorded for airport id 34 after exact duplicates were removed.
dest.loc[dest['id_aerodromo_destino'].eq(34)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
96,2020-12-28T16:10:21,34,SDAG,,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,
1023,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL,


In [97]:
# Row 1023 (IATA 'QAR') stays; row 96 has no IATA code and is discarded.
dest = dest.drop(index=96)

#### id 10:

In [98]:
# Show the conflicting rows for airport id 10.
dest.loc[dest['id_aerodromo_destino'].eq(10)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
639,2020-12-28T16:10:21,10,SWHP,,OLHOS D´ÁGUA,ÁGUA BOA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL,
1022,2020-12-28T16:10:21,10,SWHP,GGB,OLHOS D´ÁGUA,ÁGUA BOA,MG,SUDESTE,BRASIL,AMÉRICA DO SUL,


In [99]:
# Row 1022 (IATA 'GGB') stays; row 639 has no IATA code and is discarded.
# NOTE(review): the two rows disagree on state/region (639: MT / CENTRO-OESTE,
# 1022: MG / SUDESTE) -- confirm that the kept row carries the right state.
dest = dest.drop(index=639)

#### id 1292:

In [100]:
# Show the conflicting rows for airport id 1292.
dest.loc[dest['id_aerodromo_destino'].eq(1292)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
1009,2020-12-28T16:10:21,1292,SNRJ,,BREJO,BREJO,MA,NORDESTE,BRASIL,AMÉRICA DO SUL,
1021,2020-12-28T16:10:21,1292,SNRJ,WQB,BREJO,BREJO,MA,NORDESTE,BRASIL,AMÉRICA DO SUL,


In [101]:
# Row 1021 (IATA 'WQB') stays; row 1009 has no IATA code and is discarded.
dest = dest.drop(index=1009)

#### id 1479:

In [102]:
# Show the conflicting rows for airport id 1479.
dest.loc[dest['id_aerodromo_destino'].eq(1479)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
996,2020-12-28T16:10:21,1479,SWBG,,PONTES E LACERDA,PONTES E LACERDA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL,
1026,2020-12-28T16:10:21,1479,SWBG,LCB,PONTES E LACERDA,PONTES E LACERDA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL,


In [103]:
# Row 1026 (IATA 'LCB') stays; row 996 has no IATA code and is discarded.
dest = dest.drop(index=996)

#### id 208:

In [104]:
# Show the conflicting rows for airport id 208.
dest.loc[dest['id_aerodromo_destino'].eq(208)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
698,2020-12-28T16:10:21,208,SBUY,,URUCU,COARI,AM,NORTE,BRASIL,AMÉRICA DO SUL,
1194,2020-12-28T16:10:21,208,SBUY,RPU,URUCU,COARI,AM,NORTE,BRASIL,AMÉRICA DO SUL,


In [105]:
# Row 1194 (IATA 'RPU') stays; row 698 has no IATA code and is discarded.
dest = dest.drop(index=698)

#### id 21243:

also here i kept the correct code

In [106]:
# Show the conflicting rows for airport id 21243.
dest.loc[dest['id_aerodromo_destino'].eq(21243)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
973,2020-12-28T16:10:21,21243,LTFM,IST,ISTANBUL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA,
976,2020-12-28T16:10:21,21243,LTFM,ISL,ISTANBUL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA,


In [107]:
# Row 973 (IATA 'IST') stays; row 976 (IATA 'ISL') is discarded.
dest = dest.drop(index=976)

#### id 338:

and so here

In [108]:
# Show the conflicting rows for airport id 338.
dest.loc[dest['id_aerodromo_destino'].eq(338)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_destino,sg_icao_destino,sg_iata_destino,nm_aerodromo_destino,nm_municipio_destino,sg_uf_destino,nm_regiao_destino,nm_pais_destino,nm_continente_destino,nr_escala_destino
652,2020-12-28T16:10:21,338,LTBA,IST,ATATÜRK INTERNATIONAL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA,
656,2020-12-28T16:10:21,338,LTBA,ISL,ATATÜRK INTERNATIONAL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA,


In [109]:
# Row 656 (IATA 'ISL') stays; row 652 (IATA 'IST') is discarded.
dest = dest.drop(index=652)

## Origin

In [110]:
# Preview the first rows of the origin-airport table.
orig.head()

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
0,2020-12-28T16:10:21,230,GOOY,DKR,DACAR,DACAR,,,SENEGAL,ÁFRICA
1,2020-12-28T16:10:21,275,EDDF,FRA,FRANKFURT INTERNATIONAL AIRPORT,FRANKFURT AM MAIN,,,ALEMANHA,EUROPA
2,2020-12-28T16:10:21,162,SBKP,VCP,VIRACOPOS,CAMPINAS,SP,SUDESTE,BRASIL,AMÉRICA DO SUL
3,2020-12-28T16:10:21,301,SBGR,GRU,GUARULHOS - GOVERNADOR ANDRÉ FRANCO MONTORO,GUARULHOS,SP,SUDESTE,BRASIL,AMÉRICA DO SUL
4,2020-12-28T16:10:21,500,KJFK,JFK,JOHN F. KENNEDY INTERNATIONAL AIRPORT,"NEW YORK, NEW YORK",,,ESTADOS UNIDOS DA AMÉRICA,AMÉRICA DO NORTE


In [111]:
# Summarise column dtypes and non-null counts of the origin-airport table.
orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1524 entries, 0 to 1523
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DT_CARGA_DW           1524 non-null   object
 1   id_aerodromo_origem   1524 non-null   int64 
 2   sg_icao_origem        1524 non-null   object
 3   sg_iata_origem        1239 non-null   object
 4   nm_aerodromo_origem   1524 non-null   object
 5   nm_municipio_origem   1524 non-null   object
 6   sg_uf_origem          837 non-null    object
 7   nm_regiao_origem      838 non-null    object
 8   nm_pais_origem        1524 non-null   object
 9   nm_continente_origem  1524 non-null   object
dtypes: int64(1), object(9)
memory usage: 119.2+ KB


#### also here, let's see if the ids are unique

In [112]:
# Count rows per origin-airport id and list only the ids that occur more than once.
dupl = orig['id_aerodromo_origem'].value_counts()
dupl.loc[dupl.gt(1)]

id_aerodromo_origem
10       5
34       5
21243    5
1292     5
683      4
        ..
251      2
211      2
338      2
419      2
18313    2
Name: count, Length: 206, dtype: int64

There are too many duplicates to resolve them all by hand. To stay consistent with the destination table, I first resolve the same ids that were fixed manually there, keeping the same values (strategy 1); for all the remaining duplicated ids I'll use a different strategy (strategy 2). The ids fixed manually in the destination table were:

-   802      3
-   17       2
-   34       2
-   10       2
-   1292     2
-   1479     2
-   208      2
-   21243    2
-   338      2

#### strategy 1:

##### id 802:

In [113]:
# Show every row recorded for origin-airport id 802.
orig.loc[orig['id_aerodromo_origem'].eq(802)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
119,2020-12-28T16:10:21,802,SBQV,,VITÓRIA DA CONQUISTA,VITÓRIA DA CONQUISTA,BA,NORDESTE,BRASIL,AMÉRICA DO SUL
445,2020-12-28T16:10:21,802,SBQV,VDC,VITÓRIA DA CONQUISTA,VITÓRIA DA CONQUISTA,BA,NORDESTE,BRASIL,AMÉRICA DO SUL
1211,2020-12-28T16:10:21,802,SBQV,VD*,VITÓRIA DA CONQUISTA,VITÓRIA DA CONQUISTA,BA,NORDESTE,BRASIL,AMÉRICA DO SUL
1482,2020-12-28T16:10:21,802,SBQV,VD*,VITÓRIA DA CONQUISTA,VITÓRIA DA CONQUISTA,BA,NORDESTE,BRASIL,AMÉRICA DO SUL


In [114]:
# Row 445 (IATA 'VDC') stays; row 119 (missing IATA) and rows 1211 and 1482
# (IATA 'VD*') are discarded.
orig = orig.drop(index=[119, 1211, 1482])

##### id 17:

In [115]:
# Show every row recorded for origin-airport id 17.
orig.loc[orig['id_aerodromo_origem'].eq(17)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
395,2020-12-28T16:10:21,17,SNYA,SYL,ALMEIRIM,ALMEIRIM,PA,NORTE,BRASIL,AMÉRICA DO SUL
1127,2020-12-28T16:10:21,17,SNYA,GGF,ALMEIRIM,ALMEIRIM,PA,NORTE,BRASIL,AMÉRICA DO SUL
1246,2020-12-28T16:10:21,17,SNYA,GGF,ALMEIRIM,ALMEIRIM,PA,NORTE,BRASIL,AMÉRICA DO SUL
1393,2020-12-28T16:10:21,17,SNYA,GGF,ALMEIRIM,ALMEIRIM,PA,NORTE,BRASIL,AMÉRICA DO SUL


In [116]:
# Row 1127 (IATA 'GGF') stays; row 395 (IATA 'SYL') and the repeated 'GGF'
# rows 1246 and 1393 are discarded.
orig = orig.drop(index=[395, 1246, 1393])

##### id 34:

In [117]:
# Show every row recorded for origin-airport id 34.
orig.loc[orig['id_aerodromo_origem'].eq(34)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
53,2020-12-28T16:10:21,34,SDAG,,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL
993,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL
1134,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL
1266,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL
1427,2020-12-28T16:10:21,34,SDAG,QAR,ANGRA DOS REIS,ANGRA DOS REIS,RJ,SUDESTE,BRASIL,AMÉRICA DO SUL


In [118]:
# Row 993 (IATA 'QAR') stays; row 53 (missing IATA) and the repeated 'QAR'
# rows 1134, 1266 and 1427 are discarded.
orig = orig.drop(index=[53, 1134, 1266, 1427])

##### id 10:

In [119]:
# Show every row recorded for origin-airport id 10.
orig.loc[orig['id_aerodromo_origem'].eq(10)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
632,2020-12-28T16:10:21,10,SWHP,,OLHOS D´ÁGUA,ÁGUA BOA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL
994,2020-12-28T16:10:21,10,SWHP,GGB,OLHOS D´ÁGUA,ÁGUA BOA,MG,SUDESTE,BRASIL,AMÉRICA DO SUL
1129,2020-12-28T16:10:21,10,SWHP,GGB,OLHOS D´ÁGUA,ÁGUA BOA,MG,SUDESTE,BRASIL,AMÉRICA DO SUL
1240,2020-12-28T16:10:21,10,SWHP,GGB,OLHOS D´ÁGUA,ÁGUA BOA,MG,SUDESTE,BRASIL,AMÉRICA DO SUL
1418,2020-12-28T16:10:21,10,SWHP,GGB,OLHOS D´ÁGUA,ÁGUA BOA,MG,SUDESTE,BRASIL,AMÉRICA DO SUL


In [120]:
# Row 994 (IATA 'GGB') stays; row 632 (missing IATA) and the repeated 'GGB'
# rows 1129, 1240 and 1418 are discarded.
# NOTE(review): row 632 says MT / CENTRO-OESTE while the kept row says
# MG / SUDESTE -- confirm that the kept row carries the right state.
orig = orig.drop(index=[632, 1129, 1240, 1418])

##### id 1292:

In [121]:
# Show every row recorded for origin-airport id 1292.
orig.loc[orig['id_aerodromo_origem'].eq(1292)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
982,2020-12-28T16:10:21,1292,SNRJ,,BREJO,BREJO,MA,NORDESTE,BRASIL,AMÉRICA DO SUL
992,2020-12-28T16:10:21,1292,SNRJ,WQB,BREJO,BREJO,MA,NORDESTE,BRASIL,AMÉRICA DO SUL
1135,2020-12-28T16:10:21,1292,SNRJ,WQB,BREJO,BREJO,MA,NORDESTE,BRASIL,AMÉRICA DO SUL
1256,2020-12-28T16:10:21,1292,SNRJ,WQB,BREJO,BREJO,MA,NORDESTE,BRASIL,AMÉRICA DO SUL
1409,2020-12-28T16:10:21,1292,SNRJ,WQB,BREJO,BREJO,MA,NORDESTE,BRASIL,AMÉRICA DO SUL


In [122]:
# Row 992 (IATA 'WQB') stays; row 982 (missing IATA) and the repeated 'WQB'
# rows 1135, 1256 and 1409 are discarded.
orig = orig.drop(index=[982, 1135, 1256, 1409])

##### id 1479:

In [123]:
# Show every row recorded for origin-airport id 1479.
orig.loc[orig['id_aerodromo_origem'].eq(1479)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
965,2020-12-28T16:10:21,1479,SWBG,,PONTES E LACERDA,PONTES E LACERDA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL
997,2020-12-28T16:10:21,1479,SWBG,LCB,PONTES E LACERDA,PONTES E LACERDA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL
1113,2020-12-28T16:10:21,1479,SWBG,LCB,PONTES E LACERDA,PONTES E LACERDA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL
1323,2020-12-28T16:10:21,1479,SWBG,LCB,PONTES E LACERDA,PONTES E LACERDA,MT,CENTRO-OESTE,BRASIL,AMÉRICA DO SUL


In [124]:
# Row 997 (IATA 'LCB') stays; row 965 (missing IATA) and the repeated 'LCB'
# rows 1113 and 1323 are discarded. The list previously contained 113, which
# is not one of the rows of id 1479 and silently removed an unrelated airport.
orig = orig.drop([965, 1113, 1323])

##### id 208:

In [125]:
# Show every row recorded for origin-airport id 208.
orig.loc[orig['id_aerodromo_origem'].eq(208)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
692,2020-12-28T16:10:21,208,SBUY,,URUCU,COARI,AM,NORTE,BRASIL,AMÉRICA DO SUL
1170,2020-12-28T16:10:21,208,SBUY,RPU,URUCU,COARI,AM,NORTE,BRASIL,AMÉRICA DO SUL
1173,2020-12-28T16:10:21,208,SBUY,RPU,URUCU,COARI,AM,NORTE,BRASIL,AMÉRICA DO SUL
1475,2020-12-28T16:10:21,208,SBUY,RPU,URUCU,COARI,AM,NORTE,BRASIL,AMÉRICA DO SUL


In [126]:
# Row 1170 (IATA 'RPU') stays; row 692 (missing IATA) and the repeated 'RPU'
# rows 1173 and 1475 are discarded.
orig = orig.drop(index=[692, 1173, 1475])

##### id 21243:

In [127]:
# Show every row recorded for origin-airport id 21243.
orig.loc[orig['id_aerodromo_origem'].eq(21243)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
941,2020-12-28T16:10:21,21243,LTFM,IST,ISTANBUL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA
944,2020-12-28T16:10:21,21243,LTFM,ISL,ISTANBUL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA
1020,2020-12-28T16:10:21,21243,LTFM,IST,ISTANBUL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA
1198,2020-12-28T16:10:21,21243,LTFM,IST,ISTANBUL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA
1364,2020-12-28T16:10:21,21243,LTFM,IST,ISTANBUL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA


In [128]:
# Row 1198 (IATA 'IST') stays; row 944 (IATA 'ISL') and the repeated 'IST'
# rows 941, 1020 and 1364 are discarded.
orig = orig.drop(index=[941, 944, 1020, 1364])

##### id 338:

In [129]:
# Show every row recorded for origin-airport id 338.
orig.loc[orig['id_aerodromo_origem'].eq(338)]

Unnamed: 0,DT_CARGA_DW,id_aerodromo_origem,sg_icao_origem,sg_iata_origem,nm_aerodromo_origem,nm_municipio_origem,sg_uf_origem,nm_regiao_origem,nm_pais_origem,nm_continente_origem
648,2020-12-28T16:10:21,338,LTBA,IST,ATATÜRK INTERNATIONAL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA
652,2020-12-28T16:10:21,338,LTBA,ISL,ATATÜRK INTERNATIONAL AIRPORT,ISTANBUL,,,TURQUIA,EUROPA


In [130]:
# Row 652 (IATA 'ISL') stays; row 648 (IATA 'IST') is discarded.
orig = orig.drop(index=648)

#### Strategy 2

Using groupby() together with value_counts(), we can find the most frequent value in a series. For each group (one per id) we take the most frequent non-null value of each column, with every column resolved independently. The result is a new DataFrame with one row per id holding its most popular attributes.

In [144]:
def most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    # value_counts() ignores NaN and sorts by descending count, so the first
    # index label is the most common value. Computed once per column.
    counts = values.value_counts()
    return counts.index[0] if not counts.empty else np.nan


# Collapse every origin-airport id to a single row that holds, column by
# column, the most frequent value seen for that id.
df_grouped = orig.groupby('id_aerodromo_origem', as_index=False)
most_popular = df_grouped.agg(most_frequent)
orig = most_popular

## Companies

In [147]:
# Preview the first rows of the airline-company table.
company.head()

Unnamed: 0,DT_CARGA_DW,id_empresa,sg_empresa_icao,sg_empresa_iata,nm_empresa,ds_tipo_empresa
0,2020-12-28T16:10:21,1001391,BLC,,TAM-TRANSPORTE AEREOS MERIDIONAIS,TRANSPORTE AÉREO REGULAR
1,2020-12-28T16:10:21,1001672,ITB,,INTERBRASIL,TRANSPORTE AÉREO REGULAR
2,2020-12-28T16:10:21,1001663,RSL,SL,RIO SUL,TRANSPORTE AÉREO REGULAR
3,2020-12-28T16:10:21,1001428,TBA,TR,TRANSBRASIL,TRANSPORTE AÉREO REGULAR
4,2020-12-28T16:10:21,1007106,VRG,RG,VARIG - VIAÇÃO AÉREA RIO-GRANDENSE,TRANSPORTE AÉREO REGULAR


In [148]:
# Summarise column dtypes and non-null counts of the company table.
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   DT_CARGA_DW      458 non-null    object
 1   id_empresa       458 non-null    int64 
 2   sg_empresa_icao  458 non-null    object
 3   sg_empresa_iata  373 non-null    object
 4   nm_empresa       458 non-null    object
 5   ds_tipo_empresa  458 non-null    object
dtypes: int64(1), object(5)
memory usage: 21.6+ KB


#### As before, I check whether there are duplicated ids; if there are, I use the same approach as strategy 2 for the origin dataset

In [161]:
# Count rows per company id and list only the ids that occur more than once.
dupl = company['id_empresa'].value_counts()
dupl.loc[dupl.gt(1)]

id_empresa
1001374    5
1000909    5
1000831    5
1000355    4
1000365    4
          ..
1000711    2
1007200    2
1000100    2
1007305    2
1006477    2
Name: count, Length: 78, dtype: int64

since there are a lot of duplicates, I proceed as before

In [163]:
def most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    # value_counts() ignores NaN and sorts by descending count, so the first
    # index label is the most common value. Computed once per column.
    counts = values.value_counts()
    return counts.index[0] if not counts.empty else np.nan


# Collapse every company id to a single row that holds, column by column,
# the most frequent value seen for that id.
df_grouped = company.groupby('id_empresa', as_index=False)
most_popular = df_grouped.agg(most_frequent)
company = most_popular

company.head()

Unnamed: 0,id_empresa,DT_CARGA_DW,sg_empresa_icao,sg_empresa_iata,nm_empresa,ds_tipo_empresa
0,1000002,2020-12-28T16:10:21,AEA,UX,AIR EUROPA LINEAS AEREAS SOCIEDAD ANONIMA,ESTRANGEIRA REGULAR
1,1000004,2020-12-28T16:10:21,MWM,WD,MODERN TRANSPORTE AÉREO DE CARGA S.A.,TRANSPORTE AÉREO REGULAR
2,1000010,2020-12-28T16:10:21,STR,,STERNA LINHAS AÉREAS LTDA.,TRANSPORTE AÉREO REGULAR
3,1000049,2020-12-28T16:10:21,VCV,V0,CONSORCIO VENEZOLANO DE INDUSTRIAS AERONAUTICA...,ESTRANGEIRA NÃO REGULAR
4,1000077,2020-12-28T16:10:21,LOT,LO,LOT POLISH AIRLINES,ESTRANGEIRA NÃO REGULAR


In [164]:
# Inspect the company table after collapsing duplicated ids.
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id_empresa       265 non-null    int64 
 1   DT_CARGA_DW      265 non-null    object
 2   sg_empresa_icao  265 non-null    object
 3   sg_empresa_iata  194 non-null    object
 4   nm_empresa       265 non-null    object
 5   ds_tipo_empresa  265 non-null    object
dtypes: int64(1), object(5)
memory usage: 12.6+ KB


## Equipment

In [167]:
# Preview the first rows of the equipment (aircraft model) table.
equip.head()

Unnamed: 0,DT_CARGA_DW,id_equipamento,sg_equipamento_icao,ds_modelo
0,2020-12-28T16:10:21,12,A310,AIRBUS A310
1,2020-12-28T16:10:21,375,MD88,MCDONNELL DOUGLAS MD88
2,2020-12-28T16:10:21,285,F100,FOKKER 100
3,2020-12-28T16:10:21,264,E120,EMBRAER EMB.120 BRASILIA
4,2020-12-28T16:10:21,549,F50,FOKKER 50


In [168]:
# Summarise column dtypes and non-null counts of the equipment table.
equip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   DT_CARGA_DW          232 non-null    object
 1   id_equipamento       232 non-null    int64 
 2   sg_equipamento_icao  231 non-null    object
 3   ds_modelo            229 non-null    object
dtypes: int64(1), object(3)
memory usage: 7.4+ KB


In [169]:
# Count the rows per equipment id; any count above 1 means the id is duplicated.
equip['id_equipamento'].value_counts()

id_equipamento
113    5
98     5
101    5
117    5
116    5
      ..
330    1
327    1
25     1
508    1
527    1
Name: count, Length: 92, dtype: int64

In [174]:
def most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    # value_counts() ignores NaN and sorts by descending count, so the first
    # index label is the most common value. Computed once per column.
    counts = values.value_counts()
    return counts.index[0] if not counts.empty else np.nan


# Collapse every equipment id to a single row that holds, column by column,
# the most frequent value seen for that id.
df_grouped = equip.groupby('id_equipamento', as_index=False)
most_popular = df_grouped.agg(most_frequent)
equip = most_popular

equip.head()

Unnamed: 0,id_equipamento,DT_CARGA_DW,sg_equipamento_icao,ds_modelo
0,0,2020-12-28T16:10:21,,
1,5,2020-12-28T16:10:21,A124,ANTONOV AN-124 RUSLAN
2,11,2020-12-28T16:10:21,A30B,AIRBUS INDUSTRIE A300C4/F4 FREIGHTER
3,12,2020-12-28T16:10:21,A310,AIRBUS A310
4,13,2020-12-28T16:10:21,A318,AIRBUS A318


In [175]:
# Inspect the equipment table after collapsing duplicated ids.
equip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_equipamento       92 non-null     int64 
 1   DT_CARGA_DW          92 non-null     object
 2   sg_equipamento_icao  91 non-null     object
 3   ds_modelo            90 non-null     object
dtypes: int64(1), object(3)
memory usage: 3.0+ KB


## Route Type

In [176]:
# Preview the first rows of the route-type table.
line.head()

Unnamed: 0,DT_CARGA_DW,id_tipo_linha,cd_tipo_linha,ds_tipo_linha,ds_natureza_tipo_linha,ds_servico_tipo_linha
0,2020-12-28T16:10:21,8,L,DOMÉSTICA REDE POSTAL,DOMÉSTICA,CARGUEIRO
1,2020-12-28T16:10:21,6,H,INTERNACIONAL SUB-REGIONAL,INTERNACIONAL,PASSAGEIRO
2,2020-12-28T16:10:21,1,N,DOMÉSTICA MISTA,DOMÉSTICA,PASSAGEIRO
3,2020-12-28T16:10:21,2,C,DOMÉSTICA CARGUEIRA,DOMÉSTICA,CARGUEIRO
4,2020-12-28T16:10:21,3,I,INTERNACIONAL MISTA,INTERNACIONAL,PASSAGEIRO


In [177]:
# Summarise column dtypes and non-null counts of the route-type table.
line.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   DT_CARGA_DW             24 non-null     object
 1   id_tipo_linha           24 non-null     int64 
 2   cd_tipo_linha           24 non-null     object
 3   ds_tipo_linha           24 non-null     object
 4   ds_natureza_tipo_linha  24 non-null     object
 5   ds_servico_tipo_linha   24 non-null     object
dtypes: int64(1), object(5)
memory usage: 1.3+ KB


In [178]:
# Count the rows per route-type id; any count above 1 means the id is duplicated.
line['id_tipo_linha'].value_counts()

id_tipo_linha
1    4
2    4
3    4
0    4
4    4
8    1
6    1
5    1
7    1
Name: count, dtype: int64

In [184]:
def most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    # value_counts() ignores NaN and sorts by descending count, so the first
    # index label is the most common value. Computed once per column.
    counts = values.value_counts()
    return counts.index[0] if not counts.empty else np.nan


# Collapse every route-type id to a single row that holds, column by column,
# the most frequent value seen for that id.
df_grouped = line.groupby('id_tipo_linha', as_index=False)
most_popular = df_grouped.agg(most_frequent)
line = most_popular

line.head()

Unnamed: 0,id_tipo_linha,DT_CARGA_DW,cd_tipo_linha,ds_tipo_linha,ds_natureza_tipo_linha,ds_servico_tipo_linha
0,0,2020-12-28T16:10:21,X,NÃO IDENTIFICADA,NÃO IDENTIFICADA,NÃO IDENTIFICADO
1,1,2020-12-28T16:10:21,N,DOMÉSTICA MISTA,DOMÉSTICA,PASSAGEIRO
2,2,2020-12-28T16:10:21,C,DOMÉSTICA CARGUEIRA,DOMÉSTICA,CARGUEIRO
3,3,2020-12-28T16:10:21,I,INTERNACIONAL MISTA,INTERNACIONAL,PASSAGEIRO
4,4,2020-12-28T16:10:21,G,INTERNACIONAL CARGUEIRA,INTERNACIONAL,CARGUEIRO


In [185]:
# Inspect the route-type table after collapsing duplicated ids.
line.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id_tipo_linha           9 non-null      int64 
 1   DT_CARGA_DW             9 non-null      object
 2   cd_tipo_linha           9 non-null      object
 3   ds_tipo_linha           9 non-null      object
 4   ds_natureza_tipo_linha  9 non-null      object
 5   ds_servico_tipo_linha   9 non-null      object
dtypes: int64(1), object(5)
memory usage: 564.0+ bytes


## Flights

In [67]:
# Load the flights table with latin1 decoding.
# NOTE(review): the cell output echoes the read_csv line, which suggests pandas
# emitted a warning here (possibly a mixed-dtype warning) -- confirm and
# consider passing explicit dtypes.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, so it was run out of order -- confirm with Restart & Run All.
voos_path = 'dataset/DW_VOOS.csv'
voos = pd.read_csv(voos_path, encoding='latin1')

  voos = pd.read_csv(voos_path, encoding='latin1')


In [143]:
# Preview the first rows of the flights table.
voos.head()

Unnamed: 0,id_basica,id_empresa,nr_voo,nr_singular,id_di,cd_di,ds_di,ds_grupo_di,dt_referencia,nr_semana_referencia,...,nr_correio_km,nr_bagagem_paga_km,nr_bagagem_gratis_km,nr_ask,nr_rpk,nr_atk,nr_rtk,id_arquivo,nr_linha,dt_sistema
0,20204791,1001208,270,750,5,3,RETORNO,IMPRODUTIVO,2000-01-05,2,...,0.0,0.0,0.0,0.0,0.0,582340.0,463844.0,15757,141.0,2020-02-06T05:26:59
1,20204792,1001208,270,750,5,3,RETORNO,IMPRODUTIVO,2000-01-05,2,...,0.0,0.0,0.0,0.0,0.0,503250.0,400847.0,15757,139.0,2020-02-06T05:26:59
2,20204793,1001208,265,750,5,3,RETORNO,IMPRODUTIVO,2000-01-30,6,...,0.0,0.0,0.0,0.0,0.0,503250.0,56222.0,15757,138.0,2020-02-06T05:26:59
3,20204794,1001208,265,750,5,3,RETORNO,IMPRODUTIVO,2000-01-30,6,...,0.0,0.0,0.0,0.0,0.0,582340.0,65057.0,15757,136.0,2020-02-06T05:26:59
4,20204813,1001208,264,750,5,3,RETORNO,IMPRODUTIVO,2000-01-23,5,...,0.0,0.0,0.0,0.0,0.0,582340.0,343278.0,15757,102.0,2020-02-06T05:26:59


In [69]:
# List every column name, since head() elides the middle columns of this wide table.
voos.columns

Index(['id_basica', 'id_empresa', 'nr_voo', 'nr_singular', 'id_di', 'cd_di',
       'ds_di', 'ds_grupo_di', 'dt_referencia', 'nr_semana_referencia',
       'id_tipo_linha', 'ds_natureza_etapa', 'hr_partida_real',
       'dt_partida_real', 'nr_semana_partida_real', 'id_aerodromo_origem',
       'nr_etapa', 'hr_chegada_real', 'dt_chegada_real',
       'nr_semana_chegada_real', 'id_equipamento', 'ds_modelo', 'ds_matricula',
       'id_aerodromo_destino', 'lt_combustivel', 'nr_assentos_ofertados',
       'kg_payload', 'km_distancia', 'nr_passag_pagos', 'nr_passag_gratis',
       'kg_bagagem_livre', 'kg_bagagem_excesso', 'kg_carga_paga',
       'kg_carga_gratis', 'kg_correio', 'nr_decolagem', 'nr_horas_voadas',
       'kg_peso', 'nr_velocidade_media', 'nr_pax_gratis_km',
       'nr_carga_paga_km', 'nr_carga_gratis_km', 'nr_correio_km',
       'nr_bagagem_paga_km', 'nr_bagagem_gratis_km', 'nr_ask', 'nr_rpk',
       'nr_atk', 'nr_rtk', 'id_arquivo', 'nr_linha', 'dt_sistema'],
      dtype='obj

In [74]:
# Summarise the row count and column dtypes of the flights table.
voos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18051399 entries, 0 to 18051398
Data columns (total 52 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   id_basica               int64  
 1   id_empresa              int64  
 2   nr_voo                  int64  
 3   nr_singular             object 
 4   id_di                   int64  
 5   cd_di                   object 
 6   ds_di                   object 
 7   ds_grupo_di             object 
 8   dt_referencia           object 
 9   nr_semana_referencia    int64  
 10  id_tipo_linha           int64  
 11  ds_natureza_etapa       object 
 12  hr_partida_real         object 
 13  dt_partida_real         object 
 14  nr_semana_partida_real  float64
 15  id_aerodromo_origem     int64  
 16  nr_etapa                int64  
 17  hr_chegada_real         object 
 18  dt_chegada_real         object 
 19  nr_semana_chegada_real  float64
 20  id_equipamento          int64  
 21  ds_modelo               objec

In [187]:
# Count rows per id_basica and list only the values that occur more than once.
dupl = voos['id_basica'].value_counts()
dupl.loc[dupl.gt(1)]

Series([], Name: count, dtype: int64)

no duplicates here:D

## first round of cleaned data export

In [188]:
# Export the cleaned dimension tables without the pandas index.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('dataset/modified_data', exist_ok=True)

# NOTE: the files are written with to_csv's default encoding (utf-8), whereas
# the inputs were read as latin1.
dest.to_csv('dataset/modified_data/DW_ARPT_DEST.csv', index=False)
orig.to_csv('dataset/modified_data/DW_ARPT_ORIGEM.csv', index=False)
company.to_csv('dataset/modified_data/DW_EMPRESA.csv', index=False)
equip.to_csv('dataset/modified_data/DW_EQPT.csv', index=False)
line.to_csv('dataset/modified_data/DW_TIPO_LINHA.csv', index=False)
