In [None]:
import pandas as pd 
import pandas_profiling as pp
import plotly.express as px 
import sqlite3

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

## Import des datasets 

In [None]:
# Tables from the train_data directory. The files carry a ".gzip" extension
# but are read with the parquet reader.
df_aeroports = pd.read_parquet("../data/processed/train_data/aeroports.gzip")
df_compagnies = pd.read_parquet("../data/processed/train_data/compagnies.gzip")
df_vols = pd.read_parquet("../data/processed/train_data/vols.gzip")
df_fuel = pd.read_parquet("../data/processed/train_data/prix_fuel.gzip")
# Flights table from the test_data directory.
df_test = pd.read_parquet("../data/processed/test_data/vols.gzip")

In [None]:
df_aeroports.head()

In [None]:
# Notched box plot of the "PRIX RETARD PREMIERE 20 MINUTES" column of df_aeroports.
# A title is set so the figure can be read on its own when the notebook is skimmed.
fig = px.box(
    df_aeroports,
    y="PRIX RETARD PREMIERE 20 MINUTES",
    notched=True,
    title="Distribution de PRIX RETARD PREMIERE 20 MINUTES (aéroports)",
)
fig.show()

In [None]:
# Notched box plot of the "PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES"
# column of df_aeroports (the column name is kept exactly as it is in the data).
# A title is set so the figure can be read on its own when the notebook is skimmed.
fig = px.box(
    df_aeroports,
    y="PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES",
    notched=True,
    title="Distribution de PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES (aéroports)",
)
fig.show()

In [None]:
df_compagnies.head()

In [None]:
df_compagnies['COMPAGNIE'].value_counts()

In [None]:
df_fuel.head()

In [None]:
df_fuel.nunique()

In [None]:
# Histogram of the "PRIX DU BARIL" column of df_fuel.
# A title is set so the figure can be read on its own when the notebook is skimmed.
fig = px.histogram(
    df_fuel,
    x="PRIX DU BARIL",
    title="Distribution de PRIX DU BARIL",
)
fig.show()

**Le prix du baril ne prend qu'une seule valeur : ce dataset n'est donc pas pertinent à utiliser. Les deux tables "AEROPORTS" et "COMPAGNIES" ne seront pas utilisées dans le pre-processing ; elles serviront plutôt après la prédiction des retards, pour évaluer l'impact financier des retards prévisibles sur le CA des compagnies aériennes pendant la période couverte par les vols du dataset de test.**

In [None]:
df_vols.head()

In [None]:
df_vols.iloc[1,:]

In [None]:
# Rows of df_vols whose ANNULATION value differs from 0; display the second such row.
vols_avec_annulation = df_vols.loc[df_vols["ANNULATION"].ne(0)]
vols_avec_annulation.iloc[1]

In [None]:
# Rows of df_vols whose DETOURNEMENT value differs from 0; display the fourth such row.
vols_avec_detournement = df_vols.loc[df_vols["DETOURNEMENT"].ne(0)]
vols_avec_detournement.iloc[3]

In [None]:
df_vols.nunique()

In [None]:
df_test.nunique()

In [None]:
# Print the distinct values of NIVEAU DE SECURITE, first for the training
# flights and then for the test flights.
for flights in (df_vols, df_test):
    print(flights["NIVEAU DE SECURITE"].unique())

La colonne "NIVEAU DE SECURITE" ne prend qu'une seule valeur ; elle n'apporte donc aucune information et n'est pas pertinente.

In [None]:
# Remove the NIVEAU DE SECURITE column from the training flights.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df_vols = df_vols.drop(columns=["NIVEAU DE SECURITE"], errors="ignore")

In [None]:
# Preview the flights where both the arrival delay and the departure delay are
# strictly positive (column names are used exactly as they appear in the data).
df_vols.loc[
    df_vols["RETARD A L'ARRIVEE"].gt(0) & df_vols["RETART DE DEPART"].gt(0)
].head()

In [None]:
# Columns of the training flights table that do not exist in the test table
# (Index.difference: labels of the first index that are absent from the second).
missing_columns_in_test_df = df_vols.columns.difference(df_test.columns)
missing_columns_in_test_df

In [None]:
# Feature frame: the training flights without the columns that the test table lacks.
df_vols_train = df_vols.drop(missing_columns_in_test_df, axis="columns")
df_vols_train.head()

On vérifie les valeurs manquantes dans les colonnes : 

In [None]:
# For every column holding at least one missing value, print its name, the
# number of missing entries and the fraction of rows they represent.
# The null counts are computed once for the whole frame.
nb_lignes = df_vols_train.shape[0]
for column, nb_manquants in df_vols_train.isnull().sum().items():
    if nb_manquants > 0:
        print(column, nb_manquants, nb_manquants / nb_lignes)

Vu le pourcentage assez bas des valeurs manquantes, une suggestion est de supprimer les lignes correspondantes : 

In [None]:
# Remove every row that has a missing value in at least one column, then show
# the resulting (rows, columns) shape.
df_vols_train = df_vols_train.dropna(axis="index", how="any")
df_vols_train.shape

In [None]:
# Index labels that exist in df_vols but not in df_vols_train
# (Index.difference: labels of the first index that are absent from the second).
indexes_deleted = df_vols.index.difference(df_vols_train.index)
indexes_deleted

In [None]:
# Keep in df_vols only the rows whose index label is still present in
# df_vols_train. On a first run this removes exactly the labels listed in
# `indexes_deleted`. Unlike dropping a previously computed label list, it stays
# correct when the cell is re-executed after the indexes have been reset: both
# frames then share the same labels and nothing is removed.
df_vols = df_vols.loc[df_vols.index.isin(df_vols_train.index)]

In [None]:
df_vols.shape

In [None]:
# Renumber both frames from 0 so that row i of df_vols and row i of
# df_vols_train refer to the same flight.
df_vols = df_vols.reset_index(drop=True)
df_vols_train = df_vols_train.reset_index(drop=True)
# The target columns are later read from df_vols by position, so both frames
# must have the same number of rows.
assert len(df_vols) == len(df_vols_train), (
    "df_vols and df_vols_train are no longer row-aligned"
)

In [None]:
df_vols_train.head()

In [None]:
def formatter_date(x):
    """Convert a clock time written as digits (e.g. "530" for 05:30) to a Timedelta.

    The last two digits are read as minutes and everything before them as
    hours; the value is left-padded with zeros to at least four digits first.

    Parameters
    ----------
    x : str or number
        Time such as "5", "530", "1530". A trailing ".0" (produced when a float
        column is cast to str, e.g. "1530.0") is ignored.

    Returns
    -------
    pandas.Timedelta
        Duration elapsed since midnight.
    """
    text = str(x).strip()
    # A float column cast with astype(str) yields "1530.0"; without this step
    # the slicing below would treat ".0" as the minutes.
    if text.endswith(".0"):
        text = text[:-2]
    text = text.zfill(4)
    return pd.to_timedelta(text[:-2] + ':' + text[-2:] + ':00')

In [None]:
# Replace the scheduled arrival time by the Timedelta built by formatter_date
# from its string form.
df_vols_train["ARRIVEE PROGRAMMEE"] = (
    df_vols_train["ARRIVEE PROGRAMMEE"].astype(str).apply(formatter_date)
)
df_vols_train.head()

In [None]:
# Replace the scheduled departure time by the Timedelta built by formatter_date
# from its string form.
df_vols_train["DEPART PROGRAMME"] = (
    df_vols_train["DEPART PROGRAMME"].astype(str).apply(formatter_date)
)
df_vols_train.head()

In [None]:
df_vols_train.dtypes

In [None]:
# Target frame: the columns of the training flights that the test table lacks.
target = df_vols.loc[:, missing_columns_in_test_df]

In [None]:
target.head()

In [None]:
# Renumber the target rows from 0, discarding the previous index.
target = target.reset_index(drop=True)

In [None]:
df_vols_train.head()

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardise TEMPS PROGRAMME with a StandardScaler fitted on the training
# column; the fitted scaler stays available as `scaler_temps_programme`.
temps_programme_2d = df_vols_train['TEMPS PROGRAMME'].to_numpy().reshape(-1, 1)
scaler_temps_programme = StandardScaler().fit(temps_programme_2d)
df_vols_train['TEMPS PROGRAMME'] = scaler_temps_programme.transform(temps_programme_2d)
df_vols_train.head()

In [None]:
# Standardise DISTANCE with a StandardScaler fitted on the training column;
# the fitted scaler stays available as `scaler_distance`.
distance_2d = df_vols_train['DISTANCE'].to_numpy().reshape(-1, 1)
scaler_distance = StandardScaler().fit(distance_2d)
df_vols_train['DISTANCE'] = scaler_distance.transform(distance_2d)
df_vols_train.head()

In [None]:
# Standardise TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE.
# The fitted scaler gets its own name so it is not lost when the generic name
# `scaler` is reassigned, and can be reused to transform other data (e.g. the
# test set) with the same statistics.
roulage_decollage_2d = np.array(
    df_vols_train['TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE']
).reshape(-1, 1)
scaler_roulage_decollage = StandardScaler().fit(roulage_decollage_2d)
scaler = scaler_roulage_decollage  # alias kept for code that still reads `scaler`
df_vols_train['TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE'] = (
    scaler_roulage_decollage.transform(roulage_decollage_2d)
)
df_vols_train.head()

In [None]:
# Standardise TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE.
# The fitted scaler gets its own name so it is not lost when the generic name
# `scaler` is reassigned, and can be reused to transform other data (e.g. the
# test set) with the same statistics.
roulage_atterrissage_2d = np.array(
    df_vols_train["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"]
).reshape(-1, 1)
scaler_roulage_atterrissage = StandardScaler().fit(roulage_atterrissage_2d)
scaler = scaler_roulage_atterrissage  # alias kept for code that still reads `scaler`
df_vols_train["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"] = (
    scaler_roulage_atterrissage.transform(roulage_atterrissage_2d)
)
df_vols_train.head()

In [None]:
# Standardise NOMBRE DE PASSAGERS.
# The fitted scaler gets its own name, consistent with scaler_temps_programme
# and scaler_distance, so it stays available if `scaler` is reassigned.
passagers_2d = np.array(df_vols_train["NOMBRE DE PASSAGERS"]).reshape(-1, 1)
scaler_passagers = StandardScaler().fit(passagers_2d)
scaler = scaler_passagers  # alias kept for code that still reads `scaler`
df_vols_train["NOMBRE DE PASSAGERS"] = scaler_passagers.transform(passagers_2d)
df_vols_train.head()

In [None]:
df_vols_train.nunique()

In [None]:
# Day of the week of each flight date. pandas numbers Monday as 0, so adding 1
# gives values from 1 (Monday) to 7 (Sunday).
df_vols_train['DAY OF THE WEEK'] = df_vols_train['DATE'].dt.weekday + 1

In [None]:
df_vols_train.head()

In [None]:
def check_weekend(x):
    """Return 1 when the day number `x` is greater than 5, otherwise 0."""
    if x > 5:
        return 1
    return 0
    
# Flag each row with 1 when its DAY OF THE WEEK value is above 5, else 0.
df_vols_train['WEEKEND'] = df_vols_train['DAY OF THE WEEK'].apply(check_weekend)

In [None]:
df_vols_train.head()

In [None]:
# Month number extracted from the DATE column through the .dt accessor.
df_vols_train['MONTH'] = df_vols_train['DATE'].dt.month

In [None]:
df_vols_train.head()

In [None]:
# Day-of-month number extracted from the DATE column through the .dt accessor.
df_vols_train['DAY OF THE MONTH'] = df_vols_train['DATE'].dt.day

In [None]:
df_vols_train.head()