In [None]:
# Mount Google Drive under /content/drive (Colab only); the dataset is read from there later.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Local Spark session named "Colab"; the Spark UI port is pinned to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

https://drive.google.com/file/d/1uzIrvqLTBSYrcB2XmJZD7bbdNjuAvFY_/view?usp=sharing


In [None]:
# Load the flights CSV from the mounted Drive, using the first row as column names.
# No schema inference is requested; selected columns are cast to integers in a later cell.
df = spark.read.option("header", True).csv('/content/drive/My Drive/ESGI/projet/flights.csv')

In [None]:
 # Print the first 10 rows of the loaded DataFrame.
 df.show(10)

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

# Cast

In [None]:
from pyspark.sql.functions import col, avg, sum,count
from pyspark.sql.types import IntegerType

# Columns to convert to integers; every other column keeps the type it was loaded with.
cols_to_cast = [
    "DEPARTURE_DELAY",
    "TAXI_OUT",
    "ELAPSED_TIME",
    "AIR_TIME",
    "DISTANCE",
    "TAXI_IN",
    "ARRIVAL_DELAY",
    "AIR_SYSTEM_DELAY",
    "SECURITY_DELAY",
    "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "WEATHER_DELAY",
]

# Rebuild the frame column by column, preserving the original column order.
df = df.select([
    col(name).cast(IntegerType()) if name in cols_to_cast else col(name)
    for name in df.columns
])


# Analysis

In [None]:
import plotly.express as px

In [None]:

# Number of cancelled flights per airline, computed on the first 100,000 rows only
# (same sample size as before, so the chart shows the same figures).
#
# The count is aggregated in Spark, so only one row per airline reaches pandas.
# CANCELLED is cast to int before the comparison because the column is a string right
# after loading but an int once the cancellation-rate cell has run; the cast keeps this
# cell correct in both states.
cancelled_counts = (
    df.limit(100000)
    .groupBy("AIRLINE")
    .agg(
        sum((col("CANCELLED").cast("int") == 1).cast("int")).alias("CANCELLED_COUNT")
    )
    .orderBy("AIRLINE")  # keep the alphabetical bar order of the pandas groupby
    .toPandas()
)

fig = px.bar(cancelled_counts, x='AIRLINE', y='CANCELLED_COUNT')
fig.show()

In [None]:
# Number of diverted flights per airline on the first 100,000 rows.
# DIVERTED is compared against the string '1'.
diverted_sample = df.limit(100000).toPandas()
diverted_counts = (
    diverted_sample
    .groupby('AIRLINE')['DIVERTED']
    .apply(lambda flags: (flags == '1').sum())
    .reset_index(name='DIVERTED_COUNT')
)

fig = px.bar(diverted_counts, x='AIRLINE', y='DIVERTED_COUNT')
fig.show()

In [None]:
# Mean arrival delay per airline: aggregate in Spark, then collect the result to pandas.
avg_delay_df = df.groupBy("AIRLINE").agg(
    avg("ARRIVAL_DELAY").alias("avg_arrival_delay")
)
avg_delay_pd = avg_delay_df.toPandas()

# Bars are ordered from the largest mean delay to the smallest.
fig = px.bar(
    avg_delay_pd.sort_values(by="avg_arrival_delay", ascending=False),
    title="Retard moyen à l'arrivée par compagnie",
    x="AIRLINE",
    y="avg_arrival_delay",
    labels={"avg_arrival_delay": "Retard moyen (min)", "AIRLINE": "Compagnie"},
)
fig.show()

In [None]:
# Mean arrival delay for each value of DAY_OF_WEEK, collected to pandas for plotting.
df_delay_day = (
    df.groupBy("DAY_OF_WEEK")
    .agg(avg("ARRIVAL_DELAY").alias("avg_arrival_delay"))
    .toPandas()
)

# Sort by DAY_OF_WEEK so the bars follow the day order on the x axis.
fig = px.bar(
    df_delay_day.sort_values("DAY_OF_WEEK"),
    title="Retard moyen par jour de la semaine",
    x="DAY_OF_WEEK",
    y="avg_arrival_delay",
)
fig.show()

In [None]:
# Mean departure delay per origin airport.
avg_delay_origin = (
    df.groupBy("ORIGIN_AIRPORT")
    .agg(avg("DEPARTURE_DELAY").alias("avg_dep_delay"))
)

# Keep the ten airports with the highest mean departure delay.
plot3 = (
    avg_delay_origin.toPandas()
    .sort_values("avg_dep_delay", ascending=False)
    .head(10)
)

fig3 = px.bar(
    plot3,
    title="Top 10 aéroports les plus retardés au départ",
    x="ORIGIN_AIRPORT",
    y="avg_dep_delay",
    labels={"ORIGIN_AIRPORT": "Aéroport", "avg_dep_delay": "Retard moyen (min)"},
)
fig3.show()

In [None]:
# Make CANCELLED an integer column so it can be summed.
df = df.withColumn("CANCELLED", col("CANCELLED").cast("int"))

# Cancellation rate per airline: sum of CANCELLED divided by the number of rows.
# `sum` and `count` here are the PySpark aggregate functions imported earlier.
cancel_rate = (
    df.groupBy("AIRLINE")
    .agg((sum("CANCELLED") / count("*")).alias("cancel_rate"))
)
plot4 = cancel_rate.toPandas()

fig4 = px.bar(
    plot4.sort_values("cancel_rate", ascending=False),
    title="Taux d’annulation par compagnie",
    x="AIRLINE",
    y="cancel_rate",
    labels={"AIRLINE": "Compagnie", "cancel_rate": "Taux d’annulation"},
)
fig4.show()

In [None]:
# Mean arrival delay for every (origin airport, day of week) pair.
delay_grouped = (
    df.groupBy("ORIGIN_AIRPORT", "DAY_OF_WEEK")
    .agg(avg("ARRIVAL_DELAY").alias("avg_arrival_delay"))
    .toPandas()
)

# Grouped bars: one cluster per airport, one coloured bar per day of the week.
fig = px.bar(
    delay_grouped,
    title="Retard moyen à l’arrivée par aéroport et par jour de la semaine",
    x="ORIGIN_AIRPORT",
    y="avg_arrival_delay",
    color="DAY_OF_WEEK",
    color_discrete_sequence=px.colors.qualitative.Set2,
    barmode="group",
    labels={
        "ORIGIN_AIRPORT": "Aéroport d'origine",
        "avg_arrival_delay": "Retard moyen (min)",
        "DAY_OF_WEEK": "Jour de la semaine",
    },
)

# Tilt the x tick labels so the airport codes overlap less.
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [None]:
# Mean arrival delay for every (airline, day of week) pair.
delay_grouped = df.groupBy("AIRLINE", "DAY_OF_WEEK") \
    .agg(avg("ARRIVAL_DELAY").alias("avg_arrival_delay")) \
    .toPandas()


# Grouped bars: one cluster per airline, one coloured bar per day of the week.
fig = px.bar(
    delay_grouped,
    x="AIRLINE",
    y="avg_arrival_delay",
    color="DAY_OF_WEEK",
    barmode="group",
    # Title typo fixed ("compagine" -> "compagnie").
    title="Retard moyen à l’arrivée par compagnie et par jour de la semaine",
    labels={
        # The x axis shows airlines, so it is labelled "Compagnie" (it previously
        # carried the "Aéroport d'origine" label copied from the airport chart).
        "AIRLINE": "Compagnie",
        "avg_arrival_delay": "Retard moyen (min)",
        "DAY_OF_WEEK": "Jour de la semaine"
    },
    color_discrete_sequence=px.colors.qualitative.Set2
)

# Tilt the x tick labels for readability.
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# Training

In [None]:
! pip install tensorflow



In [None]:
# Input feature columns for the model. Despite the name, the last three entries
# (AIRLINE, ORIGIN_AIRPORT, DESTINATION_AIRPORT) are categorical and are mapped to
# integer codes in the preprocessing cell before scaling.
# NOTE(review): ARRIVAL_TIME together with SCHEDULED_ARRIVAL presumably reveals the
# arrival delay that the model is asked to predict — confirm whether this is intended.
numeric_feature_names = ['SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']

In [None]:
# Build the feature matrix X and the target matrix y from the SAME rows.
#
# Features and targets used to be filtered with two independent `dropna().limit()`
# pipelines, so row i of X and row i of y could belong to different flights. Here both
# sets of columns are selected together, filtered once and sampled once, then split.
target_names = ['DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY']

# Drop every flight that has a null in any feature or target column.
# NOTE(review): cancelled/diverted flights presumably lack arrival values and may all be
# removed by this filter — check the class balance of y before training.
df_cleaned = df.select(numeric_feature_names + target_names).dropna()

# Cap on the number of rows collected to the driver as pandas.
MAX_SIZE = 10000

df_limited = df_cleaned.limit(MAX_SIZE)
sample_pd = df_limited.toPandas()

# Copies, so later column assignments do not write into views of sample_pd.
X = sample_pd[numeric_feature_names].copy()
y = sample_pd[target_names].copy()

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Preprocess the split data: integer-encode the categorical features, binarise the
# arrival delay, convert the targets to a numeric array and standardise the inputs.
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']

# Work on copies: assigning columns into the frames returned by train_test_split may
# trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# The loop variable is deliberately not named `col`: that name is the PySpark column
# function imported earlier, and rebinding it to a string breaks every Spark cell that
# is re-run afterwards.
for column_name in categorical_columns:
    # The code table is built from all of X, so every value seen in either split has a
    # code; -1 is the fallback for a value missing from the table.
    unique_values = X[column_name].unique()
    mapping = {val: idx for idx, val in enumerate(unique_values)}
    X_train[column_name] = X_train[column_name].map(mapping).fillna(-1).astype(int)
    X_test[column_name] = X_test[column_name].map(mapping).fillna(-1).astype(int)

# Turn ARRIVAL_DELAY into a binary label: 1 when the flight arrived late (> 0), else 0.
y_train = y_train.assign(
    ARRIVAL_DELAY=(y_train['ARRIVAL_DELAY'].astype(float) > 0).astype(int)
)
y_test = y_test.assign(
    ARRIVAL_DELAY=(y_test['ARRIVAL_DELAY'].astype(float) > 0).astype(int)
)

# Force a single numeric dtype. Columns that are still strings would otherwise make
# to_numpy() return an object array, which Keras rejects with
# "ValueError: Invalid dtype: object".
y_train = y_train.astype('float32').to_numpy()
y_test = y_test.astype('float32').to_numpy()

# Standardise the features. The scaler is fitted on the training split only and then
# applied to the test split. It has to be created here: no earlier cell defines it.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.astype('float64'))
X_test = scaler.transform(X_test.astype('float64'))

In [None]:
# Small fully connected network with three sigmoid outputs, one per target column.
# The input is declared with tf.keras.Input(shape=...), because the `input_shape`
# argument of InputLayer is deprecated in current Keras releases.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='sigmoid')
])



In [None]:
# Binary cross-entropy is applied to each sigmoid output; Adam is the optimiser.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs in mini-batches of 32, validating on the held-out split after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test, y_test),
)


ValueError: Invalid dtype: object

In [None]:
# Save the trained model to model.h5 in the working directory.
model.save('model.h5')
# Colab helper that sends the saved file to the browser as a download.
from google.colab import files
files.download('model.h5')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>