In [1]:
import pandas as pd
import numpy as np
import sklearn
import xgboost

In [2]:
columns_name = ['FL_DATE', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 
                'CANCELLED', 'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 
                'DISTANCE']

iata_code = ['PHX', 'ABQ', 'DEN', 'SAT', 'DFW', 'IAH', 'MCI', 'MSP', 'STL', 'ORD', 'BNA', 'IND', 'ATL', 'DTW', 'JAX', 'CLT', 
             'MIA', 'PIT', 'YYZ', 'PHL', 'JFK', 'LGA', 'YUL', 'BOS', 'WAW', 'TLV', 'ETH', 'HFA', 'TLV']

columns_time = ['CRS_DEP_TIME', 'DEP_TIME']

city_iata = {'Phoenix': 'PHX', 'Albuquerque': 'ABQ','Denver': 'DEN', 'San Antonio':'SAT', 'Dallas': 'DFW', 'Houston': 'IAH', 
             'Kansas City': 'MCI', 'Minneapolis': 'MSP', 'Saint Louis': 'STL', 'Chicago': 'ORD', 'Nashville': 'BNA', 
             'Indianapolis': 'IND', 'Atlanta': 'ATL', 'Detroit': 'DTW', 'Jacksonville': 'JAX', 'Charlotte': 'CLT', 
             'Miami': 'MIA', 'Pittsburgh': 'PIT', 'Toronto': 'YYZ', 'Philadelphia': 'PHL', 'New York': 'JFK', 'Montreal': 'YUL',
             'Boston': 'BOS'}

columns = ['datetime', 'Phoenix', 'Albuquerque', 'Denver', 'San Antonio', 'Dallas', 'Houston', 'Kansas City', 'Minneapolis', 'Saint Louis', 'Chicago', 
           'Nashville', 'Indianapolis', 'Atlanta', 'Detroit', 'Jacksonville', 'Charlotte', 'Miami', 'Pittsburgh', 'Toronto', 
           'Philadelphia', 'New York', 'Montreal', 'Boston']

In [3]:
def format_time(value):
    value = str(value).replace('.0', '')[:4]
    # Преобразуем в строку и заполняем нулями слева, если нужно
    if len(value) == 1:
        return f"00:0{value}"
    elif len(value) == 2:
        return f"00:{value}"
    elif len(value) == 3:
        return f"{value[0]}:{value[1:]}"
    else:
        return f"{value[:2]}:{value[2:]}"

In [4]:
flights_2017 = pd.read_csv('./data/2017.csv', usecols=columns_name)
flights_2016 = pd.read_csv('./data/2016.csv', usecols=columns_name)
flights_2015 = pd.read_csv('./data/2015.csv', usecols=columns_name)
flights_2014 = pd.read_csv('./data/2014.csv', usecols=columns_name)
flights_2013 = pd.read_csv('./data/2013.csv', usecols=columns_name)
flights_2012 = pd.read_csv('./data/2012.csv', usecols=columns_name)

In [5]:
dataframes = [flights_2017, flights_2016, flights_2015, flights_2014, flights_2013, flights_2012]

In [6]:
humidity = pd.read_csv('./data/humidity.csv', usecols=columns)
pressure = pd.read_csv('./data/pressure.csv', usecols=columns)
temperature = pd.read_csv('./data/temperature.csv', usecols=columns)
wind_speed = pd.read_csv('./data/wind_speed.csv', usecols=columns)

In [7]:
humidity['datetime'] = pd.to_datetime(humidity['datetime'])
pressure['datetime'] = pd.to_datetime(pressure['datetime'])
temperature['datetime'] = pd.to_datetime(temperature['datetime'])
wind_speed['datetime'] = pd.to_datetime(wind_speed['datetime'])

In [8]:
humidity = humidity.melt(id_vars=["datetime"], var_name="ORIGIN", value_name="humidity")
pressure = pressure.melt(id_vars=["datetime"], var_name="ORIGIN", value_name="pressure")
temperature = temperature.melt(id_vars=["datetime"], var_name="ORIGIN", value_name="temperature")
wind_speed = wind_speed.melt(id_vars=["datetime"], var_name="ORIGIN", value_name="wind_speed")

weather = pd.concat([humidity, pressure, temperature, wind_speed], axis=1)
weather = weather.loc[:, ~weather.columns.duplicated()]
weather['ORIGIN'] = weather['ORIGIN'].map(city_iata)

In [9]:
weather['avg_humidity'] = weather['humidity'].rolling(window=3, min_periods=1).mean()
weather['avg_pressure'] = weather['pressure'].rolling(window=3, min_periods=1).mean()
weather['avg_temperature'] = weather['temperature'].rolling(window=3, min_periods=1).mean()
weather['avg_wind_speed'] = weather['wind_speed'].rolling(window=3, min_periods=1).mean()

In [10]:
def preprocess_data(df):
    df = df[(df['ORIGIN'].isin(iata_code)) & (df['DEST'].isin(iata_code))]
    #df.dropna(subset='DEP_DELAY', axis=0, inplace=True)
    df.loc[df["CANCELLED"] == 1, "DEP_DELAY"] = -999
    
    impute_column = ['DEP_TIME','CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME']
    for column in impute_column:
        df[column].fillna(0, inplace=True) 
    
    for column in columns_time:
        df[column] = df[column].apply(format_time)
        df[column] = df[column].str[:5]
    
    df['CRS_DEP_TIME'] = df['CRS_DEP_TIME'].str.replace('24:00', '00:00')
    df['DEP_TIME'] = df['DEP_TIME'].str.replace('24:00', '00:00')
   
    df['CRS_DEP_TIME'] = df['FL_DATE'] + ' ' + df['CRS_DEP_TIME'] 
    df['DEP_TIME'] = df['FL_DATE'] + ' ' + df['DEP_TIME']

    df['CRS_DEP_TIME'] = pd.to_datetime(df['CRS_DEP_TIME'])
    df['DEP_TIME'] = pd.to_datetime(df['DEP_TIME'])

    #df.dropna(inplace=True)
    
    # Объединяем по ORIGIN и DEP_TIME, берём ближайшую предшествующую влажность (не старше 3 часов)
    df_flights = pd.merge_asof(
        df.sort_values("CRS_DEP_TIME"),  # Обязательно сортируем!
        weather.sort_values("datetime"),
        left_on="CRS_DEP_TIME",
        right_on="datetime",
        by="ORIGIN",
        direction="backward",  # Берём предыдущее значение влажности
        tolerance=pd.Timedelta(hours=3)  # Ограничиваем 3 часами
    )
    
    drop_columns = ['datetime', 'humidity', 'pressure', 'temperature', 'wind_speed', 'FL_DATE', 'DIVERTED', 'DEP_TIME', 
                    'ORIGIN', 'DEST']
    
    df_flights.drop(drop_columns, axis=1, inplace=True)
    #df_flights.dropna(subset=['avg_humidity', 'avg_pressure', 'avg_temperature', 'avg_wind_speed'], inplace=True)
    
    df_flights['avg_pressure'] = round(df_flights['avg_pressure'] / 1.33322)
    df_flights['avg_temperature'] = round(df_flights['avg_temperature'] - 273.15)
    
    seasons = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'autumn'}
    # Извлекаем год, месяц, день, день недели, час, четверть
    df_flights["year"] = df_flights["CRS_DEP_TIME"].dt.year
    df_flights["month"] = df_flights["CRS_DEP_TIME"].dt.month
    df_flights["day"] = df_flights["CRS_DEP_TIME"].dt.day
    df_flights["weekday"] = df_flights["CRS_DEP_TIME"].dt.weekday  # 0 - понедельник, 6 - воскресенье
    df_flights["hour"] = df_flights["CRS_DEP_TIME"].dt.hour
    df_flights["seasons"] = df_flights["CRS_DEP_TIME"].dt.quarter.map(seasons)

    # Выделяем период суток
    df_flights["time_of_day"] = pd.cut(df_flights["hour"], bins=[-1, 5, 9, 16, 24], labels=["night", "morning", "afternoon", "evening"])

    # Дополнительно можно добавить признак, связанный с выходными днями
    df_flights["weekend"] = df_flights["weekday"].isin([5, 6])  # 5 - суббота, 6 - воскресенье
    
    return df_flights

In [11]:
processed_dfs = [preprocess_data(df) for df in dataframes]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].apply(format_time)
A value is trying to be set on

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].apply(format_time)
A value is trying to be set on

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].apply(format_time)
A value is trying to be set on

In [12]:
fligths = pd.concat(processed_dfs)
fligths = pd.get_dummies(fligths, columns=['seasons', 'time_of_day'])

In [35]:
fligths.shape

(5967084, 25)

CRS_DEP_TIME             2016-11-03 12:25:00
DEP_DELAY                               -5.0
CANCELLED                                0.0
CRS_ELAPSED_TIME                       163.0
ACTUAL_ELAPSED_TIME                    152.0
AIR_TIME                               120.0
DISTANCE                               733.0
avg_humidity                             NaN
avg_pressure                             NaN
avg_temperature                          NaN
avg_wind_speed                           NaN
year                                    2016
month                                     11
day                                        3
weekday                                    3
hour                                      12
weekend                                False
seasons_autumn                          True
seasons_spring                         False
seasons_summer                         False
seasons_winter                         False
time_of_day_night                      False
time_of_da

In [115]:
data_for_ml = fligths.set_index('CRS_DEP_TIME')
data_for_ml['DEP_DELAY'] = data_for_ml['DEP_DELAY'].astype('int32')

def making_targets():
    data_for_ml["target-delay"] = data_for_ml["DEP_DELAY"].apply(lambda x: 1 if (x >= 60) or (x == -999) else 0)

def training():
    making_targets()
    X = data_for_ml.drop(["CANCELLED", "target-delay"], axis=1)
    y_delay = data_for_ml["target-delay"]
    y_cancel = data_for_ml["CANCELLED"]
    X_train, X_test, y_delay_train, y_delay_test = sklearn.model_selection.train_test_split(X, y_delay, test_size=0.3,
                                                                                            random_state=2303)
    X_train_cancel, X_test_cancel, y_cancel_train, y_cancel_test = sklearn.model_selection.train_test_split(X, y_cancel, test_size=0.3,
                                                                                   random_state=2303)
    model_cancel = xgboost.XGBClassifier()
    model_delay = xgboost.XGBClassifier()
    model_cancel.fit(X_train, y_cancel_train)
    y_cancel_prediction = model_cancel.predict_proba(X_test)

    model_delay.fit(X_train, y_delay_train)
    y_delay_prediction = model_delay.predict_proba(X_test)

    y_cancel_prediction_сlass = model_cancel.predict(X_test)
    y_delay_prediction_class = model_delay.predict(X_test)
    
    print(f"Точность предсказания отмены: {sklearn.metrics.accuracy_score(y_cancel_test, y_cancel_prediction_сlass)}")
    print(f"Точность предсказания задержки: {sklearn.metrics.accuracy_score(y_delay_test, y_delay_prediction_class)}")

    return X_test, y_cancel_prediction, y_delay_prediction
   
X_test, y_cancel_pred, y_delay_pred = training()

Точность предсказания отмены: 0.9999871517423913
Точность предсказания задержки: 0.9987989672235362


In [116]:
y_cancel_df = pd.DataFrame(y_cancel_pred, columns=['probability_of_no_cancelД', 'probability_of_cancell'])
y_cancel_df = y_cancel_df['probability_of_cancell'] 
y_delay_df = pd.DataFrame(y_delay_pred, columns=['probability_of_no_delay', 'probability_of_delay'])
y_delay_df = y_delay_df['probability_of_delay'] 

In [117]:
final_column = ['CRS_DEP_TIME', 'AIR_TIME', 'DISTANCE', 'probability_of_cancell', 'probability_of_delay']
X_test.reset_index(inplace=True)
final_data = pd.concat([X_test, y_cancel_df, y_delay_df], axis=1)
final_data = final_data[final_column]
final_data['probability_of_cancell'] = round(final_data['probability_of_cancell'] * 100, 2)
final_data['probability_of_delay'] = round(final_data['probability_of_delay'] * 100, 2)

In [119]:
final_data[final_data['probability_of_cancell'] < 0.1].sort_values('probability_of_cancell', ascending=False)

Unnamed: 0,CRS_DEP_TIME,AIR_TIME,DISTANCE,probability_of_cancell,probability_of_delay
794219,2014-01-27 15:25:00,0.0,936.0,0.07,0.01
1511862,2013-09-30 18:40:00,32.0,94.0,0.07,0.13
536346,2017-02-07 19:35:00,32.0,184.0,0.06,1.22
350521,2014-09-26 06:00:00,0.0,733.0,0.06,0.01
1467742,2015-08-31 06:00:00,0.0,733.0,0.05,0.01
...,...,...,...,...,...
596976,2012-05-02 12:05:00,86.0,528.0,0.00,0.00
596975,2013-09-11 07:15:00,119.0,844.0,0.00,0.00
596974,2013-12-29 12:25:00,105.0,852.0,0.00,0.00
596973,2013-10-09 09:19:00,65.0,449.0,0.00,0.00


In [128]:
def convert_to_procent(column):
    final_data[column] = final_data[column].apply(lambda x: '< 0.01%' if x < 0.01 else f'{x}%')

In [129]:
convert_to_procent('probability_of_cancell')
convert_to_procent('probability_of_delay')

TypeError: '<' not supported between instances of 'str' and 'float'

In [134]:
final_data.iloc[122:162]

Unnamed: 0,CRS_DEP_TIME,AIR_TIME,DISTANCE,probability_of_cancell,probability_of_delay
122,2013-06-29 06:00:00,0.0,828.0,100.00%,100.00%
123,2014-11-11 20:32:00,92.0,733.0,< 0.01%,< 0.01%
124,2013-03-18 10:00:00,122.0,675.0,< 0.01%,< 0.01%
125,2013-12-30 19:15:00,296.0,2153.0,< 0.01%,100.00%
126,2017-11-26 14:24:00,193.0,1587.0,< 0.01%,< 0.01%
127,2016-11-30 06:00:00,96.0,502.0,< 0.01%,< 0.01%
128,2016-12-07 18:40:00,173.0,1029.0,< 0.01%,< 0.01%
129,2013-06-26 12:15:00,56.0,328.0,< 0.01%,< 0.01%
130,2013-08-11 15:20:00,103.0,731.0,< 0.01%,< 0.01%
131,2014-05-11 11:06:00,168.0,1325.0,< 0.01%,< 0.01%
