<a href="https://colab.research.google.com/github/tlsehdgns1999/tlsehdgns1999/blob/main/airline_delay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### 필요 모듈 불러오기

import pandas as pd
import numpy as np
import os
import gc

In [None]:
### 베이스 라인 코드를 빌려옴(실행속도 향상)

def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into a Parquet file in the current working directory.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name (without extension) of the Parquet file to write;
            it is also echoed in the completion message.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = f'./{save_name}.parquet'
    frame.to_parquet(parquet_path)
    # Drop the only reference and collect immediately so the raw frame
    # does not stay in memory after the conversion.
    del frame
    gc.collect()
    print(save_name, 'Done.')

train_path = '/content/drive/MyDrive/Colab Notebooks/dakondata/train.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/dakondata/test.csv'

# Convert both CSV files to Parquet first, then work from the Parquet copies.
for save_name, csv_path in (('train', train_path), ('test', test_path)):
    csv_to_parquet(csv_path, save_name)

train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')

# The first column of the submission template is used as its index.
sample_submission = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/dakondata/sample_submission.csv',
    index_col=0,
)

In [None]:
### Month 와 Date를 한 가지의 days 컬럼으로 합치는 과정 (train, test 적용)

def days(x):
    """Return the number of days that precede the first day of month ``x``.

    The table assumes a 29-day February (60 days come before March 1).

    Args:
        x: Month number, 1 through 12.

    Returns:
        The cumulative day count of all earlier months.

    Raises:
        KeyError: If ``x`` is not one of the twelve month numbers.
    """
    cumulative_days = (0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335)
    offset_by_month = dict(zip(range(1, 13), cumulative_days))
    return offset_by_month[x]

# Build a single 'Day' column (month offset + day of month) for both frames.
df_list = [train, test]
for df in df_list:
    # Each frame must use its OWN 'Month' column. Deriving the offset from
    # train['Month'] for both frames would give the test rows the months of
    # whichever training rows share their index.
    df['Day'] = df['Month'].apply(days) + df['Day_of_Month']

# Store 'Day' as object dtype so it is handled as a categorical value later on.
train = train.astype({'Day': object})
test = test.astype({'Day': object})

### Drop the columns at positions 1 and 2. Per the original note these are the
### month and date columns that 'Day' replaces -- TODO confirm the positions.

train_new = train.drop(columns=train.columns[[1, 2]])
test_new = test.drop(columns=test.columns[[1, 2]])

In [None]:
### 'Carrier_ID(DOT)' is treated as being in one-to-one correspondence with 'Airline'.
### Where such a counterpart exists, use it to fill in the missing value.

carrier_by_airline = train_new.groupby('Airline')['Carrier_ID(DOT)']
# For every airline, take the carrier id that occurs most often.
carrier_code_mapping = carrier_by_airline.apply(lambda codes: codes.mode().iloc[0])
inferred_carrier = train_new['Airline'].map(carrier_code_mapping)
train_new['Carrier_ID(DOT)'] = train_new['Carrier_ID(DOT)'].fillna(inferred_carrier)

In [None]:
# Reverse direction: for every carrier id, take the airline that occurs most
# often and use it to fill missing 'Airline' values.
airline_by_carrier = train_new.groupby('Carrier_ID(DOT)')['Airline']
airline_code_mapping = airline_by_carrier.apply(lambda names: names.mode().iloc[0])
inferred_airline = train_new['Carrier_ID(DOT)'].map(airline_code_mapping)
train_new['Airline'] = train_new['Airline'].fillna(inferred_airline)

In [None]:
### Missing values left in 'Carrier_ID(DOT)' and 'Airline' after the cross-fill

train_new.isna().sum()

In [None]:
### This is the training set, so rows whose carrier id is still missing are simply removed

train_new = train_new.dropna(subset=['Carrier_ID(DOT)'])

In [None]:
# Per-column count of missing values after dropping the rows above.
train_new.isna().sum()

In [None]:
### The test set is handled the same way, but no row may be removed from it,
### so whatever remains missing afterwards has to be replaced by the most frequent value

test_carrier_by_airline = test_new.groupby('Airline')['Carrier_ID(DOT)']
carrier_code_mapping2 = test_carrier_by_airline.apply(lambda codes: codes.mode().iloc[0])
inferred_test_carrier = test_new['Airline'].map(carrier_code_mapping2)
test_new['Carrier_ID(DOT)'] = test_new['Carrier_ID(DOT)'].fillna(inferred_test_carrier)

In [None]:
# Reverse direction for the test set: most frequent airline per carrier id.
test_airline_by_carrier = test_new.groupby('Carrier_ID(DOT)')['Airline']
airline_code_mapping2 = test_airline_by_carrier.apply(lambda names: names.mode().iloc[0])
inferred_test_airline = test_new['Carrier_ID(DOT)'].map(airline_code_mapping2)
test_new['Airline'] = test_new['Airline'].fillna(inferred_test_airline)

In [None]:
# Per-column count of missing values in the test frame after the cross-fill.
test_new.isna().sum()

In [None]:
from scipy.stats import mode

### Replace what is still missing with the column-wide most frequent value
for col in ('Carrier_ID(DOT)', 'Airline'):
    mode_value = test_new[col].mode().iloc[0]
    test_new[col] = test_new[col].fillna(mode_value)

In [None]:
# Per-column count of missing values in the test frame after the mode fill.
test_new.isna().sum()

In [None]:
### Remove the leftover columns (redundant information)

col_drop = [
    'Cancelled',
    'Diverted',
    'Origin_Airport',
    'Destination_Airport',
    'Carrier_Code(IATA)',
    'Airline',
    'Origin_State',
    'Destination_State',
]
train_new = train_new.drop(columns=col_drop)
test_new = test_new.drop(columns=col_drop)

In [None]:
# Print the column summary of the training frame after the column drop.
train_new.info()

In [None]:
# Print the column summary of the test frame after the column drop.
test_new.info()

In [None]:
# Preview the first rows of the training frame.
train_new.head()

In [None]:
### 이 작업은 대강의 분 차이를 이용해 출발 또는 도착의 시간을 예상해 결측치를 채워넣는 방법이다.

### 먼저 'HHMM' 를 분으로 변환

def con_to_minutes(x):
    """Convert a clock value written as HHMM into minutes after midnight.

    Args:
        x: Numeric HHMM value (for example 1230 for 12:30); may be NaN.

    Returns:
        ``hours * 60 + minutes`` as an int, or ``np.nan`` when ``x`` is NaN.
    """
    if np.isnan(x):
        return np.nan

    digits = str(int(x))
    # The last two characters are the minutes; anything in front of them is
    # the hour. Values with at most two characters have no hour part.
    minutes = int(digits[-2:])
    hours = int(digits[:-2]) if len(digits) > 2 else 0
    return hours * 60 + minutes

### Convert 'Estimated_Departure_Time' and 'Estimated_Arrival_Time' from HHMM to minutes.
### The test frame gets the same conversion as the training frame: if only the
### training columns were converted, the model would be fitted on minutes
### but asked to predict on raw HHMM values.

time_cols = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']
for frame in (train_new, test_new):
    for col in time_cols:
        frame[col] = frame[col].apply(con_to_minutes)

In [None]:
# Remove rows where both scheduled times are missing; a row is kept as long as
# at least one of the two is present.
train_new = train_new.dropna(
    subset=['Estimated_Arrival_Time', 'Estimated_Departure_Time'],
    how='all',
)

In [None]:
# Print the column summary again after removing rows that lack both times.
train_new.info()

In [None]:
### Build a dictionary keyed by (Origin_Airport_ID, Destination_Airport_ID)
### whose value is the mean flight time for that route

from collections import defaultdict

time_flying = defaultdict(int)  # route -> summed duration, turned into a mean below
time_number = defaultdict(int)  # route -> number of rows that contributed

# Only rows with both scheduled times present can contribute a duration.
cond = train_new['Estimated_Arrival_Time'].notnull() & train_new['Estimated_Departure_Time'].notnull()

for _, row in train_new[cond].iterrows():
    route = (row['Origin_Airport_ID'], row['Destination_Airport_ID'])
    # The modulo keeps the duration non-negative when the arrival value is
    # smaller than the departure value (arrival after midnight).
    time_flying[route] += (row['Estimated_Arrival_Time'] - row['Estimated_Departure_Time']) % 1440
    time_number[route] += 1

# Turn each per-route sum into a mean.
for route, count in time_number.items():
    time_flying[route] /= count

In [None]:
### Fill in missing values

# Iterates over the rows in which the origin or the destination airport id is missing.
# NOTE(review): time_flying is keyed by (origin, dest) tuples where it is built,
# so testing a single airport id with `in time_flying` looks like it can never
# be true -- this cell appears to be a no-op. Confirm the intended lookup.
for idx, row in train_new[train_new['Origin_Airport_ID'].isnull() | train_new['Destination_Airport_ID'].isnull()].iterrows():
    origin, dest = row['Origin_Airport_ID'], row['Destination_Airport_ID']
    if origin in time_flying and not pd.isnull(origin):
        # Writes back the same value that was just read from this row.
        train_new.at[idx, 'Origin_Airport_ID'] = origin
    if dest in time_flying and not pd.isnull(dest):
        # Writes back the same value that was just read from this row.
        train_new.loc[idx, 'Destination_Airport_ID'] = dest
        # Would create or update a 'Flying_Time' column with the route's mean duration.
        train_new.loc[idx, 'Flying_Time'] = time_flying[(origin, dest)]

In [None]:
### Use the mean route duration (arrival minus departure) to fill whichever of
### the two scheduled times is missing for a row


def _departure_from_arrival(row):
    """Return the arrival time minus the route's mean duration, wrapped to one day."""
    route = (row['Origin_Airport_ID'], row['Destination_Airport_ID'])
    return (row['Estimated_Arrival_Time'] - time_flying[route]) % 1440


def _arrival_from_departure(row):
    """Return the departure time plus the route's mean duration, wrapped to one day."""
    route = (row['Origin_Airport_ID'], row['Destination_Airport_ID'])
    return (row['Estimated_Departure_Time'] + time_flying[route]) % 1440


mask_dep = train_new['Estimated_Departure_Time'].isnull()
mask_arr = train_new['Estimated_Arrival_Time'].isnull()

# time_flying is a defaultdict(int): a route it has never seen yields 0, so the
# missing time then simply equals the known one.
train_new.loc[mask_dep, 'Estimated_Departure_Time'] = train_new.loc[mask_dep].apply(_departure_from_arrival, axis=1)
train_new.loc[mask_arr, 'Estimated_Arrival_Time'] = train_new.loc[mask_arr].apply(_arrival_from_departure, axis=1)


In [None]:
### Replace missing scheduled times in the test set with each column's most frequent value

mode_dep = test_new['Estimated_Departure_Time'].mode().iloc[0]
mode_arr = test_new['Estimated_Arrival_Time'].mode().iloc[0]

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form is chained assignment, which pandas
# deprecates and which leaves test_new unchanged under copy-on-write.
test_new['Estimated_Departure_Time'] = test_new['Estimated_Departure_Time'].fillna(mode_dep)
test_new['Estimated_Arrival_Time'] = test_new['Estimated_Arrival_Time'].fillna(mode_arr)

In [None]:
### Remove every row that still has a NaN in any column. Per the original note
### this targets rows with a missing 'Delay' label (only labelled data is used).

train_new = train_new.dropna(axis=0, how='any')

In [None]:
### Encode the label as an integer: 'Delayed' -> 1, 'Not_Delayed' -> 0.
### Explicit replacements are used instead of astype to keep the mapping obvious.

delay_labels = train_new['Delay']
train_new['Delay'] = delay_labels.replace('Delayed', 1).replace('Not_Delayed', 0)

In [None]:
### The training set is now much smaller than the test set, so a validation split is created

from sklearn.model_selection import train_test_split


y = train_new['Delay']
X = train_new.drop(columns=['ID', 'Delay'])
test_val = test_new.drop(columns=['ID'])

# Hold out 20% for validation, stratified on the label.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
)

In [None]:
### Convert the categorical variables to integer codes for XGBoost training

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

q_col = ['Tail_Number', 'Day']


def _encode_with_unseen(encoder, values):
    """Append labels the encoder has not seen to its classes, then transform ``values``."""
    for label in np.unique(values):
        if label not in encoder.classes_:
            encoder.classes_ = np.append(encoder.classes_, label)
    return encoder.transform(values)


for i in q_col:
    # The encoder is fitted on the training split only; labels that first
    # appear in the validation or test data are appended to its classes.
    le = LabelEncoder()
    le = le.fit(X_train[i])
    X_train[i] = le.transform(X_train[i])
    X_valid[i] = _encode_with_unseen(le, X_valid[i])
    test_new[i] = _encode_with_unseen(le, test_new[i])
print('Done.')

In [None]:
### Modeling

# Hyper-parameters collected in one place and passed on to the classifier.
xgb_params = {
    'booster': 'gbtree',
    'scale_pos_weight': 5,
    'learning_rate': 0.01,
    'colsample_bytree': 0.7,
    'subsample': 0.5,
    'max_delta_step': 3,
    'reg_lambda': 2,
    'objective': 'binary:logistic',
    'n_estimators': 818,
    'max_depth': 8,
}
model = XGBClassifier(**xgb_params)

In [None]:
%%time
# Validation data that is monitored during training.
eval_set = [(X_valid, y_valid)]
eval_metric = ["logloss"]
# Early stopping and the evaluation metric are configured on the estimator
# rather than passed to fit(): XGBoost deprecated the fit() arguments in 1.6
# and removed them in 2.0, where passing them raises a TypeError.
model.set_params(early_stopping_rounds=50, eval_metric=eval_metric)
model.fit(X_train, y_train, eval_set=eval_set)

In [None]:
# Class probabilities for the validation split; the bare name displays them in the notebook.
predictions = model.predict_proba(X_valid)
predictions

In [None]:
# test_new still carries the 'ID' column, which was removed from the training
# features, so drop it here to give the model the feature set it was fitted on.
predictions = model.predict_proba(test_new.drop(columns=['ID']))
predictions

In [None]:
# Reload the submission template; its first column is used as the index.
sample_submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dakondata/sample_submission.csv', index_col = 0)

In [None]:
# Wrap the predicted probabilities in a frame that reuses the template's index and column names.
# NOTE(review): assumes the template's columns are in the same order as the
# probability columns returned by predict_proba -- TODO confirm.
submission = pd.DataFrame(data=predictions, columns=sample_submission.columns, index=sample_submission.index)

In [None]:
# NOTE(review): this writes to the same path the submission template is read
# from, so the template file is overwritten -- confirm that this is intended.
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/dakondata/sample_submission.csv', index=True)