
# 필요 라이브러리 구축 


In [None]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
import xgboost as xgb

from sklearn.calibration import CalibratedClassifierCV

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from xgboost import XGBClassifier

from sklearn.ensemble import VotingClassifier

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
from lightgbm import LGBMClassifier

In [None]:
import seaborn as sns

In [None]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [None]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [None]:
csv_to_parquet('/content/drive/MyDrive/dacon/train.csv', 'train')
csv_to_parquet('/content/drive/MyDrive/dacon/test.csv', 'test')

train Done.
test Done.


In [None]:
train = pd.read_parquet('/content/train.parquet')
test = pd.read_parquet('/content/test.parquet')
sample_submission = pd.read_csv('/content/drive/MyDrive/dacon/sample_submission.csv', index_col = 0)

##EDA

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 19 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Cancelled                 1000000 non-null  int64  
 6   Diverted                  1000000 non-null  int64  
 7   Origin_Airport            1000000 non-null  object 
 8   Origin_Airport_ID         1000000 non-null  int64  
 9   Origin_State              890985 non-null   object 
 10  Destination_Airport       1000000 non-null  object 
 11  Destination_Airport_ID    1000000 non-null  int64  
 12  Destination_State         890921 non-null   object 
 13  Distance                  10

In [None]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 18 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  891016 non-null   float64
 4   Estimated_Arrival_Time    890952 non-null   float64
 5   Cancelled                 1000000 non-null  int64  
 6   Diverted                  1000000 non-null  int64  
 7   Origin_Airport            1000000 non-null  object 
 8   Origin_Airport_ID         1000000 non-null  int64  
 9   Origin_State              893495 non-null   object 
 10  Destination_Airport       1000000 non-null  object 
 11  Destination_Airport_ID    1000000 non-null  int64  
 12  Destination_State         893477 non-null   object 
 13  Distance                  10

In [None]:
# 누락값이 존재하는 행들 
['Estimated_Departure_Time ',  'Estimated_Arrival_Time', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']

['Estimated_Departure_Time ',
 'Estimated_Arrival_Time',
 'Origin_State',
 'Destination_State',
 'Airline',
 'Carrier_Code(IATA)',
 'Carrier_ID(DOT)',
 'Tail_Number']

In [None]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,,PHL,14100,,678.0,United Air Lines Inc.,UA,19977.0,N477UA,
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,,MSP,13487,Minnesota,223.0,SkyWest Airlines Inc.,DL,,N439SW,
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,,HOU,12191,Texas,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,Tennessee,ATL,10397,,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,


In [None]:
train['Cancelled'].unique()

#취소는 0밖에 없다는 것을 확인 할 수 있었다. 제외 하자 
del train['Cancelled']

In [None]:
del test['Cancelled']

In [None]:
train['Diverted'].unique()

#우회도 존재하지 않음.  제외 하자 

array([0])

In [None]:
del train['Diverted']

In [None]:
del test['Diverted']

In [None]:
train['Month'].unique()

array([ 4,  8,  9,  7,  1,  6,  3, 12, 11,  5, 10,  2])

In [None]:
train.columns

Index(['ID', 'Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Origin_Airport', 'Origin_Airport_ID',
       'Origin_State', 'Destination_Airport', 'Destination_Airport_ID',
       'Destination_State', 'Distance', 'Airline', 'Carrier_Code(IATA)',
       'Carrier_ID(DOT)', 'Tail_Number', 'Delay'],
      dtype='object')

In [None]:
train['Delay'].unique()

#Delay에 있는 결측치를 처리해야한다. 
#1. 같은 비행기를 타면 같은 결항일 것이다. 
#2. 특정 공항에서 전부 출발에 문제가 생겼다.
#3. 특정 지역에 문제가 생겼다. 
#4. 똑같은 코스는 똑같은 시간을 걸릴것이다. 

array([None, 'Not_Delayed', 'Delayed'], dtype=object)

In [None]:
# Day_of_Month: Month에 해당하는 월의 날짜
# Estimated_Departure_Time: 전산 시스템을 바탕으로 측정된 비행기의 출발 시간 (HH:MM 형식)
# Estimated_Arrival_Time: 전산 시스템을 바탕으로 측정된 비행기의 도착 시간 (HH:MM 형식)
# Cancelled: 해당 항공편의 취소 여부 (0: 취소되지 않음, 1: 취소됨)
# Diverted: 해당 항공편의 경유 여부 (0: 취소되지 않음, 1: 취소됨)
# Origin_Airport: 해당 항공편 출발 공항의 고유 코드 (IATA 공항 코드)
# Origin_Airport_ID: 해당 항공편 출발 공항의 고유 ID (US DOT ID)
# Origin_State: 해당 항공편 출발 공항이 위치한 주의 이름
# Destination_Airport: 해당 항공편 도착 공항의 고유 코드 (IATA 공항 코드)
# Destination_Airport_ID: 해당 항공편 도착 공항의 고유 ID (US DOT ID)
# Destination_State: 해당 항공편 도착 공항이 위치한 주의 이름
# Distance: 출발 공항과 도착 공항 사이의 거리 (mile 단위)
# Airline: 해당 항공편을 운항하는 항공사
# Carrier_Code(IATA): 해당 항공편을 운항하는 항공사의 고유 코드 
# (IATA 공항 코드, 단 다른 항공사가 같은 코드를 보유할 수도 있음)
# Carrier_ID(DOT): 해당 항공편을 운항하는 항공사의 고유 ID (US DOT ID)
# Tail_Number: 해당 항공편을 운항하는 항공기의 고유 등록번호
# Delay: 항공편 지연 여부 (Not_Delayed, Delayed)

## Data Pre-processing

#### Origin_State 처리 

In [None]:
##Origin_State 처리 
train['Origin_State'].isnull().sum()

109015

In [None]:
#출발 공항이 같은 면 같은 공항의 위치를 가지고 있을 것이다. 
answer = {}
for i in range(len(train['Origin_Airport'])):
    if train['Origin_State'][i]:
        answer[train['Origin_Airport'][i]] =train['Origin_State'][i]

In [None]:
## 두개 없음! 
len(answer)

374

In [None]:
len(train['Origin_Airport'])

1000000

In [None]:
temp = []
for i in range(len(train['Origin_Airport'])):
    try:
        temp.append(answer[train['Origin_Airport'][i]])
    except:
        temp.append('uknown')

train['filled_Origin_State'] = temp

In [None]:
train['filled_Origin_State'].isnull().sum()

0

In [None]:
del train['Origin_State']


In [None]:
temp_1 = []
for i in range(len(test['Origin_Airport'])):
    try:
        temp_1.append(answer[test['Origin_Airport'][i]])
    except:
        temp_1.append('uknown')

test['filled_Origin_State'] = temp_1

In [None]:
test['filled_Origin_State'].isnull().sum()

0

In [None]:
del test['Origin_State']


#### Destination_State 처리

In [None]:
train['Destination_State'].isnull().sum()

109079

In [None]:
#출발 공항이 같은 면 같은 공항의 위치를 가지고 있을 것이다. 
answer_1 = {}
for i in range(len(train['Destination_Airport'])):
    if train['Destination_State'][i]:
        answer_1[train['Destination_Airport'][i]] =train['Destination_State'][i]

In [None]:
## 두개 없음! 
len(answer_1)

374

In [None]:
temp = []
for i in range(len(train['Destination_Airport'])):
    try:
        temp.append(answer_1[train['Destination_Airport'][i]])
    except:
        temp.append('uknown')

train['filled_Destination_State'] = temp

In [None]:
train['filled_Destination_State'].isnull().sum()

0

In [None]:
temp_1 = []
for i in range(len(test['Origin_Airport'])):
    try:
        temp_1.append(answer_1[test['Origin_Airport'][i]])
    except:
        temp_1.append('uknown')

test['filled_Destination_State'] = temp_1

In [None]:
test['filled_Destination_State'].isnull().sum()

0

In [None]:
del test['Destination_State']
del train['Destination_State']

In [None]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Destination_Airport,Destination_Airport_ID,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,filled_Origin_State,filled_Destination_State
0,TRAIN_000000,4,15,,,OKC,13851,HOU,12191,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,Oklahoma,Texas
1,TRAIN_000001,8,15,740.0,1024.0,ORD,13930,SLC,14869,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,,Illinois,Utah
2,TRAIN_000002,9,6,1610.0,1805.0,CLT,11057,LGA,12953,544.0,American Airlines Inc.,AA,19805.0,N103US,,North Carolina,New York
3,TRAIN_000003,7,10,905.0,1735.0,LAX,12892,EWR,11618,2454.0,United Air Lines Inc.,UA,,N595UA,,California,New Jersey
4,TRAIN_000004,1,11,900.0,1019.0,SFO,14771,ACV,10157,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,,California,California
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,ORD,13930,PHL,14100,678.0,United Air Lines Inc.,UA,19977.0,N477UA,,Illinois,Pennsylvania
999996,TRAIN_999996,5,30,920.0,1028.0,FAR,11637,MSP,13487,223.0,SkyWest Airlines Inc.,DL,,N439SW,,North Dakota,Minnesota
999997,TRAIN_999997,6,28,800.0,1340.0,OAK,13796,HOU,12191,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,,California,Texas
999998,TRAIN_999998,9,27,1613.0,1824.0,BNA,10693,ATL,10397,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,,Tennessee,Georgia


#### Airline 처리 

In [None]:
train[train['Airline'].isnull()]['Carrier_ID(DOT)']

7         20304.0
17        20452.0
18        20046.0
32        20366.0
56        20304.0
           ...   
999976    19687.0
999982    19805.0
999986    20500.0
999993    20436.0
999999    19790.0
Name: Carrier_ID(DOT), Length: 108920, dtype: float64

In [None]:
temp = train[train['Airline'].isnull()]['Carrier_ID(DOT)'].unique()

In [None]:
air = {}
for i in range(len(temp)):
    for x in train[train['Carrier_ID(DOT)'] == temp[i]]['Airline'].unique():
            if x:
                air[temp[i]] = x

In [None]:
air

{20304.0: 'SkyWest Airlines Inc.',
 20452.0: 'Republic Airlines',
 20046.0: 'Air Wisconsin Airlines Corp',
 20366.0: 'ExpressJet Airlines Inc.',
 19393.0: 'Southwest Airlines Co.',
 19977.0: 'United Air Lines Inc.',
 19790.0: 'Delta Air Lines Inc.',
 20368.0: 'Allegiant Air',
 20398.0: 'Envoy Air',
 19805.0: 'American Airlines Inc.',
 20427.0: 'Capital Cargo International',
 20409.0: 'JetBlue Airways',
 19687.0: 'Horizon Air',
 19930.0: 'Alaska Airlines Inc.',
 19690.0: 'Hawaiian Airlines Inc.',
 20397.0: 'Comair Inc.',
 20416.0: 'Spirit Air Lines',
 20363.0: 'Endeavor Air Inc.',
 21167.0: 'Compass Airlines',
 20378.0: 'Mesa Airlines Inc.',
 20436.0: 'Frontier Airlines Inc.',
 20237.0: 'Trans States Airlines',
 20500.0: 'GoJet Airlines, LLC d/b/a United Express',
 20445.0: 'Commutair Aka Champlain Enterprises, Inc.',
 21171.0: 'Virgin America',
 20253.0: 'Cape Air',
 20225.0: 'Peninsula Airways Inc.',
 20263.0: 'Empire Airlines Inc.'}

In [None]:
for key, value in air.items():
    train.loc[train[(train['Airline'].isnull()) & (train['Carrier_ID(DOT)'] == key)].index, 'Airline'] = value
    test.loc[test[(test['Airline'].isnull()) & (test['Carrier_ID(DOT)'] == key)].index, 'Airline'] = value



In [None]:
temp2 = train[train['Airline'].isnull()]['Tail_Number'].unique()
len(temp2)

4880

In [None]:
temp2[0]

'N461AS'

In [None]:
tail_num = {}
for i in range(len(temp2)):
    for x in train[train['Tail_Number'] == temp2[i]]['Airline'].unique():
            if x:
                tail_num[temp2[i]] = x

In [None]:
tail_num

{'N461AS': 'Alaska Airlines Inc.',
 'N509NK': 'Spirit Air Lines',
 'N8886A': 'Endeavor Air Inc.',
 'N534EA': 'Comair Inc.',
 'N918DH': 'Delta Air Lines Inc.',
 '242NV': 'Allegiant Air',
 'N988AL': 'American Airlines Inc.',
 'N7861J': 'Southwest Airlines Co.',
 'N466UA': 'United Air Lines Inc.',
 'N723TW': 'Delta Air Lines Inc.',
 'N8520Q': 'Southwest Airlines Co.',
 'N986DL': 'Delta Air Lines Inc.',
 'N910DU': 'Delta Air Lines Inc.',
 'N8647A': 'Southwest Airlines Co.',
 'N976AN': 'American Airlines Inc.',
 'N340DN': 'Delta Air Lines Inc.',
 'N985DL': 'Delta Air Lines Inc.',
 'N654AE': 'Capital Cargo International',
 'N483HA': 'Hawaiian Airlines Inc.',
 'N717SA': 'Southwest Airlines Co.',
 'N285WN': 'Southwest Airlines Co.',
 'N14904': 'ExpressJet Airlines Inc.',
 'N321DH': 'Delta Air Lines Inc.',
 'N848AE': 'Envoy Air',
 'N804AW': 'American Airlines Inc.',
 'N571JB': 'JetBlue Airways',
 'N841UA': 'United Air Lines Inc.',
 'N600BP': 'Capital Cargo International',
 'N260JS': 'Comair Inc

In [None]:
for key, value in tail_num.items():
    train.loc[train[(train['Airline'].isnull()) & (train['Tail_Number'] == key)].index, 'Airline'] = value
    test.loc[test[(test['Airline'].isnull()) & (test['Tail_Number'] == key)].index, 'Airline'] = value

In [None]:
train[train['Airline'].isnull()]

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Destination_Airport,Destination_Airport_ID,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,filled_Origin_State,filled_Destination_State
382926,TRAIN_382926,10,31,1740.0,,HNL,12173,ATL,10397,4502.0,,DL,,N867DA,,Hawaii,Georgia
856500,TRAIN_856500,1,13,2205.0,705.0,OGG,13830,SLC,14869,2935.0,,DL,,N153DL,,Hawaii,Utah


In [None]:
train['Airline'].isnull().sum(), test['Airline'].isnull().sum()

(2, 1719)

### Carrier_Code(IATA) 처리 


In [None]:
## carrier_id와 같은 의미라고 판단 지우고 시작한다. 
del train['Carrier_Code(IATA)']
del test['Carrier_Code(IATA)']


### Carrier_ID(DOT) 처리

In [None]:
temp3 = train[train['Carrier_ID(DOT)'].isnull()]['Airline'].unique()
temp3

array(['United Air Lines Inc.', 'Republic Airlines',
       'Southwest Airlines Co.', 'SkyWest Airlines Inc.',
       'Endeavor Air Inc.', 'Comair Inc.', 'Mesa Airlines Inc.',
       'Delta Air Lines Inc.', 'JetBlue Airways',
       'American Airlines Inc.', 'Frontier Airlines Inc.', 'Envoy Air',
       'Alaska Airlines Inc.', 'Air Wisconsin Airlines Corp',
       'Horizon Air', 'Spirit Air Lines',
       'GoJet Airlines, LLC d/b/a United Express',
       'ExpressJet Airlines Inc.', 'Trans States Airlines',
       'Hawaiian Airlines Inc.', 'Allegiant Air',
       'Commutair Aka Champlain Enterprises, Inc.', 'Compass Airlines',
       'Capital Cargo International', 'Empire Airlines Inc.',
       'Peninsula Airways Inc.', 'Virgin America', 'Cape Air', None],
      dtype=object)

In [None]:
airline_num = {}
for i in range(len(temp3)):
    for x in train[train['Airline'] == temp3[i]]['Carrier_ID(DOT)'].unique():
            if not np.isnan(x):
                airline_num[temp3[i]] = x

In [None]:
airline_num

{'United Air Lines Inc.': 19977.0,
 'Republic Airlines': 20452.0,
 'Southwest Airlines Co.': 19393.0,
 'SkyWest Airlines Inc.': 20304.0,
 'Endeavor Air Inc.': 20363.0,
 'Comair Inc.': 20397.0,
 'Mesa Airlines Inc.': 20378.0,
 'Delta Air Lines Inc.': 19790.0,
 'JetBlue Airways': 20409.0,
 'American Airlines Inc.': 19805.0,
 'Frontier Airlines Inc.': 20436.0,
 'Envoy Air': 20398.0,
 'Alaska Airlines Inc.': 19930.0,
 'Air Wisconsin Airlines Corp': 20046.0,
 'Horizon Air': 19687.0,
 'Spirit Air Lines': 20416.0,
 'GoJet Airlines, LLC d/b/a United Express': 20500.0,
 'ExpressJet Airlines Inc.': 20366.0,
 'Trans States Airlines': 20237.0,
 'Hawaiian Airlines Inc.': 19690.0,
 'Allegiant Air': 20368.0,
 'Commutair Aka Champlain Enterprises, Inc.': 20445.0,
 'Compass Airlines': 21167.0,
 'Capital Cargo International': 20427.0,
 'Empire Airlines Inc.': 20263.0,
 'Peninsula Airways Inc.': 20225.0,
 'Virgin America': 21171.0,
 'Cape Air': 20253.0}

In [None]:
for key, value in airline_num.items():
    train.loc[train[(train['Carrier_ID(DOT)'].isnull()) & (train['Airline'] == key)].index, 'Carrier_ID(DOT)'] = value
    test.loc[test[(test['Carrier_ID(DOT)'].isnull()) & (test['Airline'] == key)].index, 'Carrier_ID(DOT)'] = value

In [None]:
train.iloc[116764, 11] = 20409.0


In [None]:
train['Carrier_ID(DOT)'].isnull().sum(), test['Carrier_ID(DOT)'].isnull().sum()

(2, 1719)

### Tail_Number 처리

In [None]:
train['Tail_Number'].isnull().sum()

0

In [None]:
test['Tail_Number'].isnull().sum()

0

In [None]:
train_keep = train 
test_kepp = test 

In [None]:
## 별로 없기에 최빈값으로 처리 --> 
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  891016 non-null   float64
 4   Estimated_Arrival_Time    890952 non-null   float64
 5   Origin_Airport            1000000 non-null  object 
 6   Origin_Airport_ID         1000000 non-null  int64  
 7   Destination_Airport       1000000 non-null  object 
 8   Destination_Airport_ID    1000000 non-null  int64  
 9   Distance                  1000000 non-null  float64
 10  Airline                   998281 non-null   object 
 11  Carrier_ID(DOT)           998281 non-null   float64
 12  Tail_Number               1000000 non-null  object 
 13  filled_Origin_State       10

###  Estimated_Departure_Time 처리

In [None]:
train[train['Estimated_Departure_Time'].isnull()]

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Destination_Airport,Destination_Airport_ID,Distance,Airline,Carrier_ID(DOT),Tail_Number,Delay,filled_Origin_State,filled_Destination_State
0,TRAIN_000000,4,15,,,OKC,13851,HOU,12191,419.0,Southwest Airlines Co.,19393.0,N7858A,,Oklahoma,Texas
25,TRAIN_000025,12,24,,2020.0,STS,15023,SEA,14747,618.0,Horizon Air,19687.0,N451QX,,California,Washington
74,TRAIN_000074,11,15,,749.0,BGR,10581,DCA,11278,590.0,Comair Inc.,20397.0,N215PS,Not_Delayed,Maine,Virginia
100,TRAIN_000100,12,12,,1446.0,IAH,12266,ORD,13930,925.0,United Air Lines Inc.,19977.0,N896UA,,Texas,Illinois
102,TRAIN_000102,8,30,,2139.0,DAB,11252,CLT,11057,416.0,Comair Inc.,20397.0,N522AE,,Florida,North Carolina
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999974,TRAIN_999974,4,4,,859.0,SFO,14771,LAX,12892,337.0,Delta Air Lines Inc.,19790.0,N3768,,California,California
999977,TRAIN_999977,6,22,,2310.0,BNA,10693,MDW,13232,395.0,Southwest Airlines Co.,19393.0,N929WN,,Tennessee,Illinois
999986,TRAIN_999986,6,13,,1356.0,RDU,14492,ORD,13930,646.0,"GoJet Airlines, LLC d/b/a United Express",20500.0,N669CA,,North Carolina,Illinois
999990,TRAIN_999990,1,21,,751.0,ORD,13930,DFW,11298,802.0,American Airlines Inc.,19805.0,N967NN,,Illinois,Texas


In [None]:
train.iloc[train[train['Estimated_Departure_Time'].isnull()].index[1]]

ID                          TRAIN_000025
Month                                 12
Day_of_Month                          24
Estimated_Departure_Time             NaN
Estimated_Arrival_Time            2020.0
Origin_Airport                       STS
Origin_Airport_ID                  15023
Destination_Airport                  SEA
Destination_Airport_ID             14747
Distance                           618.0
Airline                      Horizon Air
Carrier_ID(DOT)                  19687.0
Tail_Number                       N451QX
Delay                               None
filled_Origin_State           California
filled_Destination_State      Washington
Name: 25, dtype: object

In [None]:
(x1 - x2).min()


nan

In [None]:
idx1 = train['Origin_Airport_ID'].unique()  

In [None]:
idx2 = train['Destination_Airport_ID'].unique()

In [None]:
 


train[(train['Origin_Airport_ID'] == 15356) & (train['Destination_Airport_ID'] == 14082) ]

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Destination_Airport,Destination_Airport_ID,Distance,Airline,Carrier_ID(DOT),Tail_Number,Delay,filled_Origin_State,filled_Destination_State
55551,TRAIN_055551,4,19,1636.0,,TTN,15356,PGD,14082,1008.0,Allegiant Air,20368.0,253NV,,New Jersey,Florida
345599,TRAIN_345599,4,29,1636.0,1917.0,TTN,15356,PGD,14082,1008.0,Allegiant Air,20368.0,221NV,,New Jersey,Florida
530020,TRAIN_530020,4,12,1636.0,1917.0,TTN,15356,PGD,14082,1008.0,Allegiant Air,20368.0,253NV,,New Jersey,Florida


In [None]:
##출발 도착 시간에 특징을 찾기 힘들었음. 
## 같은 요일이면 시간이 같나 ? X
## 같은 도착지 같은 출발지면 시간이 같나 ? X
## 같은 거리면 시간이 같나 ? X
## 따라서 새로운 변수인 평균 거리만 지정한다. 
## NAN가 있는곳은 최빈값을 주고 계산을 한다. 



temp = np.zeros(len(train))
train['Delay'] =temp
train['Delay']

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
999995    0.0
999996    0.0
999997    0.0
999998    0.0
999999    0.0
Name: Delay, Length: 1000000, dtype: float64

In [None]:
idx_total = []
for x in idx1:
    for y in idx2:
        if len(train[(train['Origin_Airport_ID'] == x) & (train['Destination_Airport_ID'] == y)]):
            x1 = train[(train['Origin_Airport_ID'] == x) & (train['Destination_Airport_ID'] == y) ]['Estimated_Arrival_Time']
            x2 = train[(train['Origin_Airport_ID'] == x) & (train['Destination_Airport_ID'] == y) ]['Estimated_Departure_Time']
            temp_idx = (x1 - x2).index
            try:
                md = (x1 - x2).mode()[0]
            except:
                md = np.nan

            for k,z in enumerate(temp_idx):
                if not np.isnan((x1 - x2)[z]):
                    if (x1 - x2)[z] > md:
                        train.loc[z, 'delay'] = 1
                    else:
                        train.loc[z, 'delay'] = 0
                        
                else:
                    train.loc[z, 'delay'] = 0
            



In [None]:
train['delay'].isnull().sum()

0

In [None]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Destination_Airport,Destination_Airport_ID,Distance,Airline,Carrier_ID(DOT),Tail_Number,Delay,filled_Origin_State,filled_Destination_State,delay
0,TRAIN_000000,4,15,,,OKC,13851,HOU,12191,419.0,Southwest Airlines Co.,19393.0,N7858A,0.0,Oklahoma,Texas,0.0
1,TRAIN_000001,8,15,740.0,1024.0,ORD,13930,SLC,14869,1250.0,SkyWest Airlines Inc.,20304.0,N125SY,0.0,Illinois,Utah,0.0
2,TRAIN_000002,9,6,1610.0,1805.0,CLT,11057,LGA,12953,544.0,American Airlines Inc.,19805.0,N103US,0.0,North Carolina,New York,1.0
3,TRAIN_000003,7,10,905.0,1735.0,LAX,12892,EWR,11618,2454.0,United Air Lines Inc.,19977.0,N595UA,0.0,California,New Jersey,1.0
4,TRAIN_000004,1,11,900.0,1019.0,SFO,14771,ACV,10157,250.0,SkyWest Airlines Inc.,20304.0,N161SY,0.0,California,California,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,ORD,13930,PHL,14100,678.0,United Air Lines Inc.,19977.0,N477UA,0.0,Illinois,Pennsylvania,1.0
999996,TRAIN_999996,5,30,920.0,1028.0,FAR,11637,MSP,13487,223.0,SkyWest Airlines Inc.,20304.0,N439SW,0.0,North Dakota,Minnesota,0.0
999997,TRAIN_999997,6,28,800.0,1340.0,OAK,13796,HOU,12191,1642.0,Southwest Airlines Co.,19393.0,N230WN,0.0,California,Texas,0.0
999998,TRAIN_999998,9,27,1613.0,1824.0,BNA,10693,ATL,10397,214.0,Delta Air Lines Inc.,19790.0,N968DL,0.0,Tennessee,Georgia,1.0


In [None]:
del train['ID']
del train['Estimated_Departure_Time']
del train['Estimated_Arrival_Time']
del train['Delay']

In [None]:
submission_ID = test['ID']

In [None]:
del test['ID']
del test['Estimated_Departure_Time']
del test['Estimated_Arrival_Time']


### 나머지 결측값 처리 

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 16 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Origin_Airport            1000000 non-null  object 
 6   Origin_Airport_ID         1000000 non-null  int64  
 7   Destination_Airport       1000000 non-null  object 
 8   Destination_Airport_ID    1000000 non-null  int64  
 9   Distance                  1000000 non-null  float64
 10  Airline                   999998 non-null   object 
 11  Carrier_ID(DOT)           999998 non-null   float64
 12  Tail_Number               1000000 non-null  object 
 13  Delay                     25

In [None]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  891016 non-null   float64
 4   Estimated_Arrival_Time    890952 non-null   float64
 5   Origin_Airport            1000000 non-null  object 
 6   Origin_Airport_ID         1000000 non-null  int64  
 7   Destination_Airport       1000000 non-null  object 
 8   Destination_Airport_ID    1000000 non-null  int64  
 9   Distance                  1000000 non-null  float64
 10  Airline                   998281 non-null   object 
 11  Carrier_ID(DOT)           998281 non-null   float64
 12  Tail_Number               1000000 non-null  object 
 13  filled_Origin_State       10

In [None]:
NaN_col = ['Airline', 'Carrier_ID(DOT)', 'Estimated_Departure_Time', 'Estimated_Arrival_Time']

for col in NaN_col:
    mode = train[col].mode()[0]

    test[col] = test[col].fillna(mode)

In [None]:
test.info()

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 16 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Origin_Airport            1000000 non-null  object 
 6   Origin_Airport_ID         1000000 non-null  int64  
 7   Destination_Airport       1000000 non-null  object 
 8   Destination_Airport_ID    1000000 non-null  int64  
 9   Distance                  1000000 non-null  float64
 10  Airline                   999998 non-null   object 
 11  Carrier_ID(DOT)           999998 non-null   float64
 12  Tail_Number               1000000 non-null  object 
 13  Delay                     25

In [None]:
train = train.dropna()


In [None]:
train_x = train.drop(columns=['Delay'])
train_y = train['Delay']


In [None]:
test_x = test

## 데이터 라벨링 인코딩 

In [None]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport',  'Destination_Airport',  'Airline', 'Tail_Number', 'filled_Origin_State', 'filled_Destination_State']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train_x[i])
    train_x[i]=le.transform(train_x[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i]=le.transform(test[i])
print('Done.')

Done.


In [None]:
le = le.fit(train_y)
train_y = le.transform(train_y)



In [None]:
del train_x['ID']
del test['ID']

## 모델링

In [None]:
clf = RandomForestClassifier()
clf.fit(train_x, train_y)

In [None]:
ada = AdaBoostClassifier()
ada.fit(train_x, train_y)

In [None]:
LR = LogisticRegression()
LR.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
SGD = SGDClassifier(loss = 'log_loss')
SGD.fit(train_x, train_y)



In [None]:
XGB =xgb.XGBClassifier(n_estimators=50,
                       learning_rate=0.1,
                       max_depth=6)




In [None]:
XGB.fit(train_x, train_y)

In [None]:
LGBM = LGBMClassifier(n_estimators=200,
                      max_depth = 15,
                      min_child_samples = 40,
                      learning_rate = 0.01)

LGBM.fit(train_x, train_y)

In [None]:
NB = GaussianNB()
NB.fit(train_x, train_y)

## inference

In [None]:
y_pred = clf.predict_proba(test_x)

In [None]:
y_prd = clf.predict(train_x)

In [None]:
accuracy = accuracy_score(train_y, y_prd)
f1 = f1_score(train_y, y_prd, average='weighted')
precision = precision_score(train_y, y_prd, average='weighted')
recall = recall_score(train_y, y_prd, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')


Accuracy: 0.9985739971479943
F1 Score: 0.9985739097350419
Precision: 0.9985739173421143
Recall: 0.9985739971479943


In [None]:
y_pred_ada = clf.predict_proba(test_x)

In [None]:
y_pred_LR = LR.predict_proba(test_x)

In [None]:
y_pred_SGD = SGD.predict_proba(test_x)

In [None]:
y_pred_XGB = XGB.predict_proba(test_x)

In [None]:
y_pred_LightGBM =LGBM.predict_proba(test_x)

In [None]:
y_pred_NB =NB.predict_proba(test_x)

In [None]:
RF = RandomForestClassifier(criterion = 'log_loss', n_estimators = 200, max_depth = 20)
ada = AdaBoostClassifier(n_estimators = 200)
LR = LogisticRegression()
SGD = SGDClassifier(loss = 'log_loss')
XGB =xgb.XGBClassifier(n_estimators=200, max_depth=6)
LGBM = LGBMClassifier(n_estimators=200, max_depth = 15, min_child_samples = 40)
NB = GaussianNB()


ensemble_model = VotingClassifier(estimators=[
            ('rf', RF), ('ada', ada), ('lr', LR), ('SGD', SGD), ('XGB', XGB), ('LGBM', LGBM), ('NB', NB)], voting='soft')


In [None]:
ensemble_model = ensemble_model.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred_ensembel = ensemble_model.predict_proba(test_x)

In [None]:
RF = RandomForestClassifier(criterion = 'log_loss', n_estimators = 200, max_depth = 20)
ada = AdaBoostClassifier(n_estimators = 200)
LR = LogisticRegression()
#SGD = SGDClassifier(loss = 'log_loss')
XGB =xgb.XGBClassifier(n_estimators=200, max_depth=6)
LGBM = LGBMClassifier(n_estimators=200, max_depth = 15, min_child_samples = 40)
NB = GaussianNB()


ensemble_model_2 = VotingClassifier(estimators=[
            ('rf', RF), ('ada', ada), ('lr', LR),  ('XGB', XGB), ('LGBM', LGBM), ('NB', NB)], voting='soft')


In [None]:
ensemble_model_2 = ensemble_model_2.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred_ensembel_2 = ensemble_model_2.predict_proba(test_x)

## Submission

In [None]:
sample_submission = pd.read_csv('/content/sample_submission.csv', index_col = 0)

In [None]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_ada, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_ada.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_LR, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_LR.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_SGD, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_SGD.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_XGB, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_XGB.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_LightGBM, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_LightGBM.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_NB, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_NB.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_ensembel, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_ensemble.csv', index=True)

In [None]:
submission = pd.DataFrame(data=y_pred_ensembel_2, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_ensemble_2.csv', index=True)

In [None]:
submission

Unnamed: 0_level_0,Not_Delayed,Delayed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
TEST_000000,0.91,0.09
TEST_000001,0.78,0.22
TEST_000002,0.70,0.30
TEST_000003,0.53,0.47
TEST_000004,0.97,0.03
...,...,...
TEST_999995,0.42,0.58
TEST_999996,0.96,0.04
TEST_999997,0.69,0.31
TEST_999998,0.66,0.34


## 준지도 학습을 통한 구현하기 

In [None]:
# 시간 처리 이전에 멈춰있는 부분부터 시작  ~


from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns

In [None]:
# train_x, train_y
# test
#까지 구해져 있는 상황 

In [None]:
train_y.index

Int64Index([     6,      8,     10,     12,     13,     19,     32,     34,
                36,     42,
            ...
            999940, 999941, 999943, 999950, 999952, 999955, 999963, 999969,
            999985, 999992],
           dtype='int64', length=202507)

In [None]:
X = train_x
y = train_y

## define the model
input_layer = Input(shape=(X.shape[1],))
encoded = Dense(100, activation='tanh', activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoded = Dense(50, activation='relu')(encoded)
decoded = Dense(50, activation='tanh')(encoded)
decoded = Dense(100, activation='tanh')(decoded)
output_layer = Dense(X.shape[1], activation='relu')(decoded)

autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adadelta", loss="mse", run_eagerly = True)

In [None]:
scaler = preprocessing.MinMaxScaler()
scaler.fit(X.values)
X_scale = scaler.transform(X.values)
test_x_scale = scaler.transform(test.values)


In [None]:
x_perished, x_survived = X_scale[y == 0], X_scale[y == 1]


In [None]:
autoencoder.fit(x_perished, x_perished, epochs = 20, shuffle = True)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fda49e4abf0>

In [None]:
hidden_representation = Sequential()
hidden_representation.add(autoencoder.layers[0])
hidden_representation.add(autoencoder.layers[1])
hidden_representation.add(autoencoder.layers[2])

In [None]:
perished_hid_rep = hidden_representation.predict(x_perished)
survived_hid_rep = hidden_representation.predict(x_survived)

rep_x = np.append(perished_hid_rep, survived_hid_rep, axis = 0)
y_n = np.zeros(perished_hid_rep.shape[0])
y_f = np.ones(survived_hid_rep.shape[0])
rep_y = np.append(y_n, y_f)



In [None]:
train_x, val_x, train_y, val_y = train_test_split(rep_x, rep_y, test_size=0.25)
clf = LogisticRegression().fit(train_x, train_y)
pred_y = clf.predict(val_x)

print (classification_report(val_y, pred_y))
print (accuracy_score(val_y, pred_y))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      8842
         1.0       0.83      1.00      0.90     41785

    accuracy                           0.83     50627
   macro avg       0.41      0.50      0.45     50627
weighted avg       0.68      0.83      0.75     50627

0.8253501096252988


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
test_rep_x.shape

(1000000, 50)

In [None]:
test_rep_x = hidden_representation.predict(test_x_scale)




In [None]:
temp = clf.predict_proba(test_rep_x)
temp

array([[0.16097187, 0.83902813],
       [0.20129108, 0.79870892],
       [0.2167734 , 0.7832266 ],
       ...,
       [0.23122836, 0.76877164],
       [0.10070375, 0.89929625],
       [0.14296672, 0.85703328]])

In [None]:
temp.shape

(1000000, 2)

In [None]:
submission = pd.DataFrame(data=temp, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_autoencoder.csv', index=True)

In [None]:
RF = RandomForestClassifier(criterion = 'log_loss', n_estimators = 200, max_depth = 20)
ada = AdaBoostClassifier(n_estimators = 200)
LR = LogisticRegression()
#SGD = SGDClassifier(loss = 'log_loss')
XGB =xgb.XGBClassifier(n_estimators=200, max_depth=6)
LGBM = LGBMClassifier(n_estimators=200, max_depth = 15, min_child_samples = 40)
NB = GaussianNB()


ensemble_model_2 = VotingClassifier(estimators=[
            ('rf', RF), ('ada', ada), ('lr', LR),  ('XGB', XGB), ('LGBM', LGBM), ('NB', NB)], voting='soft')


In [None]:
ensemble_model_2 = ensemble_model_2.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
temp = clf.predict_proba(test_rep_x)
temp

array([[0.16097187, 0.83902813],
       [0.20129108, 0.79870892],
       [0.2167734 , 0.7832266 ],
       ...,
       [0.23122836, 0.76877164],
       [0.10070375, 0.89929625],
       [0.14296672, 0.85703328]])

In [None]:
submission = pd.DataFrame(data=temp, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_ensemble_autoencoder.csv', index=True)