<a href="https://colab.research.google.com/github/todoo123/todoo123/blob/circleci-project-setup/escfinalproject_SSL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
# !pip install category_encoders
import category_encoders as ce

In [2]:
train = pd.read_parquet('/content/train.parquet')
test = pd.read_parquet('/content/test.parquet')

In [3]:
def days_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    # day 변수 처리, month, day를 하나의 변수로 합치고자함 
    def to_days(x):
        month_to_days = {1:0, 2:31, 3:60, 4:91, 5:121, 6:152, 7:182, 8:213, 9:244, 10:274, 11:305, 12:335}
        return month_to_days[x]

    train.loc[:, 'Day'] = train['Month'].apply(lambda x: to_days(x))
    train['Day'] = train['Day'] + train['Day_of_Month']

    test.loc[:, 'Day'] = test['Month'].apply(lambda x: to_days(x))
    test['Day'] = test['Day'] + test['Day_of_Month']

    train = train.astype({'Day':object})
    test = test.astype({'Day':object})

    print("Day Done.")
    return train, test


In [4]:
def cid_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    airline_to_cid = {}
    for _, row in train[(~train['Carrier_ID(DOT)'].isnull() & ~train['Airline'].isnull())].iterrows():
        airline_to_cid[row['Airline']] = row['Carrier_ID(DOT)']
    # 복구하기
    def to_cid(x):
        return airline_to_cid[x]

    cond1 = train['Carrier_ID(DOT)'].isnull()
    cond2 = ~train['Airline'].isnull()
    train.loc[cond1&cond2, 'Carrier_ID(DOT)'] = train.loc[cond1&cond2, 'Airline'].apply(lambda x: to_cid(x))

    train = train.dropna(subset=['Carrier_ID(DOT)'], how='any', axis=0)

    # (Test Data Only)
    # Airline, Carrier_Code 둘 다 없으면 최빈 값으로 대체
    NaN_col = ['Carrier_ID(DOT)']
    cond1 = test['Airline'].isnull()
    cond2 = test['Carrier_ID(DOT)'].isnull()

    for col in NaN_col:
        mode = test[col].mode()[0]
        test.loc[cond1&cond2, col] = mode

    # 나머진 Airline에서 대체
    cond1 = test['Carrier_ID(DOT)'].isnull()
    cond2 = ~test['Airline'].isnull()
    test.loc[cond1&cond2, 'Carrier_ID(DOT)'] = test.loc[cond1&cond2, 'Airline'].apply(lambda x: to_cid(x))

    print("Cid Done.")
    
    return train, test


In [5]:
def drop_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    
    col_drop = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 'Origin_Airport', 'Destination_Airport', 'Carrier_Code(IATA)', 'Airline', 'Origin_State', 'Destination_State']
    train = train.drop(col_drop, axis=1)
    test = test.drop(col_drop, axis=1)
    print("Drop Done.")
    return train, test

In [6]:
def EAD_EDT_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    
    # Estimated Departure Time (EDT), Estimated Arrival Time (EAT) 복구
    # 출발하는 공항과 도착하는 공항의 평균 비행시간을 이용하여 복구
    def to_minutes(x):
        x = int(x)
        x = str(x)
        if len(x) > 2:
            hours, mins = int(x[:-2]), int(x[-2:])
        else:
            hours, mins = 0, int(x[-2:])
        return hours*60+mins

    estimated_times = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']

    for ET in estimated_times:
        cond = ~train[ET].isnull()
        train.loc[cond, ET] = train.loc[cond, ET].apply(lambda x: to_minutes(x))
        cond2 = ~test[ET].isnull()
        test.loc[cond2, ET] = test.loc[cond2, ET].apply(lambda x: to_minutes(x))

    train = train.dropna(subset=['Estimated_Arrival_Time', 'Estimated_Departure_Time'], how ='all', axis=0)

    time_flying = defaultdict(int)
    time_number = defaultdict(int)

    cond_arr2 = ~train['Estimated_Arrival_Time'].isnull()
    cond_dep2 = ~train['Estimated_Departure_Time'].isnull()

    for _, row in train.loc[cond_arr2 & cond_dep2, :].iterrows():
        OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
        time_flying[(OAID,DAID)] += (row['Estimated_Arrival_Time'] - row['Estimated_Departure_Time'])%1440 # 하루 최대는 1440분
        time_number[(OAID,DAID)] += 1


    for key in time_flying.keys():
        time_flying[key] /= time_number[key]

    for index, row in train.loc[train['Estimated_Departure_Time'].isnull(),].iterrows():
        OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
        train.loc[index,'Estimated_Departure_Time'] = \
            (train.loc[index]['Estimated_Arrival_Time'] - time_flying[(OAID, DAID)])%1440

    for index, row in train.loc[train['Estimated_Arrival_Time'].isnull(),].iterrows():
        OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
        train.loc[index,'Estimated_Arrival_Time'] = \
            (train.loc[index]['Estimated_Departure_Time'] + time_flying[(OAID, DAID)])%1440
        
    # (Test Data Only)
    # 둘 다 없으면 최빈값으로 대체
    cond_1 = test['Estimated_Departure_Time'].isnull()
    cond_2 = test['Estimated_Arrival_Time'].isnull()

    mode = test['Estimated_Departure_Time'].mode()[0]
    mode2 = test['Estimated_Arrival_Time'].mode()[0]
    test.loc[cond_1&cond_2, ['Estimated_Departure_Time', 'Estimated_Arrival_Time']] = mode, mode2


    # Departure만 없을 때,
    for index, row in test.loc[test['Estimated_Departure_Time'].isnull(),].iterrows():
        OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
        test.loc[index,'Estimated_Departure_Time'] = \
            (test.loc[index]['Estimated_Arrival_Time'] - time_flying[(OAID, DAID)])%1440


    # Arrival만 없을 때,
    for index, row in test.loc[test['Estimated_Arrival_Time'].isnull(),].iterrows():
        OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
        test.loc[index,'Estimated_Arrival_Time'] = \
            (test.loc[index]['Estimated_Departure_Time'] + time_flying[(OAID, DAID)])%1440


    # 모두 int로 바꾼다.
    estimated_times = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']
    train = train.astype({'Estimated_Departure_Time':int, 'Estimated_Arrival_Time':int})
    test = test.astype({'Estimated_Departure_Time':int, 'Estimated_Arrival_Time':int})
    for ET in estimated_times:
        train.loc[train[ET] == 1440, ET] = 0
        test.loc[test[ET] == 1440, ET] = 0

    # EDT, EAT 48개의 bins에 담으면 된다. 1440(60*24) 계니까, 48씩 끊어서 하면 될 듯
    estimate_times = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']
    names = {'Estimated_Departure_Time':'EDT', 'Estimated_Arrival_Time':'EAT'}
    for ET in estimated_times:
        for i in range(48):
            train.loc[train[ET].between(i*30, (i+1)*30, 'left'), names[ET]] = i
            test.loc[test[ET].between(i*30, (i+1)*30, 'left'), names[ET]] = i

    train = train.astype({'EDT':int, 'EAT':int})
    test = test.astype({'EDT':int, 'EAT':int})

    train = train.drop(['Estimated_Departure_Time', 'Estimated_Arrival_Time'], axis=1)
    test = test.drop(['Estimated_Departure_Time', 'Estimated_Arrival_Time'], axis=1)
    
    print("EAT_EDT Done.")
    return train, test

In [19]:
def travel_time_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    
    train['Estimated_Travel_Time']=train.EAT-train.EDT
    test['Estimated_Travel_Time']=test.EAT-test.EDT
    def eliminate_outliers(df, a_col, b_col, threshold=1.5):
        correlation = df[a_col].corr(df[b_col])
        if correlation < 0:
            raise ValueError("negative")

        q_low = df[a_col].quantile(0.05)
        q_high = df[a_col].quantile(0.95)
        iqr = q_high - q_low

        lower_bound = q_low - threshold * iqr
        upper_bound = q_high + threshold * iqr

        filtered_df = df.copy() 
        filtered_df.loc[(df[a_col] < lower_bound) | (df[a_col] > upper_bound), a_col] = np.nan
        

        return filtered_df
    #Estimated_Travel_Time outlier np.nan으로 바뀐상태. drop.na하면 될듯
    train = eliminate_outliers(train, 'Estimated_Travel_Time', 'Distance')
    train = train.dropna(subset=['Estimated_Travel_Time'])
    test  = eliminate_outliers(test, 'Estimated_Travel_Time', 'Distance')
    
    print('Travel_Time Done')
    return train, test

In [8]:
def distance_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    for i in range(51):
        train.loc[train['Distance'].between(i*100, (i+1)*100, 'left'), 'Distance'] = i
        test.loc[test['Distance'].between(i*100, (i+1)*100, 'left'), 'Distance'] = i

    train = train.astype({'Distance':int})
    test = test.astype({'Distance':int})

    train = train.astype({'Carrier_ID(DOT)':int})
    test = test.astype({'Carrier_ID(DOT)':int})
    
    print("distance Done.")
    
    return train, test

In [9]:
def holidays_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    def holidays(x):
        # sig는 특정 공휴일이 들어 있는 구간 구분 용도 
        # sig 0은 어떤 공휴일에도 속하지 않음
        # 나머지는 위의 순서를 따름
        # 휴일 기간은 뇌피셜로 공휴일을 가운데 둔 5일 정도 잡음
        sig = 0
        if 364<=x or x<=2:  #sig =1 = 양력 설날
            sig = 1
        # 음력 설날은 계속 변해서 어떻게 해야 할지 모르겠음 일단 sig=2=음력설날
    #     if 58<= x and x<=62:#sig=3= 사육제
    #         sig = 3
    #     if 78<= x and x<=82: # 춘분 시즌
    #         sig= 4
    #     if 91<= x and x<=95: #라마단
    #         sig = 5
    #     if 104<= x and x<=108: # 성목요일,좋은 금요일, 성 토요일, 부활절 합침
    #         sig = 6
    #     if 122<= x and x<=126: # Eid al-Fitr
    #         sig = 7
    #     if 143<= x and x<=147: # 그리스도 승천일
    #         sig = 8
        if 174<= x and x<=178: # 여행 시즌
            sig = 9
        if 184<= x and x<=188: # 각 나라 독립 기념일
            sig = 10
    #     if 189<= x and x<=193: # 장난의 향연
    #         sig = 11
    #     if 226<= x and x<=230: #마리아의 가정 
    #         sig = 12
    #     if 266<= x and x<=270: # 춘분 시즌
    #         sig = 13
    #     if 279<= x and x<=283: #예언자 무함마드 탄생일 
    #         sig = 14
    #     if 303<= x and x<=307: # 모든 성도의 하루, 위령일 합침
    #         sig = 15
        if 357<= x and x<=361: # 크리스마스 여행 시즌
            sig = 16
        return sig
    
    train.loc[:, 'Holidays'] = train['Day'].apply(lambda x: holidays(x))


    test.loc[:, 'Holidays'] = test['Day'].apply(lambda x: holidays(x))


    train = train.astype({'Holidays':object})
    test = test.astype({'Holidays':object})

    print("Holidays Done.")
    
    return train, test

In [10]:
def airport_mean_encode_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    
    df = train[train['Delay'].notnull()][['Destination_Airport_ID','Origin_Airport_ID','Delay']].copy()
    df['Delay'] = df['Delay'].apply(lambda x: 1 if x =="Delayed" else 0)
    df_grouped = df.groupby(['Destination_Airport_ID','Origin_Airport_ID']).mean()
    df_grouped.reset_index(inplace = True)

    df_grouped.rename(columns = {'Delay':'Delay_mean'},inplace = True)
    train_df = pd.merge(train, df_grouped, how = 'left',on = ['Destination_Airport_ID','Origin_Airport_ID'])
    test_df = pd.merge(test, df_grouped, how = 'left',on = ['Destination_Airport_ID','Origin_Airport_ID'])
    
    test_df[test_df['Delay_mean'].isnull()] = 0  
    train_df[train_df['Delay_mean'].isnull()] = 0
    
    train_df['Delay_mean'] = train_df['Delay_mean'].apply(lambda x: int(x*100))
    test_df['Delay_mean'] = test_df['Delay_mean'].apply(lambda x: int(x*100))
    
    print('Mean encoding Done.')
    
    return train_df, test_df
    
    

In [11]:
def final_data_process(df_train, df_test):
    train = df_train.copy()
    test  = df_test.copy()
    
    train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: 1 if x=="Delayed" else 0)

    train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
    train_y = train['Delay_num']
    test_x = test.drop(columns=['ID'])

    print('Training Prepared.')
    return train_x, train_y, test_x

In [12]:
# sample_submission = pd.read_csv('./sample_submission.csv')
# test   = pd.read_csv('./test.csv')
# train   = pd.read_csv('./train.csv')

FileNotFoundError: ignored

In [57]:
train_1, test_1 = days_process(train, test)
train_2, test_2 = cid_process(train_1, test_1)
train_3, test_3 = drop_process(train_2, test_2)
train_4, test_4 = EAD_EDT_process(train_3, test_3)
train_5, test_5 = travel_time_process(train_4, test_4)
train_6, test_6 = distance_process(train_5, test_5)
train_7, test_7 = holidays_process(train_6, test_6)
train_8, test_8 = airport_mean_encode_process(train_7, test_7)

Day Done.
Cid Done.
Drop Done.
EAT_EDT Done.
Travel_Time Done
distance Done.
Holidays Done.
Mean encoding Done.


In [42]:
train_8

Unnamed: 0,ID,Origin_Airport_ID,Destination_Airport_ID,Distance,Carrier_ID(DOT),Tail_Number,Delay,Day,EDT,EAT,Estimated_Travel_Time,Holidays,Delay_mean
0,TRAIN_000001,13930,14869,12,20304,N125SY,,228,15,20,5.0,0,21
1,TRAIN_000002,11057,12953,5,19805,N103US,,250,32,36,4.0,0,17
2,TRAIN_000003,12892,11618,24,19977,N595UA,,192,18,35,17.0,0,23
3,TRAIN_000004,14771,10157,2,20304,N161SY,,11,18,20,2.0,0,21
4,0,0,0,0,0,0,0,0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
944204,TRAIN_999995,13930,14100,6,19977,N477UA,,262,19,25,6.0,0,20
944205,TRAIN_999996,11637,13487,2,20304,N439SW,,151,18,20,2.0,0,11
944206,TRAIN_999997,13796,12191,16,19393,N230WN,,180,16,27,11.0,0,10
944207,TRAIN_999998,10693,10397,2,19790,N968DL,,271,32,36,4.0,0,14


In [43]:
train_8.info()


# Origin_Airport_ID, Destination_Airport_ID 의 관계가 Delay_mean 에 반영됨.
# Distance 는 연속형으로, 투입 가능
# Carrier_ID(DOT) 범주형으로 판단되는 숫자형으로 mean encoding 과 같은 preprocessing 필요함.
# Tail_Number 범주형으로 판단되는 숫자형으로 mean encoding 과 같은 preprocessing 필요함.
# Delay target 변수 one-hot encoding
# day 일 단위로 범주화해버림. - preprocessing 필요함
# EDT, EAT 의 경우, 연속형으로 예측되는 비행 거리를 표시하는 것 같은데 - 이 또한 extimated_Travel_Time 이 반영하는 것 같다.(모델링 과정에서 뺄 수 있을 것 같음)
# Holidays - 휴일인 경우 indexing 해버린 것 같음 - 일별로 범주화 했는데 의미있나?
# Delay_mean 은 Origin_Airport_ID 와 Destination_Airport_ID 가 매치되었을 때 나타나는 특징의 proxy 임. 


# todo
# Carrier_ID(DOT) 범주형으로 판단되는 숫자형으로 mean encoding 과 같은 preprocessing 필요함.
# Tail_Number 범주형으로 판단되는 숫자형으로 mean encoding 과 같은 preprocessing 필요함.
# Delay target 변수 one-hot encoding
# day 일 단위로 범주화해버림. - preprocessing 필요함
# Holidays - 휴일인 경우 indexing 해버린 것 같음 - 일별로 범주화 했는데 의미있나?

# 일단은 todo 에 있는 모든 칼럼들을 이용해서 catboost encoding 적용

<class 'pandas.core.frame.DataFrame'>
Int64Index: 944209 entries, 0 to 944208
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      944209 non-null  object 
 1   Origin_Airport_ID       944209 non-null  int64  
 2   Destination_Airport_ID  944209 non-null  int64  
 3   Distance                944209 non-null  int64  
 4   Carrier_ID(DOT)         944209 non-null  int64  
 5   Tail_Number             944209 non-null  object 
 6   Delay                   242171 non-null  object 
 7   Day                     944209 non-null  object 
 8   EDT                     944209 non-null  int64  
 9   EAT                     944209 non-null  int64  
 10  Estimated_Travel_Time   944209 non-null  float64
 11  Holidays                944209 non-null  object 
 12  Delay_mean              944209 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 100.9+ MB


In [84]:

train_8 = train_8.replace("Delayed", 1)
train_8 = train_8.replace("Not_Delayed", 0)

labeled_data = train_8[train_8["Delay"].notnull()]
train_cat = labeled_data[["Carrier_ID(DOT)","Tail_Number","Day","Holidays"]].astype(str)
target_cat = labeled_data[["Delay"]].astype(int)
CBE_encoder = ce.CatBoostEncoder(handle_missing = "return_nan")
train_cbe = CBE_encoder.fit_transform(train_cat, target_cat)
# test_cbe = CBE_encoder.transform(train_cat)

In [91]:
train_cbe.columns

Index(['Carrier_ID(DOT)', 'Tail_Number', 'Day', 'Holidays'], dtype='object')

In [106]:
# 변경한 범주값을 원래 값에 매칭시켜 기존에 존재하던 unlabeled data 에도 매칭시키기 - 1. dictionary 만들기
cat_dict = dict()

for i in train_cbe.index:
    for j in train_cbe.columns:
        cat_dict[train_cat.loc[i,j]] = train_cbe.loc[i,j]

In [None]:
# 변경한 범주값을 원래 값에 매칭시켜 기존에 존재하던 unlabeled data 에도 매칭시키기 - 1. dictionary 를 이용해 unlabeled data 에 적용
for i in range(train_8.shape[0]):
    for j in train_cbe.columns:
        try:
            train_8.loc[i,j] = cat_dict[f"{train_8.loc[i,j]}"]
            
        except KeyError:
            train_8.loc[i,j] = None

In [None]:
train_8.info()

In [103]:
# split train data to train and valid dataset
valid_final = train_8[train_8["Delay"].notnull()].sample(frac = 0.2)
final_df = train_8.drop(valid.index)

4         104
5          20
7         165
9         226
11         12
         ... 
944172    285
944173    123
944178    284
944194    221
944201    365
Name: Day, Length: 242171, dtype: int64

In [None]:
valid_final.astype("float64")
final_df.astype("float64")

ul_df = final_df.loc[final_df.Delay.isnull()].drop(['ID','Delay'], axis=1)
l_df = final_df.loc[final_df.Delay.notnull()].drop(['ID'], axis=1)

le = LabelEncoder()
le.fit(l_df['Delay'])
l_df['Delay'] = le.transform(l_df['Delay'])

l_x = torch.tensor(l_df.drop(['Delay'], axis=1).values).type(torch.float32).to('cuda:0')  # 라벨 데이터 feature
l_y = torch.tensor(l_df['Delay'].values).type(torch.float32).to('cuda:0')                 # 라벨 데이터 target
u_x = torch.tensor(ul_df.values).type(torch.float32).to('cuda:0')                         # un 라벨 데이터

print(le.classes_)

# cuda form 으로 바꿔줌

In [None]:
class BaseModel(nn.Module):
    def __init__(self, d_columns, d_model=128, dropout=0.2):
        super().__init__()

        self.ff_activation = nn.ReLU()                 # Relu 활성화 함수
        self.ff_batchnorm = nn.BatchNorm1d(d_model)    # 배치 정규화

        self.ff_1 = nn.Linear(d_columns, d_model)
        self.ff_2 = nn.Linear(d_model, d_model)
        self.ff_3 = nn.Linear(d_model, d_model)
        self.ff_4 = nn.Linear(d_model, d_model)
        self.ff_5 = nn.Linear(d_model, d_model)        # 다중선형회귀 모형 5개 - 신경망 5겹

        self.classification = nn.Sequential(           # 모듈들을 인수로 받아서 연속적으로 수행하는 method
            nn.Linear(d_model, int(d_model/2)),
            nn.BatchNorm1d(int(d_model/2)),
            nn.Dropout(p=dropout),                     # 0.2 dropout - 신경망 중 일부를 랜덤 삭제 - 일반화 성능 향상
            nn.GELU(),                                 # GELU 활성화 함수?
            nn.Linear(int(d_model/2), int(d_model/4)), # 모델이 128 개 벡터에서 input-64개, output-32개로 감소
            nn.BatchNorm1d(int(d_model/4)),            
            nn.Dropout(p=dropout),
            nn.GELU(),
            nn.Linear(int(d_model/4), 1),              # for binary classification - input-8, output-1으로 
                                                       # 해당 확률값을 이용해서 classification 문제를 풀 수 있게 함
        )

    def forward(self, x):
        x1 = self.ff_1(x)                              
        x1 = self.ff_batchnorm(x1)                     # 위에서 정의한 배치 정규화
        x1 = self.ff_activation(x1)                    # 위에서 정의한 RELU 활성화 함수

        x2 = self.ff_2(x1)
        x2 = self.ff_batchnorm(x2)
        x2 = self.ff_activation(x2)

        x3 = self.ff_3(torch.add(x1,x2))               # 벡터 덧셈 - 첫 번째와 두 번째 레이어의 output 을 더해서 연산 수행
        x3 = self.ff_batchnorm(x3)
        x3 = self.ff_activation(x3)

        x4 = self.ff_4(x3)
        x4 = self.ff_batchnorm(x4)
        x4 = self.ff_activation(x4)

        x5 = self.ff_5(torch.add(x3,x4))               # 위와 마찬가지의 방법 적용
        x5 = self.ff_batchnorm(x5)
        x5 = self.ff_activation(x5)                    # 5개의 layer 통과

        logits = self.classification(x5)               # 분류를 위한 scalar 화
        return F.sigmoid(logits).view(-1)              # sigmoid 함수에 넣고 그 값을 1차원으로 변경

class CustomLoss(nn.Module):                           # custom 한 loss function dacon. 
    def __init__(self, xi):                            # 불균형 데이터셋을 위한 방법 중 하나 - custom 한 log loss function
        super(CustomLoss, self).__init__()
        self.xi = xi

    def forward(self, output, target):
        return torch.mean(-1.0*((1-self.xi)*target*torch.log(output) + self.xi*(1-target)*torch.log(1-output)))

In [None]:
teacher = BaseModel(d_columns=8).to('cuda:0')
student = BaseModel(d_columns=8).to('cuda:0')

t_optimizer = optim.SGD(teacher.parameters(),lr=0.0001, momentum=0.9)           # 최적화는 SGD사용
s_optimizer = optim.SGD(student.parameters(),lr=0.0001, momentum=0.9)

criterion = CustomLoss(xi=0.725)                                                # 경험적으로 xi = 0.725 사용
best_loss = np.inf
patient = 0

early_stop_epoch = 0
tl_loss, sl_loss = [], []
for epochs in tqdm(range(5000)):                                                # 5000 epoch
    teacher.train()                                                             # 처음에 default 값 세팅 (둘다)
    student.train()

    t_optimizer.zero_grad()                                                     # 가중치 업데이트를 위한 기울기 초기화 - SGD 방법
    s_optimizer.zero_grad()

    s_l_pred = student(l_x)
    s_l_loss = criterion(s_l_pred, l_y)

    sl_loss.append(s_l_loss.item())                                             # pseudo label 을 이용한 student model 의 loss 기록(처음에는 default 값 기록)

    t_u_pred = teacher(u_x)                                                     # unlabeled data 를 이용한 pseudo label
    pseudo_y = (t_u_pred >= torch.FloatTensor([0.5]).to('cuda:0')).type(torch.float32)
                                                                                # hard pseudo label - 0.5이상은 1 이하는 0 - 계산량 감소 위함
    s_u_pred = student(u_x)                                                     # unlabeled data 를 이용한 student model 의 예측값
    s_u_loss = criterion(s_u_pred, pseudo_y)                                    # pseudo label 을 이용한 student model 의 loss 기록(처음에는 default 값 기록)
    s_u_loss.backward()                                                         # student model 의 역전파 수행
    s_optimizer.step()                                                          # student model 의 parameter update(teacher model 의 예측값을 이용한)

    s_l_pred_new = student(l_x)                                                 # pseudo label 로 학습한 student model 로 labeled data 예측
    s_l_loss_new = criterion(s_l_pred_new, l_y)                                 # 위의 예측의 loss 값
    change = s_l_loss_new - s_l_loss                                            # pseudo label 을 이용하기 전후의 Loss 값의 차이

    t_l_pred = teacher(l_x)                                                     
    t_l_loss = criterion(t_l_pred, l_y)                                         # teacher model 의 labeled data 에 대한 손실 계산

    tl_loss.append(t_l_loss.item())                                             

    t_mpl_loss = change * criterion(t_u_pred, pseudo_y)                         # unlabel data 에 대한 teacher model 의 손실 계산 - change 값을 점곱

    (t_l_loss + t_mpl_loss).backward()                                          # teacher label loss 와 pseudo label loss를 이용해 역전파 수행
    t_optimizer.step()                                                          # teacher model 의 parameter update

#    if epochs+1 >= 250 and best_loss > s_l_loss.item():
#        best_loss = s_l_loss.item()
#        patient = 0
#    elif epochs+1 >= 250 and best_loss <= s_l_loss.item():
#        patient += 1
#
#    if patient == 20:
#        early_stop_epoch = epochs + 1
#
#        torch.save(teacher.state_dict(), root_path+'/models/teacher/es_ratio_loss_teacher_state_dict.pt')
#        torch.save(student.state_dict(), root_path+'/models/student/es_ratio_loss_student_state_dict.pt')

In [None]:
# valid set 0.2 빼고, 나머지는 labeled, unlabeled 로 바꿔야 함

In [None]:
## 결측치 제거를 한다면 아래 함수 실행 이전에 결측치 제거
train_x, train_y, test_x = final_data_process(train_8, test_8)

Training Prepared.
