In [1]:
# 필요한 패키지 임포트
import random
import torch
import pandas as pd
import numpy as np
import pandas_profiling

# 랜덤시드 고정
seed = 1
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [2]:
# 데이터 로드
train = pd.read_csv('../input/2022-ai-midterm-p5/train.csv')
test = pd.read_csv('../input/2022-ai-midterm-p5/test.csv')
submit = pd.read_csv('../input/2022-ai-midterm-p5/submit_sample.csv')

In [3]:
# 필요없는 column drop 및 정답 라벨 분리
y = train['target']
test = test.drop(['index', 'enrollee_id', 'city'], axis=1)
x = train.drop(['index', 'enrollee_id', 'target', 'city'], axis=1)

In [4]:
# 데이터 전처리

# x.city = [int(str(n)[5:]) for n in x.city]
# test.city = [int(str(n)[5:]) for n in test.city]

x.gender = [1 if n == 'Female' else 0 for n in x.gender]
test.gender = [1 if n == 'Female' else 0 for n in test.gender]

x.relevent_experience = [1 if n == 'Has relevent experience' else 0 for n in x.relevent_experience]
test.relevent_experience = [1 if n == 'Has relevent experience' else 0 for n in test.relevent_experience]

x.enrolled_university = [1 if n != 'no_enrollment' else 0 for n in x.enrolled_university]
test.enrolled_university = [1 if n != 'no_enrollment' else 0 for n in test.enrolled_university]

x.education_level.fillna('Graduate', inplace=True)
test.education_level.fillna('Graduate', inplace=True)
x.major_discipline.fillna('STEM', inplace=True)
test.major_discipline.fillna('STEM', inplace=True)

x.experience = ['0' if n == '<1' else n for n in x.experience]
test.experience = ['0' if n == '<1' else n for n in test.experience]
x.experience = ['20' if n == '>20' else n for n in x.experience]
test.experience = ['20' if n == '>20' else n for n in test.experience]
x.experience.fillna('20', inplace=True)
test.experience.fillna('20', inplace=True)

x.company_size.fillna('50-99', inplace=True)
test.company_size.fillna('50-99', inplace=True)
x.company_type.fillna('Pvt Ltd', inplace=True)
test.company_type.fillna('Pvt Ltd', inplace=True)

x.last_new_job.fillna('1', inplace=True)
test.last_new_job.fillna('1', inplace=True)


In [5]:
# 범주형 데이터 실수화

from sklearn.preprocessing import LabelEncoder

def label_encode(train, test, columns):
    for column in columns:
        le = LabelEncoder()
        le.fit(pd.concat([train[column], test[column]]).drop_duplicates())
        train[column] = le.transform(train[column])
        test[column] = le.transform(test[column])
        
label_encode(x, test, ['education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job'])

In [6]:
# 데이터 정규화

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x = sc.fit_transform(x)
test = sc.transform(test)

In [7]:
# torch tensor로 옮기기
x_tensor = torch.FloatTensor(x).cuda()
y_tensor = torch.LongTensor(y).cuda()
test_tensor = torch.FloatTensor(test).cuda()

In [8]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x_tensor, y_tensor, test_size=0.1, shuffle=True)

In [9]:
# 모델 설계
layer1 = torch.nn.Linear(x_tensor.shape[1], 8).cuda()
layer2 = torch.nn.Linear(8, 2).cuda()

# 활성화 함수
relu = torch.nn.ReLU()
# Dropout
dropout = torch.nn.Dropout(p=0.3)
# Softmax
softmax = torch.nn.Softmax(dim=1)

# 레이어 init
torch.nn.init.xavier_normal_(layer1.weight)
torch.nn.init.xavier_normal_(layer2.weight)
# torch.nn.init.xavier_normal_(layer3.weight)
# torch.nn.init.xavier_normal_(layer4.weight)
# torch.nn.init.xavier_normal_(layer5.weight)

# 레이어 연결
model = torch.nn.Sequential(
    layer1, torch.nn.Sigmoid(),# dropout,
#     layer2, relu, dropout,
#     layer3, relu, dropout,    
#     layer4, relu, dropout,    
    layer2, softmax
).cuda()

#loss 함수 설정
loss = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [10]:
# 모델 학습
model.train()
epochs = 20000
for epoch in range(epochs):
    optimizer.zero_grad()
    cost = loss(model(train_x), train_y)
    cost.backward()
    optimizer.step()
    
    if epoch % 1000 == 0:
        model.eval()
        print(epoch, cost.item(), loss(model(test_x), test_y))
        model.train()

0 0.8463623523712158 tensor(0.8593, device='cuda:0', grad_fn=<NllLossBackward0>)
1000 0.6932202577590942 tensor(0.6946, device='cuda:0', grad_fn=<NllLossBackward0>)
2000 0.611426591873169 tensor(0.6030, device='cuda:0', grad_fn=<NllLossBackward0>)
3000 0.5805577039718628 tensor(0.5667, device='cuda:0', grad_fn=<NllLossBackward0>)
4000 0.5668907165527344 tensor(0.5499, device='cuda:0', grad_fn=<NllLossBackward0>)
5000 0.5590311884880066 tensor(0.5403, device='cuda:0', grad_fn=<NllLossBackward0>)
6000 0.552862286567688 tensor(0.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
7000 0.5467850565910339 tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward0>)
8000 0.5408726930618286 tensor(0.5214, device='cuda:0', grad_fn=<NllLossBackward0>)
9000 0.5360369682312012 tensor(0.5171, device='cuda:0', grad_fn=<NllLossBackward0>)
10000 0.5327128171920776 tensor(0.5144, device='cuda:0', grad_fn=<NllLossBackward0>)
11000 0.5305232405662537 tensor(0.5128, device='cuda:0', grad_fn=<NllLossBackwar

In [11]:
# 정답 도출
predict = model(test_tensor).cpu().argmax(dim=1)
submit.target = [float(n) for n in predict.detach()]
submit.to_csv('submission.csv', index=False)