In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at the chosen mount point.
GDRIVE_MOUNT_POINT = '/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /gdrive


In [None]:
cd '../gdrive/MyDrive/SSAC/3조'

/gdrive/.shortcut-targets-by-id/15_BxZVEQYCdGCGiQ5nexpWPc1cgHVe4w/3조


In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
from deepctr_torch.callbacks import EarlyStopping, ModelCheckpoint

# 데이터 불러오기

In [None]:
# Load the full dataset; the path is relative to the current working directory.
DATA_CSV = 'data/total_data.csv'
df = pd.read_csv(DATA_CSV)

# 평가함수 정의

In [None]:
from sklearn.metrics import log_loss, roc_auc_score

# Row count per label value, then the share of rows whose label is 1.
grouped_label = df.groupby('label').size()
n_positive = grouped_label[1]
n_total = grouped_label.sum()
avg_ctr = float(n_positive / n_total)

def get_rig(train_y, test_y, pred, avg_ctr):
    """Return the relative information gain (RIG) of ``pred`` over a constant baseline.

    The baseline loss is the log loss obtained by predicting ``avg_ctr`` for
    every entry of ``train_y``; the model loss is the log loss of ``pred``
    against ``test_y``. The result is the fraction of the baseline loss that
    the model removes: ``(baseline - model) / baseline``.

    NOTE(review): the baseline is scored on ``train_y`` while the model is
    scored on ``test_y`` -- confirm that mixing the two label sets is intended.
    """
    baseline_pred = [avg_ctr for _ in range(len(train_y))]
    baseline_loss = log_loss(train_y, baseline_pred)
    model_loss = log_loss(test_y, pred)
    return (baseline_loss - model_loss) / baseline_loss


In [None]:
# Categorical feature columns. Entries kept as comments are currently excluded.
categorical = [
    "viewer_gender",
    "content_used",
    "content_cat_1",
    # "content_cat_2",
    # "content_cat_3",
    "content_status",
    "content_delivery_fee",
    # "content_b_pay",
    "content_place",
]

# Continuous (numeric) feature columns. Entries kept as comments are currently excluded.
continuous = [
    "content_price",
    "adv_item_count",
    "title_len",
    "bid_price",
    "content_emergency_count",
    "content_comment_count",
    "content_views",
    # "content_likes",
    "adv_follower_count",
    # "adv_grade",
    "adv_views",
    "adv_review_count",
    "adv_comment_count",
    "adv_pay_count",
    "adv_parcel_post_count",
    "adv_transfer_count",
    # "adv_chat_count",
    "viewer_age",
    "viewer_following_count",
    "viewer_pay_count",
    "viewer_parcel_post_count",
    "viewer_transfer_count",
    "viewer_chat_count",
]

# 전처리 함수 정의

In [None]:
def encoder(df, col, enc):
    """Encode categorical columns on a copy of ``df``.

    Args:
        df: Input DataFrame; it is copied and not modified.
        col: List of column names to encode.
        enc: ``"label"`` to replace each column with the integer codes of a
            per-column ``LabelEncoder``, or ``"onehot"`` to expand the columns
            with ``pd.get_dummies(..., drop_first=True)``.

    Returns:
        Tuple ``(encoded_df, encoded_cols)``. ``encoded_cols`` is a list of the
        columns that hold the encoded features: the original names for label
        encoding, or the generated dummy-column names for one-hot encoding.

    Raises:
        ValueError: If ``enc`` is neither ``"label"`` nor ``"onehot"``.
    """
    df_ = df.copy()

    if enc == "label":
        col_ = list(col)
        for feat in col_:
            # A fresh encoder per column so each column gets its own code space.
            df_[feat] = LabelEncoder().fit_transform(df_[feat])

    elif enc == "onehot":
        # get_dummies drops the source columns and appends the dummy columns at
        # the end, so every column past the untouched ones is newly generated.
        num = len(df_.columns) - len(col)
        df_ = pd.get_dummies(df_, columns=col, drop_first=True)
        col_ = df_.columns[num:].tolist()

    else:
        # Fail loudly instead of hitting an unbound `col_` at the return below.
        raise ValueError(
            "unsupported encoding %r; expected 'label' or 'onehot'" % (enc,)
        )

    return df_, col_

def scaler(df, col, name):
    """Scale the given columns on a copy of ``df``.

    Args:
        df: Input DataFrame; it is copied and not modified.
        col: List of column names to scale.
        name: ``"minmax"`` (``MinMaxScaler`` with range ``(0, 1)``),
            ``"standard"`` (``StandardScaler``) or ``"robust"``
            (``RobustScaler``).

    Returns:
        A copy of ``df`` in which the columns ``col`` are replaced by the
        output of the chosen scaler's ``fit_transform``.

    Raises:
        ValueError: If ``name`` is not one of the supported scaler names.
    """
    # The local is named `transformer` so it does not shadow this function's name.
    if name == "minmax":
        transformer = MinMaxScaler(feature_range=(0, 1))
    elif name == "standard":
        transformer = StandardScaler()
    elif name == "robust":
        transformer = RobustScaler()
    else:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(
            "unsupported scaler %r; expected 'minmax', 'standard' or 'robust'" % (name,)
        )

    df_ = df.copy()
    df_[col] = transformer.fit_transform(df_[col])

    return df_

# 결측치 채우기

def prepare_training(df, categorical, continuous, enc = "label", scale = "standard"):
    """Fill, encode, scale and split the data, and build the model inputs.

    Args:
        df: Source DataFrame. It is copied first, so the caller's frame is
            left untouched.
        categorical: List of categorical column names.
        continuous: List of continuous column names.
        enc: Encoding name forwarded to ``encoder`` (default ``"label"``).
        scale: Scaler name forwarded to ``scaler`` (default ``"standard"``).

    Returns:
        A list ``[train, test, linear_feature_columns, dnn_feature_columns,
        train_model_input, test_model_input]``. ``train``/``test`` are the
        80/20 split of the processed frame, the two feature-column entries are
        the same list of ``SparseFeat``/``DenseFeat`` specs, and the two model
        inputs map each feature name to the matching column of the split.

    NOTE(review): encoding and scaling are fit on the whole frame before the
    split, so test rows influence the fitted statistics.
    """
    # Work on a copy so the fills below do not write into the caller's DataFrame.
    df = df.copy()

    # Fill missing values: '-1' for categorical columns, 0 for continuous ones.
    df[categorical] = df[categorical].fillna('-1')
    df[continuous] = df[continuous].fillna(0)

    # `categorical` is rebound to the column names returned by the encoder.
    df, categorical = encoder(df, categorical, enc=enc)
    df = scaler(df, continuous, name=scale)

    train, test = train_test_split(df, test_size=0.2, random_state=47)#, stratify=df['label'])

    # One SparseFeat per categorical column (vocabulary size = number of distinct
    # values in the full frame) followed by one 1-dimensional DenseFeat per
    # continuous column.
    sparse_columns = [SparseFeat(feat, df[feat].nunique()) for feat in categorical]
    dense_columns = [DenseFeat(feat, 1) for feat in continuous]
    fixlen_feature_columns = sparse_columns + dense_columns

    # For now, feed every feature to the DNN part.
    dnn_feature_columns = fixlen_feature_columns

    # For now, feed every feature to the linear part as well.
    linear_feature_columns = fixlen_feature_columns

    # Keep just the feature names.
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # Map each feature name to the matching column of the train/test split.
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    return [train, test, linear_feature_columns, dnn_feature_columns, train_model_input, test_model_input]

# 학습 및 평가

In [None]:
# Build the train/test splits, feature-column specs and model input dicts.
[train, test, linear_feature_columns, dnn_feature_columns, train_model_input, test_model_input] = prepare_training(df, categorical, continuous)

target = ['label']

# Default to CPU so `device` is always defined; switch to the GPU only when one is available.
device = 'cpu'
use_gpu = True
if use_gpu and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

params = {"dnn_hidden_units" : (512, 256), "dnn_dropout" : 0.0, "dnn_activation" : 'relu', "dnn_use_bn" : False, "task" :'binary'}
# The hyperparameters must be unpacked as keyword arguments. Passed positionally,
# the dict binds to DeepFM's third parameter (`use_fm` in deepctr_torch) and the
# settings above are silently ignored in favour of the library defaults.
model = DeepFM(linear_feature_columns, dnn_feature_columns, device=device, **params)

# Configure the optimizer, the loss and the metrics to track.
model.compile("adam", "binary_crossentropy", metrics=["binary_crossentropy", "auc"], )


# Train the model; stop early when the validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_binary_crossentropy', min_delta=0, verbose=1, patience=5, mode='auto')
hist = model.fit(x = train_model_input, y = train[target].values, batch_size=512, epochs=50, verbose=2, validation_split=0.2, callbacks=[es])

# Score the held-out test split.
pred_ans = model.predict(test_model_input, 512)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
print("test RIG", round(get_rig(train[target].values, test[target].values, pred_ans, avg_ctr), 4))

cuda ready...
cuda:0
Train on 470844 samples, validate on 117711 samples, 920 steps per epoch
Epoch 1/50
10s - loss:  0.1491 - binary_crossentropy:  0.1491 - auc:  0.6549 - val_binary_crossentropy:  0.1346 - val_auc:  0.6662
Epoch 2/50
10s - loss:  0.1362 - binary_crossentropy:  0.1362 - auc:  0.6749 - val_binary_crossentropy:  0.1332 - val_auc:  0.6751
Epoch 3/50
9s - loss:  0.1358 - binary_crossentropy:  0.1358 - auc:  0.6814 - val_binary_crossentropy:  0.1335 - val_auc:  0.6783
Epoch 4/50
10s - loss:  0.1353 - binary_crossentropy:  0.1353 - auc:  0.6875 - val_binary_crossentropy:  0.1331 - val_auc:  0.6894
Epoch 5/50
10s - loss:  0.1348 - binary_crossentropy:  0.1348 - auc:  0.6934 - val_binary_crossentropy:  0.1320 - val_auc:  0.6924
Epoch 6/50
9s - loss:  0.1345 - binary_crossentropy:  0.1345 - auc:  0.6978 - val_binary_crossentropy:  0.1318 - val_auc:  0.6944
Epoch 7/50
9s - loss:  0.1343 - binary_crossentropy:  0.1343 - auc:  0.7004 - val_binary_crossentropy:  0.1319 - val_auc: 