In [None]:
!pip uninstall lightgbm -y
!pip install lightgbm --install-option=--gpu

Found existing installation: lightgbm 2.2.3
Uninstalling lightgbm-2.2.3:
  Successfully uninstalled lightgbm-2.2.3
  cmdoptions.check_install_build_global(options)
Collecting lightgbm
  Downloading lightgbm-3.3.1.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 14.1 MB/s 
Skipping wheel build for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
    Running setup.py install for lightgbm ... [?25l[?25hdone
Successfully installed lightgbm-3.3.1


In [None]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
cd '../gdrive/MyDrive/SSAC/3조'

/gdrive/.shortcut-targets-by-id/15_BxZVEQYCdGCGiQ5nexpWPc1cgHVe4w/3조


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from datetime import datetime, date, time
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

pd.options.display.max_info_columns =200
pd.options.display.max_columns = 200
pd.options.display.max_info_rows =999
pd.options.display.max_rows = 999

import warnings
warnings.filterwarnings('ignore')

In [None]:
df1 = pd.read_csv("data/total_data.csv")

In [None]:
# 클릭율
grouped_label = df1.groupby('label').size()
average_ctr = float(grouped_label[1]/grouped_label.sum())
average_ctr

0.03200107653453746

# 평가지표 함수

In [None]:
# 평가지표 함수
def get_rig(train_y, test_y, pred):
    avg_ctr = average_ctr
    prior = log_loss(train_y, [avg_ctr]*len(train_y))

    classifier = log_loss(test_y, pred)

    rig = (prior - classifier) / prior
    return rig

# 학습과정 전처리 함수

In [None]:
# 전처리 함수
def process_missing_values(df):
    df_pre = df.copy()
    for categorical_col in categorical:
        df_pre[categorical_col] = df_pre[categorical_col].astype(str)
        df_pre[categorical_col] = df_pre[categorical_col].fillna('0')
        df_pre[categorical_col] = preprocessing.LabelEncoder().fit_transform(df_pre[categorical_col])

    for continuous_col in continuous:
        df_pre[continuous_col] = df_pre[continuous_col].fillna(0)

    return df_pre

# LGB 모델 학습 함수 정의

In [None]:
# Train_test_split
def split_dataset(df, features):
    train_test_df = df[['label'] + features]
    train, test = train_test_split(train_test_df, test_size = 0.2, random_state=47)

    X_train = train[features]
    y_train = train['label']

    X_test = test[features]
    y_test = test['label']
    return X_train, y_train, X_test, y_test

In [None]:
# 학습
def train_lgb(X_train, y_train):
    model = lgb.LGBMClassifier(n_estimators=50,
        random_state=47,
        learning_rate=0.1,
        num_leaves=127,
        max_depth=15,
        zero_as_missing=True,
        n_jobs=os.cpu_count(),
        objective='binary')

    print('start training')
    model.fit(X_train, y_train)
    return model
    
# 예측
def evaluate_lgb(model, X_test):
    print('predicting')
    pred = model.predict_proba(X_test)[:,1]

    print(f'auc : {roc_auc_score(y_test, pred)}, rig: {get_rig(y_train, y_test, pred)}')

# 학습 피쳐 조정

In [None]:
categorical = [
    'viewer_gender',  
    'content_used',
    'content_cat_1',
    # 'content_cat_2',
    # 'content_cat_3',
    # 'content_b_pay',
    #"content_status",
    'content_delivery_fee']

continuous = [
    'bid_price', 
    'content_price',
    'content_emergency_count',
    'content_comment_count',     
    'content_views',       
    'content_likes',    
    'adv_follower_count',
    'adv_grade',       
    'adv_item_count',         
    'adv_views', 
    'adv_review_count',
    'adv_comment_count',
    'adv_pay_count',
    'adv_parcel_post_count', 
    'adv_transfer_count', 
    # 'adv_chat_count',
    'viewer_age',
    # 'viewer_age_ch',
    'viewer_following_count',
    'viewer_pay_count',
    #"viewer_trans_pay_count", 
    'viewer_transfer_count']
    #'viewer_chat_count']

features = categorical + continuous
df = process_missing_values(df1)

In [None]:
# features만 수정하면 됨
X_train, y_train, X_test, y_test = split_dataset(df, features)

# 파라미터 지정

In [None]:
params = {"boosting_type": ["dart"],
        "n_estimators": [200],
        "learning_rate": [0.1, 0.01],
        "random_state": [47],
        "num_leaves": [31, 63, 127, 255],
        "max_depth": [-1, 10, 15, 20],
        "min_data_in_leaf": [200],
        "objective": ["binary"],
        "device": ["gpu"]}

# GridSearch 및 결과 확인

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid = GridSearchCV(lgb.LGBMClassifier(), params, cv=5)
grid.fit(X_train, y_train)
best = grid.best_estimator_
y_pred = best.predict(X_test)

In [None]:
best

LGBMClassifier(boosting_type='dart', device='gpu', max_depth=20,
               min_data_in_leaf=200, n_estimators=200, num_leaves=255,
               objective='binary', random_state=47)

In [None]:
evaluate_lgb(best, X_test)

predicting
auc : 0.777951250365639, rig: 0.11318623471581411


In [None]:
grid.cv_results_

{'mean_fit_time': array([ 50.3326879 ,  61.01032391,  77.62546968, 101.09838419,
         47.63939996,  61.6473855 ,  80.91424537, 100.23506918,
         48.18628311,  60.52036142,  78.87162266, 103.22829924,
         47.84273462,  62.80924568,  78.7325685 , 102.93349905,
         43.99654026,  54.96774154,  71.94878612,  91.09892764,
         44.23625317,  57.12534661,  72.62133241,  95.63952212,
         42.69388556,  55.61905661,  71.49992085,  89.29080353,
         43.05218329,  55.77378597,  72.33958316,  90.81853843]),
 'mean_score_time': array([0.91794572, 1.07466373, 1.3642745 , 2.04710684, 0.94336653,
        1.12328563, 1.40329247, 2.11960077, 0.92518249, 1.08257189,
        1.39808373, 2.14830484, 0.90934834, 1.10333438, 1.40976124,
        2.15974383, 0.59235373, 0.67210689, 0.83661151, 1.39627609,
        0.56832666, 0.68878284, 0.85044732, 1.51127234, 0.58062162,
        0.68093114, 0.83037577, 1.4454185 , 0.58829761, 0.66438613,
        0.83798885, 1.41380553]),
 'mean_t

In [None]:
# 가장 잘 나왔던 모델(파라미터) 보여줌
best
# 가장 잘 나온 결과 평가
evaluate_lgb(best, X_test)
# 지금까지 내용 저장
pd.DataFrame(grid.cv_results_).to_csv('경로', index=False)