In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# FM
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.metrics import BinaryAccuracy
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# GPU 확인
tf.config.list_physical_devices('GPU')

# 자료형 선언
tf.keras.backend.set_floatx('float32')

In [None]:
# 데이터 로드
scaler = MinMaxScaler()
file = load_breast_cancer()
X, Y = file['data'], file['target']
X = scaler.fit_transform(X)

n = X.shape[0]
p = X.shape[1]
k = 10
batch_size = 8
# epochs = 10

In [None]:
class FM(tf.keras.Model):
    def __init__(self):
        super(FM, self).__init__()

        # 모델의 파라미터 정의
        self.w_0 = tf.Variable([0.0])
        self.w = tf.Variable(tf.zeros([p]))
        self.V = tf.Variable(tf.random.normal(shape=(p, k)))

    def call(self, inputs):
        linear_terms = tf.reduce_sum(tf.math.multiply(self.w, inputs), axis=1)

        interactions = 0.5 * tf.reduce_sum(               #reduce_sum: x 배열의 y차원을 축소 or 기준으로 연산을 수행하는 것.
            tf.math.pow(tf.matmul(inputs, self.V), 2)     #matmul: 행렬의 곱셈
            - tf.matmul(tf.math.pow(inputs, 2), tf.math.pow(self.V, 2)), 
            1,
            keepdims=False
        )

        y_hat = tf.math.sigmoid(self.w_0 + linear_terms + interactions)

        return y_hat

In [None]:
# Forward
def train_on_batch(model, optimizer, accuracy, inputs, targets):
    with tf.GradientTape() as tape:   #GradientTape(): 오차 역전파를 자동 미분하여 중간연산 과정을 tape에 기록.
        y_pred = model(inputs)
        loss = tf.keras.losses.binary_crossentropy(from_logits=False,  #loss function = binary_crossentropy (이진 분류기, 딥러닝 손실함수), label들이 독립적.두 개의 클래스 중에서 예측할 때
                                                   y_true=targets,
                                                   y_pred=y_pred)
    
    # loss를 모델의 파라미터로 편미분하여 gradients를 구한다.
    grads = tape.gradient(target=loss, sources=model.trainable_variables)  #미분한 값

    # apply_gradients()를 통해 processed gradients를 적용한다.
    optimizer.apply_gradients(zip(grads, model.trainable_variables))    #parameter에 대응되는 gradient를 묶어 그라디언트 디센트에 반영

    # accuracy: update할 때마다 정확도는 누적되어 계산된다.
    accuracy.update_state(targets, y_pred)

    return loss

In [None]:
# 반복 학습 함수
def train(epochs):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y)

    train_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_train, tf.float32), tf.cast(Y_train, tf.float32))).shuffle(500).batch(8)

    test_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_test, tf.float32), tf.cast(Y_test, tf.float32))).shuffle(200).batch(8)

    model = FM()
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
    accuracy = BinaryAccuracy(threshold=0.5)
    loss_history = []

    for i in range(epochs):
      for x, y in train_ds:
          loss = train_on_batch(model, optimizer, accuracy, x, y)
          loss_history.append(loss)

      if i % 2 == 0:
          print("스텝 {:03d}에서 누적 평균 손실: {:.4f}".format(i, np.mean(loss_history)))
          print("스텝 {:03d}에서 누적 정확도: {:.4f}".format(i, accuracy.result().numpy()))

    test_accuracy = BinaryAccuracy(threshold=0.5)
    for x, y in test_ds:
        y_pred = model(x)
        test_accuracy.update_state(y, y_pred)

    print("테스트 정확도: {:.4f}".format(test_accuracy.result().numpy()))

In [None]:
epochs_num = [10, 50, 100, 150, 200]
for e in epochs_num:
  train(e)

스텝 000에서 누적 평균 손실: 1.2747
스텝 000에서 누적 정확도: 0.4549
스텝 002에서 누적 평균 손실: 0.9412
스텝 002에서 누적 정확도: 0.5361
스텝 004에서 누적 평균 손실: 0.8005
스텝 004에서 누적 정확도: 0.6047
스텝 006에서 누적 평균 손실: 0.7110
스텝 006에서 누적 정확도: 0.6533
스텝 008에서 누적 평균 손실: 0.6463
스텝 008에서 누적 정확도: 0.6947
테스트 정확도: 0.8667
스텝 000에서 누적 평균 손실: 0.5198
스텝 000에서 누적 정확도: 0.7691
스텝 002에서 누적 평균 손실: 0.4683
스텝 002에서 누적 정확도: 0.7971
스텝 004에서 누적 평균 손실: 0.4373
스텝 004에서 누적 정확도: 0.8153
스텝 006에서 누적 평균 손실: 0.4149
스텝 006에서 누적 정확도: 0.8271
스텝 008에서 누적 평균 손실: 0.3964
스텝 008에서 누적 정확도: 0.8377
스텝 010에서 누적 평균 손실: 0.3813
스텝 010에서 누적 정확도: 0.8454
스텝 012에서 누적 평균 손실: 0.3683
스텝 012에서 누적 정확도: 0.8528
스텝 014에서 누적 평균 손실: 0.3571
스텝 014에서 누적 정확도: 0.8588
스텝 016에서 누적 평균 손실: 0.3470
스텝 016에서 누적 정확도: 0.8641
스텝 018에서 누적 평균 손실: 0.3379
스텝 018에서 누적 정확도: 0.8691
스텝 020에서 누적 평균 손실: 0.3296
스텝 020에서 누적 정확도: 0.8733
스텝 022에서 누적 평균 손실: 0.3220
스텝 022에서 누적 정확도: 0.8778
스텝 024에서 누적 평균 손실: 0.3149
스텝 024에서 누적 정확도: 0.8815
스텝 026에서 누적 평균 손실: 0.3084
스텝 026에서 누적 정확도: 0.8850
스텝 028에서 누적 평균 손실: 0.3023
스텝 028에서

# DeepFM model - Cancer Data

In [None]:
import os
os.getcwd()

'/content'

In [2]:
!pip install config

Collecting config
  Downloading config-0.5.1-py2.py3-none-any.whl (20 kB)
Installing collected packages: config
Successfully installed config-0.5.1


In [3]:
# import config
# from preprocess import get_modified_data
# from DeepFM import DeepFM
import numpy as np
import pandas as pd
from time import perf_counter
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import BinaryAccuracy, AUC
from sklearn.datasets import load_breast_cancer

In [4]:
import tensorflow as tf
from tensorflow.keras import layers

tf.keras.backend.set_floatx('float32')

In [None]:
print(tf.__version__)

2.7.0


In [5]:
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 
warnings.filterwarnings('ignore')

In [None]:
file = load_breast_cancer()
file.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [None]:
# config.py
file_nm = load_breast_cancer()
# X = pd.DataFrame(file.data, columns=file.feature_names)
# Y = pd.DataFrame(file.target, columns=['target'])

ALL_FIELDS = file_nm.feature_names
# ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
#        'mean smoothness', 'mean compactness', 'mean concavity',
#        'mean concave points', 'mean symmetry', 'mean fractal dimension',
#        'radius error', 'texture error', 'perimeter error', 'area error',
#        'smoothness error', 'compactness error', 'concavity error',
#        'concave points error', 'symmetry error',
#        'fractal dimension error', 'worst radius', 'worst texture',
#        'worst perimeter', 'worst area', 'worst smoothness',
#        'worst compactness', 'worst concavity', 'worst concave points',
#        'worst symmetry', 'worst fractal dimension']#
# ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
#              'marital-status', 'occupation', 'relationship', 'race',
#              'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'country']
# print(ALL_FIELDS)
CONT_FIELDS = file_nm.feature_names
# ['age', 'fnlwgt', 'education-num',
#                'capital-gain', 'capital-loss', 'hours-per-week']

CAT_FIELDS = list(set(ALL_FIELDS).difference(CONT_FIELDS))
# print(CAT_FIELDS)

# Hyper-parameters for Experiment
NUM_BIN = 10
BATCH_SIZE = 300
EMBEDDING_SIZE = 5  #임베딩: 이산된(discrete) 데이터를 연속된 벡터로 만들어주는 학습된 저차원 표현방법

In [None]:
len(pd.DataFrame(file.data, columns=file.feature_names))

569

## FM model 생성하기

In [None]:
class FM_layer(tf.keras.layers.Layer):
    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(FM_layer, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.num_field = num_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        # Parameters of FM Layer
        # w: capture 1st order interactions
        # V: capture 2nd order interactions
        self.w = tf.Variable(tf.random.normal(shape=[num_feature],        #tf.Variable: 모델링에서 weight나 bias와 같은 변수 값을 초기화하는 훈련가능한 변수, 
                                              mean=0.0, stddev=1.0), name='w')
        self.V = tf.Variable(tf.random.normal(shape=(num_field, embedding_size),
                                              mean=0.0, stddev=0.01), name='V')

    def call(self, inputs):
        x_batch = tf.reshape(inputs, [-1, self.num_feature, 1])
        # Parameter V를 field_index에 맞게 복사하여 num_feature에 맞게 늘림
        embeds = tf.nn.embedding_lookup(params=self.V, ids=self.field_index)

        # Deep Component에서 쓸 Input
        # (batch_size, num_feature, embedding_size)
        new_inputs = tf.math.multiply(x_batch, embeds)

        # (batch_size, )
        linear_terms = tf.reduce_sum(
            tf.math.multiply(self.w, inputs), axis=1, keepdims=False)

        # (batch_size, )
        interactions = 0.5 * tf.subtract(
            tf.square(tf.reduce_sum(new_inputs, [1, 2])),
            tf.reduce_sum(tf.square(new_inputs), [1, 2])
        )

        linear_terms = tf.reshape(linear_terms, [-1, 1])
        interactions = tf.reshape(interactions, [-1, 1])

        y_fm = tf.concat([linear_terms, interactions], 1)

        return y_fm, new_inputs

## 모델 전처리

In [None]:
# Preprocess
import config
from itertools import repeat
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def get_modified_data(X, all_fields, continuous_fields, categorical_fields, is_bin=False):
    field_dict = dict()
    field_index = []
    X_modified = pd.DataFrame()

    for index, col in enumerate(X.columns):
        if col not in all_fields:
            print("{} not included: Check your column list".format(col))
            raise ValueError

        if col in continuous_fields:
            scaler = MinMaxScaler()

            # 연속형 변수 구간화 여부
            if is_bin:
                X_bin = pd.cut(scaler.fit_transform(X[[col]]).reshape(-1, ), NUM_BIN, labels=False)
                X_bin = pd.Series(X_bin).astype('str')

                X_bin_col = pd.get_dummies(X_bin, prefix=col, prefix_sep='-')
                field_dict[index] = list(X_bin_col.columns)
                field_index.extend(repeat(index, X_bin_col.shape[1]))
                X_modified = pd.concat([X_modified, X_bin_col], axis=1)

            else:
                X_cont_col = pd.DataFrame(scaler.fit_transform(X[[col]]), columns=[col])
                field_dict[index] = col
                field_index.append(index)
                X_modified = pd.concat([X_modified, X_cont_col], axis=1)

        if col in categorical_fields:
            X_cat_col = pd.get_dummies(X[col], prefix=col, prefix_sep='-')
            field_dict[index] = list(X_cat_col.columns)
            field_index.extend(repeat(index, X_cat_col.shape[1]))
            X_modified = pd.concat([X_modified, X_cat_col], axis=1)

    print('Data Prepared...')
    print('X shape: {}'.format(X_modified.shape))
    print('# of Feature: {}'.format(len(field_index)))
    print('# of Field: {}'.format(len(field_dict)))

    return field_dict, field_index, X_modified

In [None]:
class DeepFM(tf.keras.Model):

    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(DeepFM, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.num_field = num_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        self.fm_layer = FM_layer(num_feature, num_field, embedding_size, field_index)

        self.layers1 = tf.keras.layers.Dense(units=64, activation='relu')
        self.dropout1 = tf.keras.layers.Dropout(rate=0.2)
        self.layers2 = tf.keras.layers.Dense(units=16, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(rate=0.2)
        self.layers3 = tf.keras.layers.Dense(units=2, activation='relu')

        self.final = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def __repr__(self):
        return "DeepFM Model: #Field: {}, #Feature: {}, ES: {}".format(
            self.num_field, self.num_feature, self.embedding_size)

    def call(self, inputs):
        # 1) FM Component: (num_batch, 2)
        y_fm, new_inputs = self.fm_layer(inputs)

        # retrieve Dense Vectors: (num_batch, num_feature*embedding_size)
        new_inputs = tf.reshape(new_inputs, [-1, self.num_feature*self.embedding_size])

        # 2) Deep Component
        y_deep = self.layers1(new_inputs)
        y_deep = self.dropout1(y_deep)
        y_deep = self.layers2(y_deep)
        y_deep = self.dropout2(y_deep)
        y_deep = self.layers3(y_deep)

        # Concatenation
        y_pred = tf.concat([y_fm, y_deep], 1)
        y_pred = self.final(y_pred)
        y_pred = tf.reshape(y_pred, [-1, ])

        return y_pred

In [None]:
def get_data():
    file = load_breast_cancer() #pd.read_csv('data/adult.data', header=None)
    # scaler = MinMaxScaler()
    X = pd.DataFrame(file.data, columns=file.feature_names)
    # X = scaler.fit_transform(X)
    Y = file.target#.map({' <=50K': 0, ' >50K': 1})

    X.columns = ALL_FIELDS
    field_dict, field_index, X_modified = get_modified_data(X, ALL_FIELDS, CONT_FIELDS, CAT_FIELDS)

    X_train, X_test, Y_train, Y_test = train_test_split(X_modified, Y, test_size=0.2, stratify=Y)

    train_ds = tf.data.Dataset.from_tensor_slices( #tf.data.Dataset.from_tensor_slices 함수: tf.data.Dataset 를 생성하는 함수로 입력된 텐서로부터 slices를 생성.
          (tf.cast(X_train.values, tf.float32), tf.cast(Y_train, tf.float32))
        ).shuffle(500).batch(BATCH_SIZE) #shuffle 함수는 고정된 버퍼 크기로 데이터를 섞는데, 데이터가 완전히 랜덤적으로 뒤섞기 위해서는 입력된 데이터 크기보다 큰 수를 입력해 주셔야 합니다.

    test_ds = tf.data.Dataset.from_tensor_slices(
          (tf.cast(X_test.values, tf.float32), tf.cast(Y_test, tf.float32))
        ).shuffle(200).batch(BATCH_SIZE)

    return train_ds, test_ds, field_dict, field_index

In [None]:
# Batch 단위 학습
def train_on_batch(model, optimizer, acc, auc, inputs, targets):
    with tf.GradientTape() as tape:
        y_pred = model(inputs)
        loss = tf.keras.losses.binary_crossentropy(from_logits=False, y_true=targets, y_pred=y_pred)

    grads = tape.gradient(target=loss, sources=model.trainable_variables)

    # apply_gradients()를 통해 processed gradients를 적용함
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # accuracy & auc
    acc.update_state(targets, y_pred)
    auc.update_state(targets, y_pred)

    return loss

In [None]:
# 반복 학습 함수
def train(epochs):
  train_ds, test_ds, field_dict, field_index = get_data()
  model = DeepFM(embedding_size= EMBEDDING_SIZE, num_feature=len(field_index),
                  num_field=len(field_dict), field_index=field_index)

  optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

  print("Start Training: Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
  
  start = perf_counter()
  for i in range(epochs):
      acc = BinaryAccuracy(threshold=0.5)
      auc = AUC()
      loss_history = []

      for x, y in train_ds:
          loss = train_on_batch(model, optimizer, acc, auc, x, y)
          loss_history.append(loss)

      print("Epoch {}: 누적 Loss: {:.4f}, Acc: {:.4f}, AUC: {:.4f}".format(i, np.mean(loss_history), acc.result().numpy(), auc.result().numpy()))

  test_acc = BinaryAccuracy(threshold=0.5)
  test_auc = AUC()
  for x, y in test_ds:
      y_pred = model(x)
      test_acc.update_state(y, y_pred)
      test_auc.update_state(y,y_pred)

  print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
  print("Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
  print("걸린 시간: {:.3f}".format(perf_counter() - start))
  # model.save_weights('weights/weights-epoch({})-batch({})-embedding({}).h5'.format(epochs, BATCH_SIZE, EMBEDDING_SIZE))

In [None]:
epochs_num = [10, 50, 100, 150, 200]
for e in epochs_num:
  if __name__ == '__main__':
    train(epochs = e)

Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 50, Embedding Size: 5
Epoch 0: 누적 Loss: 0.7839, Acc: 0.2380, AUC: 0.2260
Epoch 1: 누적 Loss: 0.7734, Acc: 0.2380, AUC: 0.2294
Epoch 2: 누적 Loss: 0.7625, Acc: 0.2340, AUC: 0.2280
Epoch 3: 누적 Loss: 0.7538, Acc: 0.2660, AUC: 0.2345
Epoch 4: 누적 Loss: 0.7370, Acc: 0.2700, AUC: 0.2319
Epoch 5: 누적 Loss: 0.7352, Acc: 0.2760, AUC: 0.2334
Epoch 6: 누적 Loss: 0.7277, Acc: 0.2780, AUC: 0.2372
Epoch 7: 누적 Loss: 0.7205, Acc: 0.3540, AUC: 0.2454
Epoch 8: 누적 Loss: 0.7079, Acc: 0.4580, AUC: 0.2533
Epoch 9: 누적 Loss: 0.7039, Acc: 0.4800, AUC: 0.2534
테스트 ACC: 0.6010, AUC: 0.2578
Batch Size: 50, Embedding Size: 5
걸린 시간: 6.564
Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 50, Embedding Size: 5
Epoch 0: 누적 Loss: 0.7069, Acc: 0.6000, AUC: 0.1296
Epoch 1: 누적 Loss: 0.7008, Acc: 0.6180, AUC: 0.1330
Epoch 2: 누적 Loss: 0.6959, Acc: 0.6380, AUC: 0.1316
Epoch 3: 누적 Loss: 0.7001,

In [None]:
epochs_num = [10, 50, 100, 150, 200]
for e in epochs_num:
  if __name__ == '__main__':
    train(epochs = e)

Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 150, Embedding Size: 10
Epoch 0: 누적 Loss: 0.8289, Acc: 0.5717, AUC: 0.3978
Epoch 1: 누적 Loss: 0.9525, Acc: 0.5233, AUC: 0.4040
Epoch 2: 누적 Loss: 0.8154, Acc: 0.5717, AUC: 0.4103
Epoch 3: 누적 Loss: 0.7367, Acc: 0.6200, AUC: 0.4134
Epoch 4: 누적 Loss: 0.7188, Acc: 0.6683, AUC: 0.4200
Epoch 5: 누적 Loss: 0.8827, Acc: 0.4750, AUC: 0.4232
Epoch 6: 누적 Loss: 0.7633, Acc: 0.6200, AUC: 0.4280
Epoch 7: 누적 Loss: 0.6747, Acc: 0.6200, AUC: 0.4326
Epoch 8: 누적 Loss: 0.7131, Acc: 0.6200, AUC: 0.4347
Epoch 9: 누적 Loss: 0.7143, Acc: 0.6200, AUC: 0.4376
테스트 ACC: 0.6316, AUC: 0.3576
Batch Size: 150, Embedding Size: 10
걸린 시간: 3.821
Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 150, Embedding Size: 10
Epoch 0: 누적 Loss: 0.7054, Acc: 0.3617, AUC: 0.2983
Epoch 1: 누적 Loss: 0.7149, Acc: 0.3250, AUC: 0.2959
Epoch 2: 누적 Loss: 0.6986, Acc: 0.3767, AUC: 0.3014
Epoch 3: 누적 Loss: 0

In [None]:
epochs_num = [10, 50, 100, 150, 200]
for e in epochs_num:
  if __name__ == '__main__':
    train(epochs = e)

Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 150, Embedding Size: 5
Epoch 0: 누적 Loss: 0.6728, Acc: 0.4283, AUC: 0.9259
Epoch 1: 누적 Loss: 0.6503, Acc: 0.4783, AUC: 0.9262
Epoch 2: 누적 Loss: 0.6429, Acc: 0.4783, AUC: 0.9274
Epoch 3: 누적 Loss: 0.6944, Acc: 0.3333, AUC: 0.9274
Epoch 4: 누적 Loss: 0.6316, Acc: 0.5267, AUC: 0.9259
Epoch 5: 누적 Loss: 0.6841, Acc: 0.3333, AUC: 0.9275
Epoch 6: 누적 Loss: 0.6800, Acc: 0.3833, AUC: 0.9284
Epoch 7: 누적 Loss: 0.6845, Acc: 0.3383, AUC: 0.9256
Epoch 8: 누적 Loss: 0.6859, Acc: 0.3450, AUC: 0.9270
Epoch 9: 누적 Loss: 0.6823, Acc: 0.3483, AUC: 0.9269
테스트 ACC: 0.4123, AUC: 0.8986
Batch Size: 150, Embedding Size: 5
걸린 시간: 3.703
Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 150, Embedding Size: 5
Epoch 0: 누적 Loss: 1.2229, Acc: 0.5717, AUC: 0.0165
Epoch 1: 누적 Loss: 0.9722, Acc: 0.6683, AUC: 0.0166
Epoch 2: 누적 Loss: 0.8943, Acc: 0.7167, AUC: 0.0164
Epoch 3: 누적 Loss: 1.15

In [None]:
epochs_num = [10, 50, 100, 150, 200]
for e in epochs_num:
  if __name__ == '__main__':
    train(epochs = e)

Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 300, Embedding Size: 5
Epoch 0: 누적 Loss: 0.6957, Acc: 0.5240, AUC: 0.4265
Epoch 1: 누적 Loss: 0.6971, Acc: 0.5180, AUC: 0.4264
Epoch 2: 누적 Loss: 0.6997, Acc: 0.5151, AUC: 0.4274
Epoch 3: 누적 Loss: 0.6976, Acc: 0.5182, AUC: 0.4283
Epoch 4: 누적 Loss: 0.6953, Acc: 0.5214, AUC: 0.4299
Epoch 5: 누적 Loss: 0.6960, Acc: 0.5356, AUC: 0.4307
Epoch 6: 누적 Loss: 0.6948, Acc: 0.5294, AUC: 0.4302
Epoch 7: 누적 Loss: 0.6963, Acc: 0.5310, AUC: 0.4312
Epoch 8: 누적 Loss: 0.6919, Acc: 0.5403, AUC: 0.4322
Epoch 9: 누적 Loss: 0.6908, Acc: 0.5434, AUC: 0.4329
테스트 ACC: 0.5702, AUC: 0.4158
Batch Size: 300, Embedding Size: 5
걸린 시간: 2.172
Data Prepared...
X shape: (569, 30)
# of Feature: 30
# of Field: 30
Start Training: Batch Size: 300, Embedding Size: 5
Epoch 0: 누적 Loss: 0.7574, Acc: 0.3771, AUC: 0.3445
Epoch 1: 누적 Loss: 0.7575, Acc: 0.3849, AUC: 0.3458
Epoch 2: 누적 Loss: 0.7540, Acc: 0.3901, AUC: 0.3456
Epoch 3: 누적 Loss: 0.75

- batch_size, epoch 수를 늘릴수록 정확도, auc 값이 높아짐..
- batch_size를 늘리면 학습시간이 빨라짐.

#Deep FM Model - Boston Housing data

In [None]:
import sklearn
dir(sklearn.datasets)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_california_housing',
 '_covtype',
 '_kddcup99',
 '_lfw',
 '_olivetti_faces',
 '_openml',
 '_rcv1',
 '_samples_generator',
 '_species_distributions',
 '_svmlight_format_fast',
 '_svmlight_format_io',
 '_twenty_newsgroups',
 'clear_data_home',
 'data',
 'descr',
 'dump_svmlight_file',
 'fetch_20newsgroups',
 'fetch_20newsgroups_vectorized',
 'fetch_california_housing',
 'fetch_covtype',
 'fetch_kddcup99',
 'fetch_lfw_pairs',
 'fetch_lfw_people',
 'fetch_olivetti_faces',
 'fetch_openml',
 'fetch_rcv1',
 'fetch_species_distributions',
 'get_data_home',
 'load_boston',
 'load_breast_cancer',
 'load_diabetes',
 'load_digits',
 'load_files',
 'load_iris',
 'load_linnerud',
 'load_sample_image',
 'load_sample_images',
 'load_svmlight_file',
 'load_svmlight_files',
 'load_wine',
 'make_biclusters',
 'make_blobs',
 'make_checkerboard',
 'make_cir

In [6]:
from sklearn.datasets import *
file = load_wine()
print(file.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [None]:
pd.DataFrame(file.data, columns = file.feature_names).head(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0


In [7]:
# config.py
file_nm = load_wine()

ALL_FIELDS = file_nm.feature_names
print(ALL_FIELDS)

CONT_FIELDS = file_nm.feature_names
print(CONT_FIELDS)

CAT_FIELDS = list(set(ALL_FIELDS).difference(CONT_FIELDS))
print(CAT_FIELDS)

# Hyper-parameters for Experiment
NUM_BIN = 10
BATCH_SIZE = 50
EMBEDDING_SIZE = 5  #임베딩: 이산된(discrete) 데이터를 연속된 벡터로 만들어주는 학습된 저차원 표현방법

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
[]


In [8]:
class FM_layer(tf.keras.layers.Layer):
    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(FM_layer, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.num_field = num_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        # Parameters of FM Layer
        # w: capture 1st order interactions
        # V: capture 2nd order interactions
        self.w = tf.Variable(tf.random.normal(shape=[num_feature],        #tf.Variable: 모델링에서 weight나 bias와 같은 변수 값을 초기화하는 훈련가능한 변수, 
                                              mean=0.0, stddev=1.0), name='w')
        self.V = tf.Variable(tf.random.normal(shape=(num_field, embedding_size),
                                              mean=0.0, stddev=0.01), name='V')

    def call(self, inputs):
        x_batch = tf.reshape(inputs, [-1, self.num_feature, 1])
        # Parameter V를 field_index에 맞게 복사하여 num_feature에 맞게 늘림
        embeds = tf.nn.embedding_lookup(params=self.V, ids=self.field_index)

        # Deep Component에서 쓸 Input
        # (batch_size, num_feature, embedding_size)
        new_inputs = tf.math.multiply(x_batch, embeds)

        # (batch_size, )
        linear_terms = tf.reduce_sum(
            tf.math.multiply(self.w, inputs), axis=1, keepdims=False)

        # (batch_size, )
        interactions = 0.5 * tf.subtract(
            tf.square(tf.reduce_sum(new_inputs, [1, 2])),
            tf.reduce_sum(tf.square(new_inputs), [1, 2])
        )

        linear_terms = tf.reshape(linear_terms, [-1, 1])
        interactions = tf.reshape(interactions, [-1, 1])

        y_fm = tf.concat([linear_terms, interactions], 1)

        return y_fm, new_inputs

In [9]:
# Preprocess
import config
from itertools import repeat
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def get_modified_data(X, all_fields, continuous_fields, categorical_fields, is_bin=False):
    field_dict = dict()
    field_index = []
    X_modified = pd.DataFrame()

    for index, col in enumerate(X.columns):
        if col not in all_fields:
            print("{} not included: Check your column list".format(col))
            raise ValueError

        if col in continuous_fields:
            scaler = MinMaxScaler()

            # 연속형 변수 구간화 여부
            if is_bin:
                X_bin = pd.cut(scaler.fit_transform(X[[col]]).reshape(-1, ), NUM_BIN, labels=False)
                X_bin = pd.Series(X_bin).astype('str')

                X_bin_col = pd.get_dummies(X_bin, prefix=col, prefix_sep='-')
                field_dict[index] = list(X_bin_col.columns)
                field_index.extend(repeat(index, X_bin_col.shape[1]))
                X_modified = pd.concat([X_modified, X_bin_col], axis=1)

            else:
                X_cont_col = pd.DataFrame(scaler.fit_transform(X[[col]]), columns=[col])
                field_dict[index] = col
                field_index.append(index)
                X_modified = pd.concat([X_modified, X_cont_col], axis=1)

        if col in categorical_fields:
            X_cat_col = pd.get_dummies(X[col], prefix=col, prefix_sep='-')
            field_dict[index] = list(X_cat_col.columns)
            field_index.extend(repeat(index, X_cat_col.shape[1]))
            X_modified = pd.concat([X_modified, X_cat_col], axis=1)

    print('Data Prepared...')
    print('X shape: {}'.format(X_modified.shape))
    print('# of Feature: {}'.format(len(field_index)))
    print('# of Field: {}'.format(len(field_dict)))

    return field_dict, field_index, X_modified

In [10]:
class DeepFM(tf.keras.Model):

    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(DeepFM, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.num_field = num_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        self.fm_layer = FM_layer(num_feature, num_field, embedding_size, field_index)

        self.layers1 = tf.keras.layers.Dense(units=64, activation='relu')
        self.dropout1 = tf.keras.layers.Dropout(rate=0.2)
        self.layers2 = tf.keras.layers.Dense(units=16, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(rate=0.2)
        self.layers3 = tf.keras.layers.Dense(units=2, activation='relu')

        self.final = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def __repr__(self):
        return "DeepFM Model: #Field: {}, #Feature: {}, ES: {}".format(
            self.num_field, self.num_feature, self.embedding_size)

    def call(self, inputs):
        # 1) FM Component: (num_batch, 2)
        y_fm, new_inputs = self.fm_layer(inputs)

        # retrieve Dense Vectors: (num_batch, num_feature*embedding_size)
        new_inputs = tf.reshape(new_inputs, [-1, self.num_feature*self.embedding_size])

        # 2) Deep Component
        y_deep = self.layers1(new_inputs)
        y_deep = self.dropout1(y_deep)
        y_deep = self.layers2(y_deep)
        y_deep = self.dropout2(y_deep)
        y_deep = self.layers3(y_deep)

        # Concatenation
        y_pred = tf.concat([y_fm, y_deep], 1)
        y_pred = self.final(y_pred)
        y_pred = tf.reshape(y_pred, [-1, ])

        return y_pred

In [11]:
def get_data():
    file = load_wine() 
    X = pd.DataFrame(file.data, columns=file.feature_names)
    Y = file.target

    X.columns = ALL_FIELDS
    field_dict, field_index, X_modified = get_modified_data(X, ALL_FIELDS, CONT_FIELDS, CAT_FIELDS)

    X_train, X_test, Y_train, Y_test = train_test_split(X_modified, Y, test_size=0.2, stratify=Y)

    train_ds = tf.data.Dataset.from_tensor_slices( #tf.data.Dataset.from_tensor_slices 함수: tf.data.Dataset 를 생성하는 함수로 입력된 텐서로부터 slices를 생성.
          (tf.cast(X_train.values, tf.float32), tf.cast(Y_train, tf.float32))
        ).shuffle(500).batch(BATCH_SIZE) #shuffle 함수는 고정된 버퍼 크기로 데이터를 섞는데, 데이터가 완전히 랜덤적으로 뒤섞기 위해서는 입력된 데이터 크기보다 큰 수를 입력해 주셔야 합니다.

    test_ds = tf.data.Dataset.from_tensor_slices(
          (tf.cast(X_test.values, tf.float32), tf.cast(Y_test, tf.float32))
        ).shuffle(200).batch(BATCH_SIZE)

    return train_ds, test_ds, field_dict, field_index

In [12]:
# Batch 단위 학습
def train_on_batch(model, optimizer, acc, auc, inputs, targets):
    with tf.GradientTape() as tape:
        y_pred = model(inputs)
        loss = tf.keras.losses.binary_crossentropy(from_logits=False, y_true=targets, y_pred=y_pred)

    grads = tape.gradient(target=loss, sources=model.trainable_variables)

    # apply_gradients()를 통해 processed gradients를 적용함
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # accuracy & auc
    acc.update_state(targets, y_pred)
    auc.update_state(targets, y_pred)

    return loss

In [13]:
# 반복 학습 함수
def train(epochs):
  train_ds, test_ds, field_dict, field_index = get_data()
  model = DeepFM(embedding_size= EMBEDDING_SIZE, num_feature=len(field_index),
                  num_field=len(field_dict), field_index=field_index)

  optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

  print("Start Training: Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
  
  start = perf_counter()
  for i in range(epochs):
      acc = BinaryAccuracy(threshold=0.5)
      auc = AUC()
      loss_history = []

      for x, y in train_ds:
          loss = train_on_batch(model, optimizer, acc, auc, x, y)
          loss_history.append(loss)

      print("Epoch {}: 누적 Loss: {:.4f}, Acc: {:.4f}, AUC: {:.4f}".format(i, np.mean(loss_history), acc.result().numpy(), auc.result().numpy()))

  test_acc = BinaryAccuracy(threshold=0.5)
  test_auc = AUC()
  for x, y in test_ds:
      y_pred = model(x)
      test_acc.update_state(y, y_pred)
      test_auc.update_state(y,y_pred)

  print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
  print("Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
  print("걸린 시간: {:.3f}".format(perf_counter() - start))
  # model.save_weights('weights/weights-epoch({})-batch({})-embedding({}).h5'.format(epochs, BATCH_SIZE, EMBEDDING_SIZE))

In [None]:
epochs_num = [10, 50, 100, 150, 200]
for e in epochs_num:
  if __name__ == '__main__':
    train(epochs = e)

Data Prepared...
X shape: (178, 13)
# of Feature: 13
# of Field: 13
Start Training: Batch Size: 300, Embedding Size: 5
Epoch 0: 누적 Loss: 2.4570, Acc: 0.3310, AUC: 0.9688
Epoch 1: 누적 Loss: 2.3678, Acc: 0.3310, AUC: 0.9683
Epoch 2: 누적 Loss: 2.2825, Acc: 0.3310, AUC: 0.9654
Epoch 3: 누적 Loss: 2.1996, Acc: 0.3310, AUC: 0.9681
Epoch 4: 누적 Loss: 2.1190, Acc: 0.3310, AUC: 0.9671
Epoch 5: 누적 Loss: 2.0408, Acc: 0.3310, AUC: 0.9682
Epoch 6: 누적 Loss: 1.9649, Acc: 0.3310, AUC: 0.9692
Epoch 7: 누적 Loss: 1.8913, Acc: 0.3310, AUC: 0.9692
Epoch 8: 누적 Loss: 1.8201, Acc: 0.3310, AUC: 0.9681
Epoch 9: 누적 Loss: 1.7513, Acc: 0.3310, AUC: 0.9679
테스트 ACC: 0.3333, AUC: 1.0000
Batch Size: 300, Embedding Size: 5
걸린 시간: 1.483
Data Prepared...
X shape: (178, 13)
# of Feature: 13
# of Field: 13
Start Training: Batch Size: 300, Embedding Size: 5
Epoch 0: 누적 Loss: 0.8313, Acc: 0.1127, AUC: 0.2268
Epoch 1: 누적 Loss: 0.8262, Acc: 0.1056, AUC: 0.2272
Epoch 2: 누적 Loss: 0.8211, Acc: 0.1056, AUC: 0.2277
Epoch 3: 누적 Loss: 0.81

In [None]:
epochs_num = [10, 50, 100, 150, 200]
for e in epochs_num:
  if __name__ == '__main__':
    train(epochs = e)

Data Prepared...
X shape: (178, 13)
# of Feature: 13
# of Field: 13
Start Training: Batch Size: 50, Embedding Size: 5
Epoch 0: 누적 Loss: 1.0573, Acc: 0.3219, AUC: 0.4566
Epoch 1: 누적 Loss: 0.9864, Acc: 0.3337, AUC: 0.4544
Epoch 2: 누적 Loss: 0.9367, Acc: 0.3089, AUC: 0.4599
Epoch 3: 누적 Loss: 0.8680, Acc: 0.2819, AUC: 0.4521
Epoch 4: 누적 Loss: 0.8262, Acc: 0.2543, AUC: 0.4582
Epoch 5: 누적 Loss: 0.7775, Acc: 0.2540, AUC: 0.4516
Epoch 6: 누적 Loss: 0.7325, Acc: 0.2644, AUC: 0.4540
Epoch 7: 누적 Loss: 0.6841, Acc: 0.2448, AUC: 0.4543
Epoch 8: 누적 Loss: 0.6413, Acc: 0.2448, AUC: 0.4549
Epoch 9: 누적 Loss: 0.6037, Acc: 0.2724, AUC: 0.4542
테스트 ACC: 0.2222, AUC: 0.4549
Batch Size: 50, Embedding Size: 5
걸린 시간: 2.774
Data Prepared...
X shape: (178, 13)
# of Feature: 13
# of Field: 13
Start Training: Batch Size: 50, Embedding Size: 5
Epoch 0: 누적 Loss: 1.1277, Acc: 0.4229, AUC: 0.4392
Epoch 1: 누적 Loss: 1.0998, Acc: 0.4241, AUC: 0.4403
Epoch 2: 누적 Loss: 1.0614, Acc: 0.4263, AUC: 0.4331
Epoch 3: 누적 Loss: 1.0357,

- 데이터 수가 적을 수록 batch_size를 작게 해줘야 유의미한 결과 값이 나옴.