In [58]:
import pandas as pd

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from deepctr.feature_column import SparseFeat,get_feature_names
from deepctr.models import FLEN, DeepFM

In [97]:
# Load the interaction table from a CSV path relative to the notebook's working directory.
data = pd.read_csv('./dataset/movielens.csv')
# Bare trailing expression so the notebook renders the loaded frame.
data

Unnamed: 0,userId,title,genres,tag,rating,target
0,1,Toy Story (1995),Adventure,toys,3.92,0
1,2,Toy Story (1995),Adventure,toys,3.92,0
2,3,Toy Story (1995),Adventure,toys,3.92,1
3,4,Toy Story (1995),Adventure,toys,3.92,0
4,5,Toy Story (1995),Adventure,toys,3.92,0
...,...,...,...,...,...,...
99995,996,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,0
99996,997,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,1
99997,998,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,0
99998,999,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,0


In [98]:
# Preprocessing setup for label encoding.
# Column holding the prediction label.
target = ['target']
# Columns that are later turned into SparseFeat inputs for the model.
sparse_features = ['userId', 'title', 'genres', 'tag', 'rating']
# String-valued columns that are label-encoded first.
object_features = ['title', 'genres', 'tag']
# Work on a copy so the raw `data` frame stays unchanged.
label_data = data.copy()

In [99]:
# Replace every string column with integer codes, fitting one fresh
# LabelEncoder per column.
for feat in object_features:
    lbe = LabelEncoder()
    encoded_column = lbe.fit_transform(label_data[feat])
    label_data[feat] = encoded_column
    

In [100]:
# Inspect the frame after the string columns have been label-encoded.
label_data

Unnamed: 0,userId,title,genres,tag,rating,target
0,1,92,1,65,3.92,0
1,2,92,1,65,3.92,0
2,3,92,1,65,3.92,1
3,4,92,1,65,3.92,0
4,5,92,1,65,3.92,0
...,...,...,...,...,...,...
99995,996,50,0,67,4.14,0
99996,997,50,0,67,4.14,1
99997,998,50,0,67,4.14,0
99998,999,50,0,67,4.14,0


In [112]:
# Label-encode the rating column into integer codes.
# The column is passed as a 1-D Series: LabelEncoder expects 1-D input, and the
# previous (-1, 1) reshape made sklearn emit a DataConversionWarning.
# A fresh encoder is created here so this cell does not depend on whichever
# `lbe` instance was left over from the loop over object_features.
lbe = LabelEncoder()
label_data['rating'] = lbe.fit_transform(label_data['rating'])

  y = column_or_1d(y, warn=True)


In [113]:
# Inspect the frame after the rating column has been label-encoded.
label_data

Unnamed: 0,userId,title,genres,tag,rating,target
0,1,92,1,65,40,0
1,2,92,1,65,40,0
2,3,92,1,65,40,1
3,4,92,1,65,40,0
4,5,92,1,65,40,0
...,...,...,...,...,...,...
99995,996,50,0,67,55,0
99996,997,50,0,67,55,1
99997,998,50,0,67,55,0
99998,999,50,0,67,55,0


In [124]:

def identify_sparse_dense_features(df, threshold=0.005):
    """Split the columns of ``df`` into sparse and dense groups by cardinality.

    A column counts as sparse when the ratio of its distinct values
    (``Series.nunique()``, which ignores missing values by default) to the
    number of rows is strictly below ``threshold``; every other column counts
    as dense.

    Args:
        df: pandas DataFrame whose columns are classified.
        threshold: Upper bound (exclusive) on the distinct/rows ratio for a
            column to be treated as sparse. Defaults to 0.005, the value that
            was previously hard-coded; tune it to the dataset.

    Returns:
        A tuple ``(sparse, dense)`` of two lists of column names, each in the
        order the columns appear in ``df``.
    """
    total_values_count = len(df)
    if total_values_count == 0:
        # No rows means no ratio can be computed (it would be 0 / 0); report
        # every column as dense rather than raising ZeroDivisionError.
        return [], list(df.columns)

    sparse_columns = []
    dense_columns = []
    for col in df.columns:
        unique_ratio = df[col].nunique() / total_values_count
        # Low distinct/rows ratio -> sparse; otherwise dense.
        if unique_ratio < threshold:
            sparse_columns.append(col)
        else:
            dense_columns.append(col)

    return sparse_columns, dense_columns

# Example DataFrame creation (no example frame is built here; label_data is used below).

# Identify sparse and dense features.
# Note: `sparse_featuress` (double "s") is a separate name from the
# `sparse_features` list defined earlier in the notebook.
sparse_featuress, dense_features = identify_sparse_dense_features(label_data)

print("Sparse Features:", sparse_featuress)
print("Dense Features:", dense_features)


Sparse Features: ['title', 'genres', 'tag', 'rating', 'target']
Dense Features: ['userId']


In [115]:
# Group label assigned to each feature; passed to SparseFeat as group_name.
field_info = {
    'userId': 'Dense',
    'title': 'Dense',
    'genres': 'Sparse',
    'tag': 'Dense',
    'rating': 'Sparse',
}

# One SparseFeat per entry of sparse_features. The vocabulary size is the
# largest encoded value plus one, so every code has its own embedding row.
fixlen_feature_columns = [
    SparseFeat(
        name,
        vocabulary_size=label_data[name].max() + 1,
        embedding_dim=16,
        use_hash=False,
        dtype='int32',
        group_name=field_info[name],
    )
    for name in sparse_features
]


In [116]:
# Display the feature -> group-name mapping.
field_info

{'userId': 'Dense',
 'title': 'Dense',
 'genres': 'Sparse',
 'tag': 'Dense',
 'rating': 'Sparse'}

In [117]:
# The linear and DNN parts both use the same feature-column list.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Names of the model inputs derived from the combined column lists.
all_feature_columns = linear_feature_columns + dnn_feature_columns
feature_names = get_feature_names(all_feature_columns)

In [118]:
# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the split repeatable.
train, test = train_test_split(label_data, test_size=0.2, random_state=2020)

# Per-feature input dicts (feature name -> column) passed to the model.
train_model_input = {column: train[column] for column in feature_names}
test_model_input = {column: test[column] for column in feature_names}

In [142]:
# Build the FLEN model for a binary task from the linear and DNN feature columns.
model = FLEN(linear_feature_columns, dnn_feature_columns, task='binary')

# Adam optimizer with binary cross-entropy as both the loss and the reported metric.
model.compile("adam", "binary_crossentropy",
                metrics=['binary_crossentropy'], )

# Train for 20 epochs in batches of 256; validation_split=0.2 holds back part
# of the training input for the val_* figures printed each epoch.
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=20, verbose=2, validation_split=0.2, )

# Predicted probabilities for the held-out test split.
pred_ans = model.predict(test_model_input, batch_size=256)

# Report test log loss and ROC AUC, rounded to 5 decimal places.
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 5))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 5))

Epoch 1/20
250/250 - 2s - loss: 0.4795 - binary_crossentropy: 0.4794 - val_loss: 0.4453 - val_binary_crossentropy: 0.4451
Epoch 2/20
250/250 - 1s - loss: 0.4347 - binary_crossentropy: 0.4345 - val_loss: 0.4398 - val_binary_crossentropy: 0.4395
Epoch 3/20
250/250 - 1s - loss: 0.4191 - binary_crossentropy: 0.4187 - val_loss: 0.4160 - val_binary_crossentropy: 0.4156
Epoch 4/20
250/250 - 1s - loss: 0.3933 - binary_crossentropy: 0.3929 - val_loss: 0.3970 - val_binary_crossentropy: 0.3965
Epoch 5/20
250/250 - 1s - loss: 0.3751 - binary_crossentropy: 0.3746 - val_loss: 0.3879 - val_binary_crossentropy: 0.3873
Epoch 6/20
250/250 - 1s - loss: 0.3633 - binary_crossentropy: 0.3627 - val_loss: 0.3838 - val_binary_crossentropy: 0.3831
Epoch 7/20
250/250 - 1s - loss: 0.3541 - binary_crossentropy: 0.3533 - val_loss: 0.3811 - val_binary_crossentropy: 0.3803
Epoch 8/20
250/250 - 1s - loss: 0.3453 - binary_crossentropy: 0.3445 - val_loss: 0.3766 - val_binary_crossentropy: 0.3758
Epoch 9/20
250/250 - 1s 

In [147]:
# Inspect the held-out test split.
test

Unnamed: 0,userId,title,genres,tag,rating,target
36082,83,72,6,34,64,0
31787,788,17,0,25,4,0
33964,965,32,9,48,44,0
26097,98,31,4,51,28,0
41101,102,2,1,13,31,0
...,...,...,...,...,...,...
50361,362,70,0,25,31,0
28870,871,53,0,26,8,0
74300,301,27,0,22,32,0
70029,30,89,0,15,39,1


In [148]:
# Convert predicted probabilities to 0 or 1: values strictly above 0.5 become 1.
pred_classes = (pred_ans > 0.5).astype(int)

# Check the result.
pred_classes

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [149]:
# Build the results frame from an explicit copy of the test split.
# pd.DataFrame(test) does not copy by default, so depending on the pandas
# version the new column could be written through to `test` itself.
# (The repeated `import pandas as pd` was dropped; this cell no longer needs it.)
df = test.copy()
# pred_classes is an (n, 1) array; flatten it to 1-D so it maps onto one column.
df['pred_classes'] = pred_classes.ravel()

# Show the results (done in the next cell).

In [151]:
# Show the first 30 rows of the results frame, with target and pred_classes side by side.
df.head(30)

Unnamed: 0,userId,title,genres,tag,rating,target,pred_classes
36082,83,72,6,34,64,0,0
31787,788,17,0,25,4,0,0
33964,965,32,9,48,44,0,0
26097,98,31,4,51,28,0,0
41101,102,2,1,13,31,0,0
83285,286,54,0,15,59,0,0
59935,936,58,1,28,57,0,0
23191,192,1,4,26,3,0,0
96546,547,49,1,27,55,0,0
44808,809,10,0,11,16,1,1
