[논문 링크](https://arxiv.org/pdf/1606.07792.pdf)

## 참고 자료
- Google의 AI Blog([링크](https://ai.googleblog.com/2016/06/wide-deep-learning-better-together-with.html))
- Google의 Tensorflow github([링크](https://github.com/tensorflow/tensorflow/blob/v2.4.0/tensorflow/python/keras/premade/wide_deep.py#L34-L219))
- TensorFlow v2.4 API
  - [tf.keras.experimental.WideDeepModel](https://www.tensorflow.org/api_docs/python/tf/keras/experimental/WideDeepModel?hl=en#methods_2)
  - [tf.estimator.DNNLinearCombinedClassifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier)
- [pytorch-widedeep](https://github.com/jrzaurin/pytorch-widedeep)

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
path = "./movielens_data"

In [3]:
df = pd.read_csv(os.path.join(path,'ratings.csv'))
train_df, val_df = train_test_split(df, test_size=0.2, random_state=1234, shuffle=True)

In [4]:
train_df.shape

(80668, 4)

In [5]:
# 시간문제로 일부만 활용
train_df = train_df[:10000]

In [6]:
# Load all related dataframe
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'))
movies_df = movies_df.set_index('movieId')

In [7]:
print(movies_df.shape)
movies_df.head()

(9742, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [8]:
movies_df.columns

Index(['title', 'genres'], dtype='object')

In [9]:
# 장르 이외에 다양한 피처를 추가 가능
dummy_genres_df = movies_df['genres'].str.get_dummies(sep='|')
train_genres_df = train_df['movieId'].apply(lambda x: dummy_genres_df.loc[x])
train_genres_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
95713,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
61560,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
77204,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
93367,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
90892,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0


In [10]:
len(movies_df['title'])

9742

In [11]:
year_lst = []
cnt = 0
for i in movies_df['title']:
    year = i.split('(')[-1][:-1]
    if len(year) == 4:
        year_lst.append(year)
        cnt+=1
    else:
        year_lst.append(-1)
        cnt+=1

In [12]:
movies_df['year'] = year_lst

In [13]:
train_df['year'] = train_df.apply(lambda x: movies_df.loc[x['movieId']]['year'], axis=1)

In [14]:
train_df = pd.concat([train_df, train_genres_df], axis=1)

In [15]:
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
95713,600,5943,3.0,1237714356,2002,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
61560,407,2571,5.0,1424349171,1999,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
77204,482,8958,4.0,1105397126,2004,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93367,599,2322,2.5,1498515283,1998,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
90892,590,2959,3.5,1258416553,1999,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
wide_cols = list(dummy_genres_df.columns)

In [17]:
wide_cols

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [18]:
len(wide_cols)

20

In [19]:
wide_cols = wide_cols[:3] # 전부 다 할시 시간이 너무 오래 걸림
# wide_cols = ['genre', 'grade']
# cross_cols = [('genre', 'grade')]
wide_cols

['(no genres listed)', 'Action', 'Adventure']

In [20]:
import itertools
from itertools import product  
unique_combinations = list(list(zip(wide_cols, element)) 
                           for element in product(wide_cols, repeat = len(wide_cols))) 

print(unique_combinations)

[[('(no genres listed)', '(no genres listed)'), ('Action', '(no genres listed)'), ('Adventure', '(no genres listed)')], [('(no genres listed)', '(no genres listed)'), ('Action', '(no genres listed)'), ('Adventure', 'Action')], [('(no genres listed)', '(no genres listed)'), ('Action', '(no genres listed)'), ('Adventure', 'Adventure')], [('(no genres listed)', '(no genres listed)'), ('Action', 'Action'), ('Adventure', '(no genres listed)')], [('(no genres listed)', '(no genres listed)'), ('Action', 'Action'), ('Adventure', 'Action')], [('(no genres listed)', '(no genres listed)'), ('Action', 'Action'), ('Adventure', 'Adventure')], [('(no genres listed)', '(no genres listed)'), ('Action', 'Adventure'), ('Adventure', '(no genres listed)')], [('(no genres listed)', '(no genres listed)'), ('Action', 'Adventure'), ('Adventure', 'Action')], [('(no genres listed)', '(no genres listed)'), ('Action', 'Adventure'), ('Adventure', 'Adventure')], [('(no genres listed)', 'Action'), ('Action', '(no gen

In [21]:
cross_cols = [item for sublist in unique_combinations for item in sublist]
cross_cols = [x for x in cross_cols if x[0] != x[1]]
cross_cols = list(set(cross_cols))
print(cross_cols)

[('Adventure', 'Action'), ('Action', 'Adventure'), ('Adventure', '(no genres listed)'), ('Action', '(no genres listed)'), ('(no genres listed)', 'Action'), ('(no genres listed)', 'Adventure')]


In [22]:
# embed_cols = [('genre', 16),('grade', 16)]
embed_cols = list(set([(x[0], 16) for x in cross_cols]))
continuous_cols = ['year']

In [23]:
print(embed_cols)

[('Action', 16), ('(no genres listed)', 16), ('Adventure', 16)]


In [24]:
print(continuous_cols)

['year']


In [25]:
target = train_df['rating'].apply(lambda x: 1 if x > 9 else 0).values

In [26]:
target

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Wide & Deep

In [27]:
from pytorch_widedeep import Trainer

from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy

### Wide Component

In [28]:
preprocess_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=cross_cols)
X_wide = preprocess_wide.fit_transform(train_df)
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

  and should_run_async(code)


In [29]:
X_wide.size

90000

In [30]:
wide

Wide(
  (wide_linear): Embedding(27, 1, padding_idx=0)
)

### Deep Component

In [31]:
preprocess_deep = TabPreprocessor(embed_cols=embed_cols, continuous_cols=continuous_cols)
X_deep = preprocess_deep.fit_transform(train_df)
deeptabular = TabMlp(
    mlp_hidden_dims=[64, 32],
    column_idx=preprocess_deep.column_idx,
    embed_input=preprocess_deep.embeddings_input,
    continuous_cols=continuous_cols,
)

In [32]:
deeptabular

  and should_run_async(code)


TabMlp(
  (embed_layers): ModuleDict(
    (emb_layer_Action): Embedding(3, 16, padding_idx=0)
    (emb_layer_(no genres listed)): Embedding(3, 16, padding_idx=0)
    (emb_layer_Adventure): Embedding(3, 16, padding_idx=0)
  )
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (tab_mlp): MLP(
    (mlp): Sequential(
      (dense_layer_0): Sequential(
        (0): Dropout(p=0.1, inplace=False)
        (1): Linear(in_features=49, out_features=64, bias=True)
        (2): ReLU(inplace=True)
      )
      (dense_layer_1): Sequential(
        (0): Dropout(p=0.1, inplace=False)
        (1): Linear(in_features=64, out_features=32, bias=True)
        (2): ReLU(inplace=True)
      )
    )
  )
)

### Build and Train

In [33]:
# build, compile and fit
model = WideDeep(wide=wide, deeptabular=deeptabular)
trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(
    X_wide=X_wide,
    X_tab=X_deep,
    target=target,
    n_epochs=5,
    batch_size=256,
    val_split=0.1,
)

epoch 1: 100%|██████████████████████████████████████| 36/36 [00:41<00:00,  1.17s/it, loss=0.0202, metrics={'acc': 1.0}]
valid: 100%|████████████████████████████████████████| 4/4 [00:41<00:00, 10.50s/it, loss=0.000739, metrics={'acc': 1.0}]
epoch 2: 100%|████████████████████████████████████| 36/36 [00:42<00:00,  1.19s/it, loss=0.000206, metrics={'acc': 1.0}]
valid: 100%|██████████████████████████████████████████| 4/4 [00:42<00:00, 10.57s/it, loss=3.6e-5, metrics={'acc': 1.0}]
epoch 3: 100%|█████████████████████████████████████| 36/36 [00:41<00:00,  1.16s/it, loss=3.24e-5, metrics={'acc': 1.0}]
valid: 100%|█████████████████████████████████████████| 4/4 [00:40<00:00, 10.00s/it, loss=1.02e-5, metrics={'acc': 1.0}]
epoch 4: 100%|█████████████████████████████████████| 36/36 [00:40<00:00,  1.13s/it, loss=9.78e-6, metrics={'acc': 1.0}]
valid: 100%|█████████████████████████████████████████| 4/4 [00:40<00:00, 10.24s/it, loss=2.69e-6, metrics={'acc': 1.0}]
epoch 5: 100%|██████████████████████████

In [34]:
X_deep.shape

  and should_run_async(code)


(10000, 4)

In [35]:
X_wide.shape

(10000, 9)

In [None]:
# predict
X_wide_te = WidePreprocessor.transform(df_test)
X_tab_te = tab_preprocessor.transform(df_test)
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)