In [12]:
import os
os.chdir('/content/drive/MyDrive/yeonjun/공부/RecSys/intro_to_recsys/data')

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from datetime import datetime
from collections import defaultdict

In [14]:
class Config:
    learning_rate = 0.001
    weight_decay = 0.01
    embed_dim = 32
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    epochs = 20
    
config = Config()

In [15]:
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"
]

cat_cols = ['workclass', 'education', 'marital_status', 'occupation', 
           'relationship', 'race', 'gender', 'native_country', ]

cont_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
            'hours_per_week', ]

cross_cols = ['education', 'occupation'] , ['native_country', 'occupation']

target = 'label'

In [16]:
def load_process_data():
    train = pd.read_csv('./adult.data', names=CSV_COLUMNS)
    test = pd.read_csv('./adult.test', names=CSV_COLUMNS).dropna(axis=0).reset_index(drop=True)

    train['label'] = train['income_bracket'].apply(lambda x: ">50K" in x).astype(int)
    train = train.drop(['income_bracket'], axis=1)

    test['label'] = test['income_bracket'].apply(lambda x: ">50K" in x).astype(int)
    test = test.drop(['income_bracket'], axis=1)

    train['is_train'] = 1
    test['is_train'] = 0

    full_df = pd.concat([train, test], axis=0).reset_index(drop=True)
    
    return full_df

In [17]:
def cross_interaction(data):
    new_cols = []
    for a in cross_cols:
        col_name = '-'.join(a)
        data[col_name] = data[a].astype(str).apply(lambda x: ''.join(x), axis=1)
        new_cols.append(col_name)
    
    return new_cols, data

def categorical_encoding(data, cols):
    encoder_record = {}
    for col in cols:
        encoder = LabelEncoder()
        encoder.fit(data[col].values)
        encoder_record[col] = encoder
    
    for col, enc in encoder_record.items():
        data[col] = enc.transform(data[col])
        
    return encoder_record, data

def continuous_encoding(data, cols):
    encoder = StandardScaler()
    encoder.fit(data[cols].values)
    data[cont_cols] = encoder.transform(data[cols].values)
    
    return encoder, data

In [18]:
def get_wide_data(df):
    
    new_cols, df = cross_interaction(df)
    oh_df = pd.get_dummies(df[new_cols + cat_cols])
    cont_encoder, df = continuous_encoding(df, cont_cols)
    
    train_mask = df['is_train'] == 1
    
    temp_df = pd.concat([df[cont_cols + [target]], oh_df], axis=1)
    train_df = temp_df[train_mask]
    test_df = temp_df[~train_mask]
    model_var = [col for col in train_df if col not in [target]]
    
    return train_df, test_df, model_var

def get_deep_data(df):
    new_cols, df = cross_interaction(df)
    cont_encoder, df = continuous_encoding(df, cont_cols)
    cat_encoders, df = categorical_encoding(df, cat_cols+new_cols)
    
    train_mask = df['is_train'] == 1
    
    train_df = df[cat_cols + new_cols + cont_cols + [target]][train_mask]
    test_df = df[cat_cols + new_cols + cont_cols + [target]][~train_mask]
    model_var = [col for col in train_df if col not in [target]]
    
    
    return train_df, test_df, model_var, cat_encoders

In [19]:
class TrainData:
    def __init__(self, data, model_var):
        
        self.label = data[target].values
        self.data = data[model_var].values
            
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, index):
        return {
            'x' : self.data[index],
            'y' : self.label[index]
        }

In [20]:
class WD(nn.Module):
    def __init__(self, cat_encoders, cont_dim, wide_input_dim, deep_input_dim):
        super(WD, self).__init__()
        cat_dim = [len(v.classes_) for k, v in cat_encoders.items()]
        
        # wide part
        self.wide_linear1 = nn.Linear(wide_input_dim, 1)
        
        # deep part
        self.embeddings = nn.ModuleList([nn.Embedding(v, config.embed_dim) for v in cat_dim])
        concat_dim = config.embed_dim * len(cat_dim) + cont_dim
        
        self.deep_linear1 = nn.Linear(concat_dim, 128)
        self.deep_linear2 = nn.Linear(128, 64)
        self.deep_linear3 = nn.Linear(64, 32)
        self.deep_out = nn.Linear(32, 1)
        
        self.logit = nn.Sigmoid()
        
        self.wide_deep_weight = nn.Parameter(torch.FloatTensor([0.5]))
        
    def forward(self, wide, deep):
        # wide part
        wide = self.wide_linear1(wide.float())
        wide = self.logit(wide)
        
        # deep part
        cont_tensor = deep[:, len(self.embeddings):]
        cat_tensor = deep[:, :len(self.embeddings)].long()
        
        cat_embed = [e(cat_tensor[:, i]) for i, e in enumerate(self.embeddings)]
        cat_embed = torch.cat(cat_embed, axis=1)
        deep = torch.cat([cont_tensor, cat_embed], axis=1)
        
        deep = F.relu(self.deep_linear1(deep.float()))
        deep = F.relu(self.deep_linear2(deep))
        deep = F.relu(self.deep_linear3(deep))
        deep = self.deep_out(deep)
        deep = self.logit(deep)
        
        pred = deep * self.wide_deep_weight + wide * (1-self.wide_deep_weight)
        
        return pred

In [21]:
full_df = load_process_data()

train_wide, test_wide, wide_cols = get_wide_data(full_df)
train_deep, test_deep, deep_cols, cat_encoders = get_deep_data(full_df)

tr_wide_dataset = TrainData(train_wide, wide_cols)
tr_deep_dataset = TrainData(train_deep, deep_cols)
tst_wide_dataset = TrainData(test_wide, wide_cols)
tst_deep_dataset = TrainData(test_deep, deep_cols)

tr_wide_loader = DataLoader(tr_wide_dataset, batch_size=128, drop_last=False)
tr_deep_loader = DataLoader(tr_deep_dataset, batch_size=128, drop_last=False)
tst_wide_loader = DataLoader(tst_wide_dataset, batch_size=128, drop_last=False)
tst_deep_loader = DataLoader(tst_deep_dataset, batch_size=128, drop_last=False)

model = WD(cat_encoders, 
           cont_dim=len(cont_cols),
           wide_input_dim=len(wide_cols), 
           deep_input_dim=len(deep_cols),
          )
model.to(config.device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
loss_fn = nn.BCEWithLogitsLoss()

start = datetime.now()
history = defaultdict(list)
for epoch in range(config.epochs):
    model.train()
    losses = []
    for wide, deep in zip(tr_wide_loader, tr_deep_loader):
        wide_x, y = wide['x'].to(config.device), wide['y'].to(config.device, dtype=torch.float)
        deep_x = deep['x'].to(config.device)

        optimizer.zero_grad()
        pred = model(wide_x, deep_x)
        loss = loss_fn(pred, y.unsqueeze(-1))
        loss.backward()
        optimizer.step()
        losses.append(loss.item())    

    losses_val = []
    for wide, deep in zip(tst_wide_loader, tst_deep_loader):
        wide_x, y = wide['x'].to(config.device), wide['y'].to(config.device, dtype=torch.float)
        deep_x = deep['x'].to(config.device)

        with torch.no_grad():
            pred = model(wide_x, deep_x)
            loss = loss_fn(pred, y.unsqueeze(-1))
            losses_val.append(loss.item())

    history['train_losses'].append(np.mean(losses))
    history['valid_losses'].append(np.mean(losses_val))
    
    print(f'EPOCH {epoch+1} : TRAIN LOGLOSS {np.mean(losses)}, TEST LOGLOSS {np.mean(losses_val)}')


EPOCH 1 : TRAIN LOGLOSS 0.7298136061313105, TEST LOGLOSS 0.7030239971354604
EPOCH 2 : TRAIN LOGLOSS 0.6963494349928463, TEST LOGLOSS 0.69267085660249
EPOCH 3 : TRAIN LOGLOSS 0.6887027922798605, TEST LOGLOSS 0.6843316205777228
EPOCH 4 : TRAIN LOGLOSS 0.6650571911942725, TEST LOGLOSS 0.615960257127881
EPOCH 5 : TRAIN LOGLOSS 0.554394904421825, TEST LOGLOSS 0.5092742326669395
EPOCH 6 : TRAIN LOGLOSS 0.4842543701330821, TEST LOGLOSS 0.4618658274412155
EPOCH 7 : TRAIN LOGLOSS 0.4476152765984629, TEST LOGLOSS 0.4329396076500416
EPOCH 8 : TRAIN LOGLOSS 0.42423265985414094, TEST LOGLOSS 0.4142673898022622
EPOCH 9 : TRAIN LOGLOSS 0.4081829720852422, TEST LOGLOSS 0.4004971766844392
EPOCH 10 : TRAIN LOGLOSS 0.3965034656664904, TEST LOGLOSS 0.3896687342785299
EPOCH 11 : TRAIN LOGLOSS 0.38766245386179754, TEST LOGLOSS 0.382590762572363
EPOCH 12 : TRAIN LOGLOSS 0.3807026554556454, TEST LOGLOSS 0.37637490197084844
EPOCH 13 : TRAIN LOGLOSS 0.37488799375646253, TEST LOGLOSS 0.370997273363173
EPOCH 14 :