# Wide&Deep
Using the Wide & Deep learning architecture with PyTorch, via the `pytorch-widedeep` library.

[Docs](https://pytorch-widedeep.readthedocs.io/en/latest/index.html)

![Wide & Deep architecture diagram](https://github.com/jrzaurin/pytorch-widedeep/raw/master/docs/figures/widedeep_arch.png)

In [2]:
import vaex

# Open the Arrow file with vaex, then convert the result with as_numpy().
adult = vaex.open("data/adult.arrow")
df = adult.as_numpy()
# Last expression of the cell: shows the first two rows.
df.head(2)

#,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,target
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,False
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,False


In [4]:
import pandas as pd
import numpy as np
import torch
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy


# vaex's train_test_split takes the *test* fraction as its first argument and
# returns (train, test). An 80/20 train/test split therefore needs
# test_size=0.2; passing 0.8 positionally would leave only 20% of the rows
# for training.
# NOTE(review): the split is positional (no shuffling here) -- confirm the
# dataset on disk is already shuffled.
train, test = df.ml.train_test_split(test_size=0.2)
target_col = "target"

# Target values for the training rows, taken before converting to pandas.
target = train[target_col].values
df_train = train.to_pandas_df()
df_test = test.to_pandas_df()

# Last expression of the cell: shows the first two rows of the full frame.
df.head(2)

#,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,target
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,False
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,False


# Modeling

In [5]:
# --- wide component ---
# Columns fed to the wide part, plus the column pairs to cross.
wide_cols = ["education_num", "relationship", "workclass", "occupation", "country", "sex"]
cross_cols = [
    ("education_num", "occupation"),
    ("country", "occupation"),
]
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=cross_cols)
X_wide = wide_preprocessor.fit_transform(df_train)
# wide_dim is the number of distinct values found in the encoded wide matrix.
n_wide_values = np.unique(X_wide).shape[0]
wide = Wide(wide_dim=n_wide_values, pred_dim=1)

# --- deeptabular component ---
cont_cols = ["age", "hours_per_week"]
# Presumably (column name, embedding size) pairs -- confirm against the
# pytorch-widedeep version in use.
embed_cols = [("education_num", 16), ("workclass", 16), ("occupation", 16), ("country", 32)]
tab_preprocessor = TabPreprocessor(embed_cols=embed_cols, continuous_cols=cont_cols)
X_tab = tab_preprocessor.fit_transform(df_train)
deeptabular = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=cont_cols,
    mlp_hidden_dims=[64, 32],
)

# --- combine both components ---
model = WideDeep(wide=wide, deeptabular=deeptabular)

# --- training ---
# Binary objective with accuracy as the tracked metric; 10% of the supplied
# rows are passed as val_split.
trainer = Trainer(model, objective="binary", metrics=[Accuracy], verbose=False)
trainer.fit(X_wide=X_wide, X_tab=X_tab, target=target, n_epochs=5, batch_size=256, val_split=0.1)

## Add the model predictions as a column

In [6]:
# Columns handed to the registered function, in exactly this order.
features = wide_cols + cont_cols


@vaex.register_function()
def predict(*columns):
    """Return the trained model's predictions for the given feature columns.

    Args:
        *columns: one array-like per entry in ``features``, in the same order.

    Returns:
        Whatever ``trainer.predict`` returns for the transformed inputs.

    Raises:
        ValueError: if the number of columns does not match ``features``.
    """
    if len(columns) != len(features):
        raise ValueError(
            f"predict expected {len(features)} columns, got {len(columns)}"
        )
    # Build the frame one column at a time. Stacking everything through a
    # single np.array(...) forces numeric and string columns into one common
    # dtype, so the preprocessors would see different dtypes than the ones
    # they were fitted on.
    data = pd.DataFrame(
        {name: np.asarray(column) for name, column in zip(features, columns)}
    )
    X_wide_te = wide_preprocessor.transform(data)
    X_tab_te = tab_preprocessor.transform(data)
    pred = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)
    return pred


# Attach the function to this DataFrame and expose its output as a column.
df.add_function('predict', predict)
df['predictions'] = df.func.predict(*features)
df.head(2)

#,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,target,predictions
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,False,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,False,1


# Create pipeline

In [7]:
from goldilox import Pipeline

# Build a goldilox pipeline from the vaex DataFrame, then run inference on
# the pipeline's own `raw` attribute.
pipeline = Pipeline.from_vaex(df)
raw = pipeline.raw
pipeline.inference(raw)



#,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,target,predictions
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,False,0
