In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# helper modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from ds_utils.torch_utils import (
    train_val_test_split,
    Apply, map_idx
)
from ds_utils.train import EarlyStopper, State
from ds_utils.plotter import History
from ds_utils.utils import map

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.feature_extraction import DictVectorizer
import nltk


%matplotlib inline

In [4]:
# --- Column roles -----------------------------------------------------------
TARGET_COLUMN = "Log1pSalary"
text_columns = ["Title", "FullDescription"]
categorical_columns = ["Category", "Company", "LocationNormalized", "ContractType", "ContractTime"]

# --- Load the raw table and derive the regression target --------------------
data = pd.read_csv("./Train_rev1.zip", compression='zip', index_col=None)
# The target is log(1 + salary), stored as float32.
data[TARGET_COLUMN] = data['SalaryNormalized'].pipe(np.log1p).astype('float32')

# --- Categorical features ---------------------------------------------------
# Missing categorical values are replaced by the explicit string 'NaN'.
data[categorical_columns] = data[categorical_columns].fillna('NaN')

# --- Text features ----------------------------------------------------------
tokenizer = nltk.tokenize.WordPunctTokenizer()

# Each text column is cast to str, lower-cased, tokenized, and re-joined with
# single spaces, so that later code can recover the tokens with str.split().
# NOTE: astype(str) turns a missing value into the literal text 'nan'.
for col in text_columns:
    data[col] = (
        data[col]
        .astype(str)
        .str.lower()
        .apply(lambda text: ' '.join(tokenizer.tokenize(text)))
    )


In [None]:
# NOTE(review): leftover scratch cell. This bare expression only evaluates to
# the `torch.load` function object (shown as the cell output); it is never
# called, so nothing is loaded here.
torch.load

In [None]:
class SalaryPredictor(nn.Module):
    """Three-branch network that regresses a single value per example.

    The batch is a dict with three entries, each handled by its own encoder:

    * ``"title"`` -- integer token ids of shape ``(batch, title_len)``;
      embedded, convolved with kernel size 3 and max-pooled over the token axis.
    * ``"descr"`` -- integer token ids of shape ``(batch, descr_len)``;
      same pipeline with kernel size 5.
    * ``"cat"``   -- float features of shape ``(batch, n_cat_features)``;
      passed through a small MLP.

    The three ``hid_size``-wide encodings are concatenated and fed to an MLP
    head that returns a tensor of shape ``(batch,)``.

    NOTE(review): ``Apply`` comes from ``ds_utils.torch_utils``; it is presumed
    to wrap ``fn`` into a module computing ``fn(x, *args, **kwargs)`` --
    confirm against that helper.

    Args:
        n_tokens: Size of the token vocabulary (number of embedding rows).
        n_cat_features: Width of the ``"cat"`` input. When ``None`` (default)
            it is taken from ``len(categorical_vectorizer.vocabulary_)`` at
            construction time, so the vectorizer only has to exist when a
            model is instantiated, not when this class is defined.
        hid_size: Output width of each of the three encoders.
        encoded_len: Dimensionality of the token embeddings.
    """

    def __init__(self, n_tokens, n_cat_features=None,
                 hid_size=64, encoded_len=64):
        super().__init__()

        if n_cat_features is None:
            # Resolved lazily: a default written in the signature would be
            # evaluated once, when the class statement runs, which fails if
            # the vectorizer is not defined yet and goes stale if it is refit.
            n_cat_features = len(categorical_vectorizer.vocabulary_)

        self.encoded_len = encoded_len
        outp_features = hid_size * 3  # three encoders, hid_size features each

        # Conv1d is used without padding, so the token axis must be at least
        # as long as the kernel (3 for titles, 5 for descriptions).
        self.title_encoder = nn.Sequential(
            nn.Embedding(n_tokens, self.encoded_len),
            # (batch, seq, emb) -> (batch, emb, seq): Conv1d wants channels first
            Apply(torch.permute, (0, 2, 1)),
            nn.Conv1d(self.encoded_len, hid_size, 3),
            nn.AdaptiveMaxPool1d(1),       # max over time -> (batch, hid_size, 1)
            Apply(torch.squeeze, dim=-1),  # -> (batch, hid_size)
        )
        self.description_encoder = nn.Sequential(
            nn.Embedding(n_tokens, self.encoded_len),
            Apply(torch.permute, (0, 2, 1)),
            nn.Conv1d(self.encoded_len, hid_size, 5),
            nn.AdaptiveMaxPool1d(1),
            Apply(torch.squeeze, dim=-1),
        )
        self.cat_encoder = nn.Sequential(
            nn.Linear(n_cat_features, hid_size, bias=False),
            nn.BatchNorm1d(hid_size),
            nn.ReLU(),
            nn.Dropout(.2),
            nn.Linear(hid_size, hid_size, bias=False),
        )
        self.output = nn.Sequential(
            nn.BatchNorm1d(outp_features),
            nn.ReLU(),
            nn.Linear(outp_features, outp_features, bias=False),
            nn.BatchNorm1d(outp_features),
            nn.ReLU(),
            nn.Dropout(.2),
            nn.Linear(outp_features, 1),
            # Drop only the trailing size-1 feature axis. An unqualified
            # squeeze would also remove the batch axis when batch size is 1
            # and return a 0-d tensor instead of shape (1,).
            Apply(torch.squeeze, dim=-1),
        )

    def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
        """Encode the three inputs, concatenate them and apply the head.

        Args:
            batch: Mapping with keys ``"title"``, ``"descr"`` and ``"cat"``
                (see the class docstring for the expected shapes).

        Returns:
            Tensor of shape ``(batch,)`` with one prediction per example.
        """
        title_enc = self.title_encoder(batch["title"])
        descr_enc = self.description_encoder(batch["descr"])
        cat_enc = self.cat_encoder(batch["cat"])
        # Join along the feature axis -> (batch, 3 * hid_size)
        concat = torch.cat([title_enc, descr_enc, cat_enc], dim=1)
        return self.output(concat)

    def predict(self, batch):
        """Return the model output for ``batch``.

        Goes through ``self(...)`` rather than ``self.forward(...)`` so that
        any registered module hooks run. The train/eval mode and gradient
        tracking are left as the caller set them.
        """
        return self(batch)