# Data

["Recipe Reviews and User Feedback Dataset"](https://archive.ics.uci.edu/dataset/911/recipe+reviews+and+user+feedback+dataset)

In [241]:
def download_recipe_data(rootpath):
    import urllib.request
    import os
    rootpath = os.path.abspath(rootpath)
    os.makedirs(rootpath, exist_ok=True)
        
    url = 'https://archive.ics.uci.edu/static/public/911/recipe+reviews+and+user+feedback+dataset.zip'
    filename = url.rsplit('/', 1)[1]
    filepath = os.path.join(rootpath, filename)
    
    if os.path.isfile(filepath):
        print(f'===> Cowardly refusing to download "{filename}" as it already exists...')
        return filepath
    urllib.request.urlretrieve(url, filepath)
    return filepath

def unzip_recipe_data(zippath):
    import zipfile
    extractpath = os.path.abspath(os.path.dirname(zippath))
    with zipfile.ZipFile(zippath, 'r') as zipf:
        zipf.extractall(extractpath)
    # Expecting a single CSV file + a single ZIP file
    files = os.listdir(extractpath)
    assert len(files) == 2
    filename = files[0] if files[0].endswith('csv') else files[1]
    return os.path.join(extractpath, filename)

root_path = './data/'
csv_path = unzip_recipe_data(download_recipe_data(root_path))

===> Cowardly refusing to download "recipe+reviews+and+user+feedback+dataset.zip" as it already exists...


In [357]:
import pandas as pd
import numpy as np

import torch
from torch import nn

In [243]:
df = pd.read_csv(csv_path)
df = df.drop(['user_id', 'user_name', 'Unnamed: 0', 'recipe_code', 'comment_id', 'recipe_number', 'created_at', 'best_score'], axis='columns')

df

Unnamed: 0,recipe_name,user_reputation,reply_count,thumbs_up,thumbs_down,stars,text
0,Creamy White Chili,1,0,0,0,5,"I tweaked it a little, removed onions because ..."
1,Creamy White Chili,50,0,7,0,5,Bush used to have a white chili bean and it ma...
2,Creamy White Chili,10,0,3,0,5,I have a very complicated white chicken chili ...
3,Creamy White Chili,1,2,2,0,0,"In your introduction, you mentioned cream chee..."
4,Creamy White Chili,10,1,7,0,0,Wonderful! I made this for a &#34;Chili/Stew&#...
...,...,...,...,...,...,...,...
18177,Mamaw Emily’s Strawberry Cake,1,0,0,0,5,This Strawberry Cake has been a family favorit...
18178,Mamaw Emily’s Strawberry Cake,1,0,0,0,5,<p>I received endless compliments on this cake...
18179,Mamaw Emily’s Strawberry Cake,1,0,0,0,5,This cake was delicious and so moist! I didn&#...
18180,Mamaw Emily’s Strawberry Cake,1,0,0,0,5,I just made this too. It is wonderful. As fo...


# Data Preparation

In [353]:
import string
import re

def process_text(txt):
    r'''
    Transformations
        1. Lowercase
        2. remove all non-digit and non-letter characters
    '''
    pattern = re.compile(r'[^a-zA-Z0-9]+')
    f1 = lambda s: pattern.sub(' ', s)
    f2 = lambda s: s.strip()
    f3 = lambda s: f'<{s}>'
    if isinstance(txt, pd.Series):
        return txt.fillna('').str\
                  .lower()\
                  .apply(f1)\
                  .apply(f2)
    elif isinstance(txt, (list, tuple, np.ndarray)):
        return np.vectorize(process_txt)(txt)
    elif isinstance(txt, str):
        txt = txt.lower()
        txt = f1(txt)
        txt = f2(txt)
        return txt
    else:
        raise ValueError(f'Cannot process type {type(txt)}...')

def pad_and_batch(tokens, tokenizer, batch_size=32):
    tokens.sort(key=len, reverse=True)
    batches = []
    for idx in range(0, len(tokens), batch_size):
        batch = data[idx:idx+batch_size]
        max_length = len(batch[0])
        padded_batch = [sample + vocabulary.pad_ * (max_length - len(sample) for sample in batch]
        batches.append(torch.tensor(padded_batch))
    return batches        

class Tokenizer:
    r'''Simple tokenization scheme that learns the vocabulary from the text.
    '''
    def __init__(self, word_len, overlap, ):
        self.word_len = word_len
        self.overlap = overlap
        
        self.unk = '<UNK>'
        self.pad = '<PAD>'
        self.start = '<START>'
        self.end = '<END>'
        
        self.tokens = [self.unk, self.pad, self.start, self.end]
        self.vocabulary = {word: idx for idx, word in enumerate(self.tokens)}  # word -> index

        # Indices
        self.unk_ = self.tokens.index(self.unk)
        self.pad_ = self.tokens.index(self.pad)
        self.start_ = self.tokens.index(self.start)
        self.end_ = self.tokens.index(self.end)

    def split(self, txt: str):
        return [txt[idx:idx+self.word_len] for idx in range(1, len(txt)-self.word_len+1, self.word_len - self.overlap)]

    def fit(self, data):
        data = process_text(data)
        if isinstance(data, (list, tuple, np.ndarray, pd.Series)):
            np.vectorize(self.fit)(data)
        elif isinstance(data, str):
            tokens = self.split(data)
            tokens = set(tokens)
            tokens -= self.vocabulary.keys()
            offset = len(self.tokens)
            vocabulary = {s: idx+offset for idx, s in enumerate(tokens)}
            self.vocabulary.update(vocabulary)
            self.tokens += tokens            
        else:
            raise ValueError(f'Cannot process "{type(data)}"...')

    def __call__(self, txt):
        txt = process_text(txt)
        if isinstance(txt, (list, tuple)):
            txt = [self(t) for t in txt]
            return txt
        elif isinstance(txt, (np.ndarray, pd.Series)):
            txt = np.array([self(t) for t in txt], dtype=object)
            return txt
        elif isinstance(txt, str):
            tokens = self.split(txt)
            tokens = [self.start_] + [self.vocabulary.get(tok, self.unk_) for tok in tokens] + [self.end_]
            return tokens
        else:
            raise ValueError(f'Unknown type "{type(txt)}"...')

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, i):
        if isinstance(i, int):
            return self.tokens[i]
        else:
            return self.vocabulary[i]

In [None]:
class PositionalEncoding(nn.Module):
    r'''Positional encoding'''
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, nfeatures, feature_embed_size, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.text_encoder = nn.Embedding(ntoken, ninp)
        self.feature_encoders = nn.ModuleList([
            nn.Embedding(num_embeddings, feature_embed_size) for num_embeddings in nfeatures
        ])
        self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(ninp + feature_embed_size * len(nfeatures), nhead, nhid, dropout), nlayers)
        self.ninp = ninp
        self.feature_embed_size = feature_embed_size
        self.decoder = nn.Linear(ninp + feature_embed_size * len(nfeatures), ntoken)

    def forward(self, text, features):
        text_embedded = self.text_encoder(text) * math.sqrt(self.ninp)
        feature_embeddings = [encoder(features[:, i]) for i, encoder in enumerate(self.feature_encoders)]
        feature_embeddings = torch.cat(feature_embeddings, dim=-1)

        src = torch.cat((text_embedded, feature_embeddings), dim=-1)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output

In [356]:
tokenizer = Tokenizer(word_len=4, overlap=2)
tokenizer.fit(df['text'])
# tokenizer(df['text'])

In [359]:
ntokens = len(tokenizer) # size of vocabulary for text
nfeatures = [num_unique_values1, num_unique_values2, ...] # List of unique values count for each categorical feature
feature_embed_size = 10 # Size of embeddings for each feature
emsize = 200 # embedding dimension for text
nhid = 200 # dimension of the feedforward network in nn.TransformerEncoder
nlayers = 2 # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # number of heads in the multiheadattention models
dropout = 0.2 # dropout value

model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, nfeatures, feature_embed_size, dropout)


NameError: name 'num_unique_values1' is not defined

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn

# Assuming your dataset is a list of (text, features, labels) tuples
train_dataset = ... # Your dataset
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for text, features, labels in train_loader:
        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(text, features)

        # Compute loss
        loss = criterion(output, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}')

    # Validation step...
    # Save model...