In [28]:
import pandas as pd
import numpy as np
import torch
import typing
import transformers
from transformers import TFAutoModel, AutoTokenizer, BertTokenizer, AutoConfig, BertModel
%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [78]:
# Read the real/fake review table (text columns forced to str) and keep its first 1000 rows.
df = pd.read_csv(
    "data/transformed/df.csv",
    dtype=dict.fromkeys(
        ("product_title", "review_title", "review_text", "product_id"), str
    ),
).iloc[:1000]

# Read the scraped context reviews (text columns forced to str) and keep the first 2000 rows.
reviews_scraped = pd.read_csv(
    "data/transformed/reviews_scraped.csv",
    dtype=dict.fromkeys(("review_title", "review_text", "product_id"), str),
).iloc[:2000]


### Constants

In [88]:
# Fixed token length: ReviewsDataSet pads/truncates every tokenized text to this many tokens.
BERT_TOKENIZER_LENGTH: int = 256
# Width of the BERT hidden state that the models below concatenate and assert against.
BERT_EMBEDDING_SIZE : int = 768
# Use CUDA when torch reports it as available, otherwise the CPU.
# NOTE(review): DEVICE is not referenced anywhere else in the visible notebook — confirm it is still needed.
DEVICE :torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BERT_TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")
# NOTE(review): the keyword below is spelled `output_attention`; the transformers config flag is
# presumably `output_attentions`, in which case this value ends up in the discarded `_` — confirm.
BERT_CONFIG, _ = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, return_unused_kwargs=True)
# Number of scraped context reviews packed into each dataset row.
SCRAPED_REVIEW_LIMIT = 10
# Batch size passed to the DataLoader and used in the models' shape assertions.
BATCH_SIZE=256

### Dataset


In [89]:
from typing import TypedDict
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass


class BertInput(TypedDict):
    """Tokenizer tensors for a single text, as returned by ``BertInputBatch.__getitem__``."""

    # NOTE(review): the shape strings below mention MAX_SEQUENCE_LENGTH_BERT_TOKENIZER;
    # presumably this is the BERT_TOKENIZER_LENGTH constant — confirm.
    attention_mask: torch.Tensor
    """shape: (1,MAX_SEQUENCE_LENGTH_BERT_TOKENIZER)"""
    input_ids: torch.Tensor
    """shape: (1,MAX_SEQUENCE_LENGTH_BERT_TOKENIZER)"""


class BertInputBatch:
    """Attention masks and input ids of several tokenized texts, one text per row."""

    attention_mask: torch.Tensor
    input_ids: torch.Tensor

    def __init__(self, attention_mask: torch.Tensor, input_ids: torch.Tensor) -> None:
        self.attention_mask, self.input_ids = attention_mask, input_ids

    @staticmethod
    def from_batch_encoding(
        batch_encoding: transformers.tokenization_utils_base.BatchEncoding,
    ):
        """Build a batch from the "attention_mask" and "input_ids" entries of an encoding."""
        mask, ids = (
            torch.tensor(batch_encoding[key]) for key in ("attention_mask", "input_ids")
        )
        return BertInputBatch(mask, ids)

    @property
    def shape(self) -> torch.Size:
        """Shape of the ``input_ids`` tensor."""
        return self.input_ids.shape

    def __getitem__(self, i: int) -> BertInput:
        """Return row ``i`` of both tensors as a length-1 slice (leading dimension kept)."""
        row = slice(i, i + 1)
        return {
            "attention_mask": self.attention_mask[row, :],
            "input_ids": self.input_ids[row, :],
        }


# class ReviewDataSetItemInput(TypedDict):
#     """one fake/real review with X context reviews from the same product"""

#     product_title_bert_input: BertInput
#     review_title_bert_input: BertInput
#     review_text_bert_input: BertInput

#     review_features: torch.Tensor
#     """shape: (4)    Rating [1-5; 1] + Verified Purchase [0-1; 1] + SA Valence + Subjectivity [0.0-1.0; 2]"""

#     product_features: torch.Tensor
#     """shape: (37)    Ratings (1-5) Ratio [0.0-1.0]; 5] + Rating Avg [f64; 1] + Rating Count [u64; 1] + Category [u64; 30]"""

#     scraped_review_features: torch.Tensor
#     """shape: (X, 4)    Rating [1-5; 1] + Verified Purchase [0-1; 1] + SA Valence + Subjectivity [0.0-1.0; 2]"""

#     scraped_helpful: torch.Tensor
#     """shape: (X, 1)    Helpfulness [u64; 1]"""

#     scraped_review_texts: BertInputBatch

#     scraped_review_titles: BertInputBatch

# Length of one flat row produced by ReviewsDataSet.__getitem__.
PACKED_DATASET_ROW_SIZE = (
    37  # product features
    + 4  # review features
    + 6 * BERT_TOKENIZER_LENGTH  # mask + ids for product title, review title, review text
    + SCRAPED_REVIEW_LIMIT
    * (1 + 4 + 4 * BERT_TOKENIZER_LENGTH)  # per scraped review: helpful + features + 2 x (mask + ids)
)

class ReviewsDataSet(Dataset):
    """
    Packs one real/fake review, its product information and up to
    SCRAPED_REVIEW_LIMIT scraped context reviews of the same product into a
    single flat float tensor (layout documented in ``__getitem__``).

    Args:
        df: pd.DataFrame of size N with columns:
            - rating (1-5)
            - verified (0-1)
            - product_id (str)
            - review_title (str)
            - review_text (str)
            - cat_0 - cat_29 (0-1)
            - label (0-1)
            - text_sentiment (0.0-1.0)
            - text_subjectivity (0.0-1.0)
            - rating_count (u64)
            - rating_avg (1.0-5.0)
            - rating1 - rating5  (0.0-1.0)
            - product_title (str)

        reviews_scraped:  pd.DataFrame of size M with columns:
            - product_id (str)
            - review_title (str)
            - review_text (str)
            - helpful (u64)
            - verified (0-1)
            - rating (1-5)
            - text_sentiment (0.0-1.0)
            - text_subjectivity (0.0-1.0)
    """

    df: pd.DataFrame
    reviews_scraped: pd.DataFrame

    # BertInputBatch annotations are quoted so they are not evaluated at class-creation time.
    df_product_title_encoded: "BertInputBatch"
    """shape: (N, BERT_TOKENIZER_LENGTH)"""

    df_review_title_encoded: "BertInputBatch"
    """shape: (N, BERT_TOKENIZER_LENGTH)"""

    df_review_text_encoded: "BertInputBatch"
    """shape: (N, BERT_TOKENIZER_LENGTH)"""

    review_feature_vector: torch.Tensor
    """shape: (N, 4)    Rating [1-5; 1] + Verified Purchase [0-1; 1] + SA Valence + Subjectivity [0.0-1.0; 2]"""

    product_feature_vector: torch.Tensor
    """shape: (N, 37)    Rating Count [u64; 1] + Rating Avg [f64; 1] + Ratings (1-5) Ratio [0.0-1.0]; 5] + Category [u64; 30]"""

    label_vector: torch.Tensor
    """shape: (N, 1) """

    scraped_review_feature_vector: torch.Tensor
    """shape: (M, 4)    Rating [1-5; 1] + Verified Purchase [0-1; 1] + SA Valence + Subjectivity [0.0-1.0; 2]"""

    scraped_review_helpful: torch.Tensor
    """shape: (M, 1)    Helpfulness [u64; 1]"""

    scraped_review_title_encoded: "BertInputBatch"
    """shape: (M, BERT_TOKENIZER_LENGTH)"""

    scraped_review_text_encoded: "BertInputBatch"
    """shape: (M, BERT_TOKENIZER_LENGTH)"""

    def __init__(self, df, reviews_scraped):
        """Tokenize all text columns once and convert the numeric columns to float tensors."""
        self.df = df
        self.reviews_scraped = reviews_scraped
        M = len(reviews_scraped)

        print(
            f"creating ReviewsDataSet (real/fake reviews: N={len(df)}, context reviews: M={len(reviews_scraped)}"
        )
        print(f"    bert tokeinzer working...")

        def bert_input_batch_tokenize(
            list_of_strings: list[str],
        ) -> BertInputBatch:
            # NOTE(review): str() stringifies every non-string entry, so a missing value
            # (NaN) would be tokenized as the text "nan" — confirm this is intended.
            list_of_strings = [str(e) for e in list_of_strings]
            return BertInputBatch.from_batch_encoding(
                BERT_TOKENIZER(
                    list_of_strings,
                    max_length=BERT_TOKENIZER_LENGTH,
                    # replaces the deprecated pad_to_max_length=True
                    padding="max_length",
                    truncation=True,
                    return_token_type_ids=False,
                )
            )

        def float_tensor(frame) -> torch.Tensor:
            # Explicit float32 conversion also copes with mixed int/bool/float columns.
            return torch.tensor(frame.to_numpy(dtype=np.float32))

        print(f"    creating df_product_title_encoded...")
        self.df_product_title_encoded = bert_input_batch_tokenize(
            df["product_title"].tolist()
        )

        print(f"    creating df_review_title_encoded...")
        self.df_review_title_encoded = bert_input_batch_tokenize(
            df["review_title"].tolist()
        )

        print(f"    creating df_review_text_encoded...")
        self.df_review_text_encoded = bert_input_batch_tokenize(
            df["review_text"].tolist()
        )

        print(f"    creating scraped_review_title_encoded...")
        self.scraped_review_title_encoded = bert_input_batch_tokenize(
            reviews_scraped["review_title"].tolist()
        )

        print(f"    creating scraped_review_text_encoded...")
        self.scraped_review_text_encoded = bert_input_batch_tokenize(
            reviews_scraped["review_text"].tolist()
        )

        review_feature_cols = [
            "rating",
            "verified",
            "text_sentiment",
            "text_subjectivity",
        ]
        self.review_feature_vector = float_tensor(df[review_feature_cols])
        self.label_vector = float_tensor(df[["label"]])
        self.scraped_review_feature_vector = float_tensor(
            reviews_scraped[review_feature_cols]
        )
        self.scraped_review_helpful = float_tensor(reviews_scraped["helpful"]).reshape(
            M, 1
        )

        product_feature_cols = [
            "rating_count",
            "rating_avg",
            "rating1",
            "rating2",
            "rating3",
            "rating4",
            "rating5",
        ] + [f"cat_{i}" for i in range(0, 30)]
        self.product_feature_vector = float_tensor(df[product_feature_cols])

    @staticmethod
    def _pad_or_truncate_rows(tensor: torch.Tensor, num_rows: int) -> torch.Tensor:
        """Return a (num_rows, D) copy of a (K, D) tensor: extra rows dropped, missing rows zero."""
        out = tensor.new_zeros((num_rows, tensor.shape[1]))
        kept = min(tensor.shape[0], num_rows)
        out[:kept] = tensor[:kept]
        return out

    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Build the packed row and label for the review at *position* ``index``.

        Row layout (L = BERT_TOKENIZER_LENGTH, X = SCRAPED_REVIEW_LIMIT):
            | 37 product features | 4 review features
            | product title mask L + ids L | review title mask L + ids L | review text mask L + ids L
            | X * ( 1 helpful | 4 features | text mask L + ids L | title mask L + ids L ) |

        Returns:
            (row, label): row has shape (PACKED_DATASET_ROW_SIZE,), label is ``label_vector[index]``.
        """
        # Positional lookup: all tensors built in __init__ are positional, so the frame
        # must be addressed the same way regardless of its index labels.
        product_id = self.df["product_id"].iloc[index]

        # Row positions (not index labels) of this dataset's own scraped reviews
        # that belong to the same product.
        same_product = (self.reviews_scraped["product_id"] == product_id).to_numpy()
        positions = torch.from_numpy(np.flatnonzero(same_product)).long()

        def select_rows(tensor: torch.Tensor) -> torch.Tensor:
            """Rows of ``tensor`` for the matching scraped reviews, as float."""
            return torch.index_select(tensor, 0, positions).float()

        scraped_block = torch.cat(
            (
                select_rows(self.scraped_review_helpful),
                select_rows(self.scraped_review_feature_vector),
                select_rows(self.scraped_review_text_encoded.attention_mask),
                select_rows(self.scraped_review_text_encoded.input_ids),
                select_rows(self.scraped_review_title_encoded.attention_mask),
                select_rows(self.scraped_review_title_encoded.input_ids),
            ),
            dim=1,
        )
        assert scraped_block.shape == (
            len(positions),
            1 + 4 + 4 * BERT_TOKENIZER_LENGTH,
        )

        scraped_block_flat = self._pad_or_truncate_rows(
            scraped_block, SCRAPED_REVIEW_LIMIT
        ).reshape(-1)
        assert scraped_block_flat.shape == (
            SCRAPED_REVIEW_LIMIT * (1 + 4 + 4 * BERT_TOKENIZER_LENGTH),
        )

        def mask_and_ids(encoded) -> tuple[torch.Tensor, torch.Tensor]:
            """Attention mask and input ids of the text at ``index``, each of shape (L,)."""
            mask = encoded.attention_mask[index].float()
            ids = encoded.input_ids[index].float()
            assert mask.shape == ids.shape == (BERT_TOKENIZER_LENGTH,)
            return mask, ids

        product_features = self.product_feature_vector[index]
        assert product_features.shape == (37,)

        review_features = self.review_feature_vector[index]
        assert review_features.shape == (4,)

        catted = torch.cat(
            (
                product_features,
                review_features,
                *mask_and_ids(self.df_product_title_encoded),
                *mask_and_ids(self.df_review_title_encoded),
                *mask_and_ids(self.df_review_text_encoded),
                scraped_block_flat,
            ),
            dim=0,
        )
        assert catted.shape == (PACKED_DATASET_ROW_SIZE,)

        label = self.label_vector[index]
        return (catted, label)

    def __len__(self) -> int:
        """Number of real/fake reviews (rows of ``df``)."""
        return len(self.df)

#### unpacking functions

In [81]:
def unpack_dataset_items(packed_batch_tensor: torch.Tensor):
    """
    Split a batch of packed dataset rows back into its named parts.

    Expects packed_batch_tensor to be of shape (B, PACKED_DATASET_ROW_SIZE), with
    every row laid out as ReviewsDataSet.__getitem__ packs it
    (L = BERT_TOKENIZER_LENGTH, X = SCRAPED_REVIEW_LIMIT):

        | 37 product features | 4 review features
        | product title mask L + ids L | review title mask L + ids L | review text mask L + ids L
        | X * ( 1 helpful | 4 features | text mask L + ids L | title mask L + ids L ) |

    Returns:
        A 9-tuple: product_features (B, 37), review_features (B, 4), then mask/ids
        pairs of shape (B, L) for product title, review title and review text, and
        finally a 6-tuple for the scraped reviews:
        (helpful (B, X, 1), features (B, X, 4), title mask, title ids, text mask,
        text ids), the last four each of shape (B, X, L).
        All parts are slices/reshapes of the input tensor.
    """
    assert packed_batch_tensor.shape[1] == PACKED_DATASET_ROW_SIZE
    (batch_size, _) = packed_batch_tensor.shape

    per_review_width = 1 + 4 + 4 * BERT_TOKENIZER_LENGTH
    offset: int = 0

    def take(width: int) -> torch.Tensor:
        """Return the next ``width`` columns and advance the read offset."""
        nonlocal offset
        columns = packed_batch_tensor[:, offset : offset + width]
        offset += width
        return columns

    product_features = take(37)
    assert product_features.shape == (batch_size, 37)

    review_features = take(4)
    assert review_features.shape == (batch_size, 4)

    product_title_bert_input_att = take(BERT_TOKENIZER_LENGTH)
    product_title_bert_input_ids = take(BERT_TOKENIZER_LENGTH)
    review_title_bert_input_att = take(BERT_TOKENIZER_LENGTH)
    review_title_bert_input_ids = take(BERT_TOKENIZER_LENGTH)
    review_text_bert_input_att = take(BERT_TOKENIZER_LENGTH)
    review_text_bert_input_ids = take(BERT_TOKENIZER_LENGTH)
    assert (
        product_title_bert_input_att.shape
        == product_title_bert_input_ids.shape
        == review_title_bert_input_att.shape
        == review_title_bert_input_ids.shape
        == review_text_bert_input_att.shape
        == review_text_bert_input_ids.shape
        == (batch_size, BERT_TOKENIZER_LENGTH)
    )

    _scraped_block_flat = take(SCRAPED_REVIEW_LIMIT * per_review_width)
    assert _scraped_block_flat.shape == (
        batch_size,
        SCRAPED_REVIEW_LIMIT * per_review_width,
    )

    scraped_block = torch.reshape(
        _scraped_block_flat,
        (batch_size, SCRAPED_REVIEW_LIMIT, per_review_width),
    )
    assert scraped_block.shape == (
        batch_size,
        SCRAPED_REVIEW_LIMIT,
        per_review_width,
    )

    scraped_helpful = scraped_block[:, :, 0:1]
    assert scraped_helpful.shape == (batch_size, SCRAPED_REVIEW_LIMIT, 1)

    scraped_review_features = scraped_block[:, :, 1:5]
    assert scraped_review_features.shape == (batch_size, SCRAPED_REVIEW_LIMIT, 4)

    # The dataset packs each scraped review as text mask, text ids, title mask,
    # title ids, so the text slices come first. Reading titles first would hand the
    # review texts to the caller as titles and vice versa.
    text_start = 5
    title_start = text_start + 2 * BERT_TOKENIZER_LENGTH
    scraped_review_text_bert_input_att = scraped_block[
        :, :, text_start : text_start + BERT_TOKENIZER_LENGTH
    ]
    scraped_review_text_bert_input_ids = scraped_block[
        :, :, text_start + BERT_TOKENIZER_LENGTH : title_start
    ]
    scraped_review_title_bert_input_att = scraped_block[
        :, :, title_start : title_start + BERT_TOKENIZER_LENGTH
    ]
    scraped_review_title_bert_input_ids = scraped_block[
        :, :, title_start + BERT_TOKENIZER_LENGTH : title_start + 2 * BERT_TOKENIZER_LENGTH
    ]

    return (
        product_features,
        review_features,
        product_title_bert_input_att,
        product_title_bert_input_ids,
        review_title_bert_input_att,
        review_title_bert_input_ids,
        review_text_bert_input_att,
        review_text_bert_input_ids,
        (
            scraped_helpful,
            scraped_review_features,
            scraped_review_title_bert_input_att,
            scraped_review_title_bert_input_ids,
            scraped_review_text_bert_input_att,
            scraped_review_text_bert_input_ids,
        ),
    )

### Model

In [82]:
from collections import OrderedDict
import torch.nn as nn
def create_mlp(layer_sizes: list[int], final_activation: bool = True) -> nn.Sequential:
    """
    Creates an MLP with the given layer_sizes. The first element is the input size, the last one the output size.

    args:
        layer_sizes: [input_dim, h1_dim, h2_dim, ...., out_dim]
        final_activation: when True (default, the previous behaviour) a ReLU follows
            every linear layer including the last one; when False the last linear
            layer is left without activation, e.g. for an output head whose result is
            passed to a sigmoid or a loss that expects unbounded values.
    returns:
        nn.Sequential with children named "hidden_layer_{i}" / "activation_{i}", i starting at 1.
    raises:
        AssertionError: if fewer than two layer sizes are given.
    """
    assert len(layer_sizes) >= 2

    last = len(layer_sizes) - 1
    layers = []
    for i in range(1, len(layer_sizes)):
        layers.append((f"hidden_layer_{i}", nn.Linear(layer_sizes[i - 1], layer_sizes[i])))
        if final_activation or i < last:
            layers.append((f"activation_{i}", nn.ReLU()))
    return nn.Sequential(OrderedDict(layers))

In [83]:
class ReviewEncodingModel(torch.nn.Module):
    """
    Encodes one review (title, text and 4 numeric features) into a vector of size ``outdim``.

    Title and text are each passed through the same BERT encoder; the hidden state of
    the first token of each is concatenated with the numeric features and fed through
    a linear layer followed by ReLU.
    """

    def __init__(
        self,
        freeze: bool = True,
        outdim=500,
        bert_model_name: str = "bert-base-uncased",
    ) -> None:
        """
        args:
            freeze: if True, the BERT parameters do not receive gradients.
            outdim: size of the returned encoding.
            bert_model_name: checkpoint whose pretrained weights are loaded into the encoder.
        """
        super().__init__()
        # BertModel(BERT_CONFIG) would create randomly initialised weights; load the
        # pretrained checkpoint instead, since the encoder is frozen by default.
        self.bert = BertModel.from_pretrained(bert_model_name, config=BERT_CONFIG)
        self.freeze_bert(freeze)
        self.out_mlp = create_mlp([BERT_EMBEDDING_SIZE * 2 + 4, outdim])

    def _first_token_embedding(
        self, attention_mask: torch.Tensor, input_ids: torch.Tensor
    ) -> torch.Tensor:
        """Run BERT and return the last hidden state of the first token, shape (B, BERT_EMBEDDING_SIZE)."""
        # The packed dataset rows are float tensors; cast back to integer indices for BERT.
        hidden = self.bert(
            attention_mask=attention_mask.long(), input_ids=input_ids.long()
        ).last_hidden_state
        return hidden[:, 0, :]

    def forward(self, review_features: torch.Tensor, 
               review_title_bert_input_att: torch.Tensor,
               review_title_bert_input_ids: torch.Tensor,
               review_text_bert_input_att: torch.Tensor,
               review_text_bert_input_ids: torch.Tensor, ) -> torch.Tensor:
        """
        args:
            review_features: shape: (B, 4)
            review_title_bert_input_att: shape: (B, BERT_TOKENIZER_LENGTH)
            review_title_bert_input_ids: shape: (B, BERT_TOKENIZER_LENGTH)
            review_text_bert_input_att: shape: (B, BERT_TOKENIZER_LENGTH)
            review_text_bert_input_ids: shape: (B, BERT_TOKENIZER_LENGTH)
        returns:
            torch.Tensor of shape (B, OUTDIM)

        B is taken from the input, so a final, smaller batch is accepted as well.
        """
        batch_size = review_features.shape[0]

        bert_title_embedding = self._first_token_embedding(
            review_title_bert_input_att, review_title_bert_input_ids
        )
        assert bert_title_embedding.shape == (batch_size, BERT_EMBEDDING_SIZE)

        bert_text_embedding = self._first_token_embedding(
            review_text_bert_input_att, review_text_bert_input_ids
        )
        assert bert_text_embedding.shape == (batch_size, BERT_EMBEDDING_SIZE)

        # concat title embedding, text embedding and numeric features per review:
        catted = torch.cat([bert_title_embedding, bert_text_embedding, review_features], dim=1)

        # apply linear layer and relu:
        return self.out_mlp(catted)

    def freeze_bert(self, freezed: bool) -> None:
        """Enable (freezed=False) or disable (freezed=True) gradients for all BERT parameters."""
        for param in self.bert.parameters():
            param.requires_grad = not freezed


In [92]:
# Output width of the ReviewEncodingModel created inside FakeDetectionModel.
REVIEW_ENCODING_MODEL_OUTDIM = 149
# REVIEW_ENCODING_MODEL_OUTDIM + 1 has to be divisible by 3 for this to work: FakeDetectionModel
# builds its transformer layer with d_model=REVIEW_ENCODING_MODEL_OUTDIM+1 and nhead=3.

class FakeDetectionModel(torch.nn.Module):
    """
    Predicts real vs. fake for a review from a packed dataset row.

    Combines product features, the BERT embedding of the product title, the encoding
    of the review itself and a transformer summary of the scraped context reviews.
    """

    review_encoding_model: ReviewEncodingModel
    scraped_reviews_transformer: torch.nn.TransformerEncoderLayer
    outmlp: nn.Sequential

    def __init__(self):
        super().__init__()
        self.review_encoding_model: ReviewEncodingModel = ReviewEncodingModel(
            freeze=True, outdim=REVIEW_ENCODING_MODEL_OUTDIM
        )
        # d_model must be divisible by nhead (150 / 3).
        self.scraped_reviews_transformer = torch.nn.TransformerEncoderLayer(
            d_model=REVIEW_ENCODING_MODEL_OUTDIM + 1, nhead=3, batch_first=True
        )

        # LAST_FEATURES_DIM: product features + transformer output dimension + review encoded + bert embedding from product title
        LAST_FEATURES_DIM = (
            37
            + (REVIEW_ENCODING_MODEL_OUTDIM + 1)
            + REVIEW_ENCODING_MODEL_OUTDIM
            + BERT_EMBEDDING_SIZE
        )
        # The output layer is a plain Linear: a ReLU directly in front of the sigmoid
        # would clamp the logit to >= 0 and therefore every prediction to >= 0.5.
        self.outmlp = nn.Sequential(
            OrderedDict(
                [
                    ("hidden", create_mlp([LAST_FEATURES_DIM, 30, 20])),
                    ("output", nn.Linear(20, 1)),
                ]
            )
        )

    def forward(self, packed_dataset_rows: torch.Tensor) -> torch.Tensor:
        """
        args:
            packed_dataset_rows: torch.Tensor with shape (B, PACKED_DATASET_ROW_SIZE)
        returns:
            torch.Tensor with shape: (B, 1), values in (0, 1)

        B is taken from the input, so the last, smaller batch of a DataLoader is accepted.
        """
        assert packed_dataset_rows.ndim == 2
        assert packed_dataset_rows.shape[1] == PACKED_DATASET_ROW_SIZE
        batch_size = packed_dataset_rows.shape[0]
        (
            product_features,
            review_features,
            product_title_bert_input_att,
            product_title_bert_input_ids,
            review_title_bert_input_att,
            review_title_bert_input_ids,
            review_text_bert_input_att,
            review_text_bert_input_ids,
            (
                scraped_helpful,
                scraped_review_features,
                scraped_review_title_bert_input_att,
                scraped_review_title_bert_input_ids,
                scraped_review_text_bert_input_att,
                scraped_review_text_bert_input_ids,
            ),
        ) = unpack_dataset_items(packed_dataset_rows)

        ### CREATE REVIEW ENCODING
        review_encoding = self.review_encoding_model(
            review_features,
            review_title_bert_input_att,
            review_title_bert_input_ids,
            review_text_bert_input_att,
            review_text_bert_input_ids,
        )  # type: ignore
        assert review_encoding.shape == (batch_size, REVIEW_ENCODING_MODEL_OUTDIM)

        ### CREATE REVIEW ENCODINGS FOR ALL SCRAPED REVIEWS
        ### THEN COMBINE THEM AND THEIR HELPFULNESS VIA THE TRANSFORMER

        # One (B, REVIEW_ENCODING_MODEL_OUTDIM + 1) token per scraped review slot: the
        # review encoding with its helpfulness appended. Stacking the tokens (instead of
        # writing into a torch.zeros buffer) keeps them on the inputs' device and dtype.
        context_tokens = [
            torch.cat(
                [
                    self.review_encoding_model(
                        scraped_review_features[:, i, :],
                        scraped_review_title_bert_input_att[:, i, :],
                        scraped_review_title_bert_input_ids[:, i, :],
                        scraped_review_text_bert_input_att[:, i, :],
                        scraped_review_text_bert_input_ids[:, i, :],
                    ),
                    scraped_helpful[:, i, :],
                ],
                dim=1,
            )  # type: ignore
            for i in range(SCRAPED_REVIEW_LIMIT)
        ]
        transformer_input = torch.stack(context_tokens, dim=1)
        assert transformer_input.shape == (
            batch_size,
            SCRAPED_REVIEW_LIMIT,
            1 + REVIEW_ENCODING_MODEL_OUTDIM,
        )

        # NOTE(review): the summary is read from the LAST review slot and no padding
        # mask is passed; for products with fewer than SCRAPED_REVIEW_LIMIT scraped
        # reviews that slot presumably holds zero padding — confirm this is intended.
        transformer_output = self.scraped_reviews_transformer(transformer_input)[:, -1, :]  # type: ignore
        assert transformer_output.shape == (batch_size, 1 + REVIEW_ENCODING_MODEL_OUTDIM)

        ### CREATE BERT EMBEDDING FOR PRODUCT TITLE
        bert_product_title_embedding = self.review_encoding_model.bert(
            attention_mask=product_title_bert_input_att.long(),
            input_ids=product_title_bert_input_ids.long(),
        ).last_hidden_state[:, 0, :]  # type: ignore
        assert bert_product_title_embedding.shape == (batch_size, BERT_EMBEDDING_SIZE)

        ### COMBINE PRODUCT INFORMATION (features + title), CONTEXT (other review's encodings and their helpfulness) and REVIEW ENCODING into a single scalar: the real vs. fake prediction
        catted = torch.cat(
            [product_features, bert_product_title_embedding, review_encoding, transformer_output],
            dim=1,
        )
        return torch.sigmoid(self.outmlp(catted))  # type: ignore

### Running

In [90]:

# Tokenize both frames once, then serve the packed rows in batches of BATCH_SIZE.
dataset = ReviewsDataSet(df=df, reviews_scraped=reviews_scraped)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

creating ReviewsDataSet (real/fake reviews: N=1000, context reviews: M=2000
    bert tokeinzer working...
    creating df_product_title_encoded...




    creating df_review_title_encoded...
    creating df_review_text_encoded...
    creating scraped_review_title_encoded...
    creating scraped_review_text_encoded...


In [86]:
# Smoke test: fetch the first batch from the loader and check that it unpacks.
for (input, labels) in dataloader:
    (
        product_features,
        review_features,
        product_title_bert_input_att,
        product_title_bert_input_ids,
        # was a second `review_title_bert_input_ids`, which silently dropped the mask
        review_title_bert_input_att,
        review_title_bert_input_ids,
        review_text_bert_input_att,
        review_text_bert_input_ids,
        scraped_block,
    ) = unpack_dataset_items(input)
    # only the first batch is needed for this check
    break

In [91]:
# Build the detector; this also constructs the BERT encoder inside ReviewEncodingModel.
model = FakeDetectionModel()


In [93]:
# Inference smoke test on the first batch only.
model.eval()
# No gradients are needed for a forward-only check; this avoids building the autograd graph.
with torch.no_grad():
    for (input, labels) in dataloader:
        o = model(input)
        print(o)
        break

bg model
tensor([[  101.,  6179.,   102.,  ...,     0.,     0.,     0.],
        [  101.,  2047.,  3690.,  ...,     0.,     0.,     0.],
        [  101.,  7929.,   102.,  ...,     0.,     0.,     0.],
        ...,
        [  101., 28305.,  2102.,  ...,     0.,     0.,     0.],
        [  101.,  3835.,  3609.,  ...,     0.,     0.,     0.],
        [  101.,  2009.,  1005.,  ...,     0.,     0.,     0.]])


KeyboardInterrupt: 