In [7]:
import pandas as pd
import numpy as np
import torch
import typing
import transformers
from transformers import TFAutoModel, AutoTokenizer, BertTokenizer
%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [8]:
# Force the text and id columns to be parsed as str instead of letting pandas
# infer their dtypes.
df = pd.read_csv(
    "data/transformed/df.csv",
    dtype=dict.fromkeys(
        ("product_title", "review_title", "review_text", "product_id"), str
    ),
)
reviews_scraped = pd.read_csv(
    "data/transformed/reviews_scraped.csv",
    dtype=dict.fromkeys(("review_title", "review_text", "product_id"), str),
)

### Constants


In [20]:
# Length every tokenized text is padded / truncated to.
MAX_SEQUENCE_LENGTH_BERT_TOKENIZER: int = 512
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Shared tokenizer instance used for every text column.
BERT_TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")

### Dataset


In [30]:
# from typing import List
from typing import TypedDict
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass


class BertInput(TypedDict):
    """Tokenized representation of a single text, as a plain dict of tensors."""

    # Mask tensor stored under the "attention_mask" key.
    attention_mask: torch.Tensor
    # Token-id tensor stored under the "input_ids" key.
    input_ids: torch.Tensor


class BertInputBatch:
    """Several tokenized texts held as two parallel tensors.

    Both tensors are indexed as ``[row, :]``, one row per text.
    """

    attention_mask: torch.Tensor
    input_ids: torch.Tensor

    def __init__(self, attention_mask: torch.Tensor, input_ids: torch.Tensor) -> None:
        """Store the given mask and token-id tensors as-is (no copy, no validation)."""
        self.input_ids = input_ids
        self.attention_mask = attention_mask

    @staticmethod
    def from_batch_encoding(
        batch_encoding: transformers.tokenization_utils_base.BatchEncoding,
    ):
        """Build a batch from a tokenizer result.

        Reads the "attention_mask" and "input_ids" entries of ``batch_encoding``
        and converts each one with ``torch.tensor``.
        """
        mask_tensor = torch.tensor(batch_encoding["attention_mask"])
        ids_tensor = torch.tensor(batch_encoding["input_ids"])
        return BertInputBatch(mask_tensor, ids_tensor)

    @property
    def shape(self) -> torch.Size:
        """Shape of the token-id tensor."""
        ids = self.input_ids
        return ids.shape

    def __getitem__(self, i: int) -> BertInput:
        """Return row ``i`` of both tensors as a ``BertInput`` dict."""
        return dict(
            attention_mask=self.attention_mask[i, :],
            input_ids=self.input_ids[i, :],
        )


# NOTE: deliberately NOT decorated with @dataclass. A TypedDict describes the
# keys of a plain dict; calling ReviewDataSetItem(...) returns a dict, so
# dataclass-generated methods would never be used and type checkers reject
# the combination.
class ReviewDataSetItem(TypedDict):
    """one fake/real review with X context reviews from the same product"""

    product_title_bert_input: BertInput
    review_title_bert_input: BertInput
    review_text_bert_input: BertInput

    review_features: torch.Tensor
    """shape: (4)    Rating [1-5; 1] + Verified Purchase [0-1; 1] + SA Valence + Subjectivity [0.0-1.0; 2]"""

    product_features: torch.Tensor
    """shape: (37)    Ratings (1-5) Ratio [0.0-1.0; 5] + Rating Avg [f64; 1] + Rating Count [u64; 1] + Category [u64; 30]"""

    scraped_review_features: torch.Tensor
    """shape: (X, 4)    Rating [1-5; 1] + Verified Purchase [0-1; 1] + SA Valence + Subjectivity [0.0-1.0; 2]"""

    scraped_helpful: torch.Tensor
    """shape: (X, 1)    Helpfulness [u64; 1]"""

    scraped_review_texts: BertInputBatch
    """tokenized texts of the X context reviews"""

    scraped_review_titles: BertInputBatch
    """tokenized titles of the X context reviews"""


class ReviewsDataSet(Dataset):
    """Dataset pairing each labelled review with the context reviews of its product.

    Item ``i`` combines row ``i`` of ``df`` (one real/fake review plus product
    metadata) with every row of ``reviews_scraped`` that has the same
    ``product_id``. All text columns are tokenized once, in ``__init__``, with
    ``BERT_TOKENIZER``; missing texts are tokenized as the empty string.

    NOTE(review): the ``label`` column of ``df`` is not part of the returned
    item yet.

    Args:
        df: pd.DataFrame of size N with columns:
            - rating (1-5)
            - verified (0-1)
            - product_id (str)
            - review_title (str)
            - review_text (str)
            - cat_0 - cat_29 (0-1)
            - label (0-1)
            - text_sentiment (0.0-1.0)
            - text_subjectivity (0.0-1.0)
            - rating_count (u64)
            - rating_avg (1.0-5.0)
            - rating1 - rating5  (0.0-1.0)
            - product_title (str)

        reviews_scraped:  pd.DataFrame of size M with columns:
            - product_id (str)
            - review_title (str)
            - review_text (str)
            - helpful (u64)
            - verified (0-1)
            - rating (1-5)
            - text_sentiment (0.0-1.0)
            - text_subjectivity (0.0-1.0)
    """

    df: pd.DataFrame
    reviews_scraped: pd.DataFrame

    df_product_title_encoded: BertInputBatch
    """shape: (N, MAX_SEQUENCE_LENGTH_BERT_TOKENIZER)"""

    df_review_title_encoded: BertInputBatch
    """shape: (N, MAX_SEQUENCE_LENGTH_BERT_TOKENIZER)"""

    df_review_text_encoded: BertInputBatch
    """shape: (N, MAX_SEQUENCE_LENGTH_BERT_TOKENIZER)"""

    review_feature_vector: torch.Tensor
    """shape: (N, 4), float32    Rating + Verified Purchase + SA Valence + Subjectivity"""

    product_feature_vector: torch.Tensor
    """shape: (N, 37), float32    Rating Count [1] + Rating Avg [1] + Ratings (1-5) Ratio [5] + Category [30]"""

    scraped_review_feature_vector: torch.Tensor
    """shape: (M, 4), float32    Rating + Verified Purchase + SA Valence + Subjectivity"""

    scraped_review_helpful: torch.Tensor
    """shape: (M, 1), float32    Helpfulness"""

    scraped_review_title_encoded: BertInputBatch
    """shape: (M, MAX_SEQUENCE_LENGTH_BERT_TOKENIZER)"""

    scraped_review_text_encoded: BertInputBatch
    """shape: (M, MAX_SEQUENCE_LENGTH_BERT_TOKENIZER)"""

    product_id_indexes_map: typing.Dict[str, torch.Tensor]
    """For each product, maps the product id to the positional row indices (int64) of all scraped context reviews for this product"""

    @staticmethod
    def _tokenize_column(column: pd.Series) -> BertInputBatch:
        """Tokenize one text column into fixed-length BERT inputs."""
        # Missing texts show up as float NaN even though the CSV columns are
        # read with dtype=str; the tokenizer only accepts strings.
        texts = column.fillna("").astype(str).tolist()
        return BertInputBatch.from_batch_encoding(
            BERT_TOKENIZER.batch_encode_plus(
                texts,
                max_length=MAX_SEQUENCE_LENGTH_BERT_TOKENIZER,
                # replaces the deprecated pad_to_max_length=True
                padding="max_length",
                truncation=True,
                return_token_type_ids=False,
            )
        )

    @staticmethod
    def _feature_tensor(frame: pd.DataFrame, columns: typing.List[str]) -> torch.Tensor:
        """Convert the given columns into a float32 tensor of shape (rows, len(columns))."""
        # torch.tensor cannot consume a DataFrame directly; go through numpy.
        return torch.tensor(frame[columns].to_numpy(dtype="float32"))

    def __init__(self, df, reviews_scraped):
        """Tokenize all text columns and pre-compute every feature tensor.

        Args:
            df: labelled reviews with product metadata (see class docstring).
            reviews_scraped: context reviews (see class docstring).
        """
        # Keep the frames on the instance: __getitem__ and __len__ must not
        # depend on notebook globals.
        self.df = df
        self.reviews_scraped = reviews_scraped

        print(
            f"creating ReviewsDataSet (real/fake reviews: N={len(df)}, context reviews: M={len(reviews_scraped)}"
        )
        print(f"    bert tokeinzer working...")

        print(f"    creating df_product_title_encoded...")
        self.df_product_title_encoded = self._tokenize_column(df["product_title"])

        print(f"    creating df_review_title_encoded...")
        self.df_review_title_encoded = self._tokenize_column(df["review_title"])

        print(f"    creating df_review_text_encoded...")
        self.df_review_text_encoded = self._tokenize_column(df["review_text"])

        print(f"    creating scraped_review_title_encoded...")
        self.scraped_review_title_encoded = self._tokenize_column(
            reviews_scraped["review_title"]
        )

        print(f"    creating scraped_review_text_encoded...")
        self.scraped_review_text_encoded = self._tokenize_column(
            reviews_scraped["review_text"]
        )

        review_feature_cols = [
            "rating",
            "verified",
            "text_sentiment",
            "text_subjectivity",
        ]
        self.review_feature_vector = self._feature_tensor(df, review_feature_cols)
        self.scraped_review_feature_vector = self._feature_tensor(
            reviews_scraped, review_feature_cols
        )
        # Single-column selection keeps the documented (M, 1) shape.
        self.scraped_review_helpful = self._feature_tensor(reviews_scraped, ["helpful"])

        product_feature_cols = [
            "rating_count",
            "rating_avg",
            "rating1",
            "rating2",
            "rating3",
            "rating4",
            "rating5",
        ] + [f"cat_{i}" for i in range(0, 30)]
        self.product_feature_vector = self._feature_tensor(df, product_feature_cols)

        # GroupBy.indices yields *positional* row numbers, which is what the
        # tensors built above are addressed by (index labels may differ).
        self.product_id_indexes_map = {
            product_id: torch.as_tensor(positions, dtype=torch.long)
            for product_id, positions in reviews_scraped.groupby(
                "product_id"
            ).indices.items()
        }

    def __getitem__(self, index: int) -> ReviewDataSetItem:
        """Return review ``index`` together with all context reviews of its product.

        A product without any scraped review yields context tensors whose
        first dimension is 0.
        """
        product_id = self.df["product_id"].iloc[index]

        # positional rows in the scraped data where the product id is the same
        indexes = self.product_id_indexes_map.get(product_id)
        if indexes is None:
            # index_select needs an integer index tensor, even when empty
            indexes = torch.empty(0, dtype=torch.long)

        def select_context(tensor: torch.Tensor) -> torch.Tensor:
            """selects only the rows that belong to the same product"""
            return torch.index_select(tensor, 0, indexes)

        scraped_review_texts = BertInputBatch(
            select_context(self.scraped_review_text_encoded.attention_mask),
            select_context(self.scraped_review_text_encoded.input_ids),
        )
        scraped_review_titles = BertInputBatch(
            select_context(self.scraped_review_title_encoded.attention_mask),
            select_context(self.scraped_review_title_encoded.input_ids),
        )

        return ReviewDataSetItem(
            product_title_bert_input=self.df_product_title_encoded[index],
            review_text_bert_input=self.df_review_text_encoded[index],
            review_title_bert_input=self.df_review_title_encoded[index],
            product_features=self.product_feature_vector[index],
            review_features=self.review_feature_vector[index],
            scraped_helpful=select_context(self.scraped_review_helpful),
            scraped_review_features=select_context(self.scraped_review_feature_vector),
            scraped_review_texts=scraped_review_texts,
            scraped_review_titles=scraped_review_titles,
        )

    def __len__(self) -> int:
        """Number of labelled reviews (rows of ``df``)."""
        return len(self.df)

### Example Dataset class


In [35]:
# dataset = ReviewsDataSet(df, reviews_scraped)
# # df["product_title"].tolist()
# Diagnostic: echo every scraped review text that is not a plain `str`
# and print a placeholder marker for each one that is.
for review_text in reviews_scraped["review_text"].tolist():
    print("ashdsadds" if type(review_text) == str else review_text)

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


### Example DataLoader


In [None]:
# Build the dataset first: passing `train` to DataLoader before `train` exists
# raises NameError on a fresh kernel.
dataset = ReviewsDataSet(df, reviews_scraped)
# NOTE(review): each item holds BertInputBatch objects and a variable number of
# context reviews; the default collate function will probably not be able to
# batch them, so a custom `collate_fn` is likely needed — TODO confirm.
train = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

### Model

In [None]:
class FakeDetectionModelParameters:
    """Placeholder for the model's hyper-parameters; it defines no fields yet."""

    def __init__(self) -> None:
        """Create an empty parameter object."""
        return None

class FakeDetectionModel(torch.nn.Module):
    """Skeleton of the fake-review detection network; ``forward`` is not implemented yet."""

    def __init__(self, parameters: FakeDetectionModelParameters):
        """Initialise the module.

        Args:
            parameters: hyper-parameter container for the model.
        """
        # nn.Module has to be initialised before submodules can be registered
        # or the module can be called.
        super().__init__()
        # Not stored as `self.parameters`: that name would shadow
        # nn.Module.parameters().
        self.model_parameters = parameters

    def forward(self, df_row):
        """Placeholder forward pass; currently does nothing and returns None.

        Args:
            df_row: row_in_fake_real_products_df, a df with the following columns:
                - product_title
                - review_title
                - rating1, rating2, rating3, rating4, rating5 (the percentages of ratings from each brackets, as values between 0.0 and 1.0)
                - TODO: document the remaining columns
        """
        pass
        