In [1]:
from dataset import read_dataset, split
from pathlib import Path
import logging
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np
from tqdm import tqdm

In [2]:
# Set up root logging first so everything that follows logs at INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# NOTE(review): despite its name, data_dir points at the CSV file itself, and
# it is passed to split() twice -- confirm both uses against split's signature.
data_dir = Path("../data/data.csv")

# Load the full dataset, then split it into train / validation / test frames.
df = read_dataset(path=data_dir)
train_df, valid_df, test_df = split(df, data_dir, "InvoiceNo", data_dir)

In [3]:
# Normalise item descriptions: lower-case them, then strip every character
# that is neither a word character nor whitespace.
#
# regex=True must be explicit: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so without it the character class
# below would be searched for verbatim and no punctuation would be removed.
df["Description"] = (
    df["Description"]
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)
)

In [4]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
DISTILBERT_CHECKPOINT = "distilbert-base-uncased"

tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
model = DistilBertModel.from_pretrained(DISTILBERT_CHECKPOINT)

In [5]:
# Build one row per (Description, ItemID) pair.
# Rows are ordered newest invoice first, and groupby(...).first() keeps the
# first non-null value of each column within a group, so UnitPrice is the most
# recent non-null price recorded for that pair.
item_columns = ["Description", "ItemID", "UnitPrice"]
df_items = (
    df.sort_values(by="InvoiceDate", ascending=False)
    .groupby(by=["Description", "ItemID"], as_index=False)
    .first()
)[item_columns]

In [6]:
from functools import partial

from bundling import encode_text

# Register the progress_apply method on pandas objects.
tqdm.pandas()

# Bind the shared tokenizer and model once, so the callable applied to each
# description only needs the text itself.
encode_description = partial(encode_text, tokenizer=tokenizer, model=model)
df_items["EncodedText"] = df_items["Description"].progress_apply(encode_description)

100%|██████████| 3897/3897 [02:54<00:00, 22.30it/s]


In [11]:
# Expand each vector in "EncodedText" into one column per embedding dimension.
df_unraveled = df_items["EncodedText"].apply(pd.Series)
df_unraveled.columns = [f"Dimension_{i+1}" for i in range(df_unraveled.shape[1])]

# Price-bin indicators.
# They are computed from df_items, not df: df is the full dataset frame, so its
# rows do not correspond to the item rows concatenated below. Reading from
# df_items also keeps this cell re-runnable after `df` is rebound at the end.
unit_price = df_items["UnitPrice"]

# Labels and conditions are parallel lists: labels[i] names conditions[i].
labels = ["bin1", "bin2", "bin3", "bin4"]

# Bins are half-open [low, high), so a price of exactly 2.0 or 5.0 falls into
# one bin only. A missing price compares False everywhere and gets no bin.
conditions = [
    unit_price < 1.0,
    (unit_price >= 1.0) & (unit_price < 2.0),
    (unit_price >= 2.0) & (unit_price < 5.0),
    unit_price >= 5.0,
]

# The conditions are already boolean indicator columns sharing df_items' index,
# so they can be assembled directly into the one-hot frame.
one_hot_encoded = pd.DataFrame(dict(zip(labels, conditions)))

# Join item columns, embedding dimensions and price bins side by side, then
# drop the columns that have been replaced by their encoded forms.
# NOTE: this rebinds `df` from the full dataset frame to the item feature frame.
df = pd.concat([df_items, df_unraveled, one_hot_encoded], axis=1).drop(
    columns=["EncodedText", "Description", "UnitPrice"]
)

Unnamed: 0,ItemID,UnitPrice,Dimension_1,Dimension_2,Dimension_3,Dimension_4,Dimension_5,Dimension_6,Dimension_7,Dimension_8,...,Dimension_763,Dimension_764,Dimension_765,Dimension_766,Dimension_767,Dimension_768,bin1,bin2,bin3,bin4
0,72800B,0.79,0.279743,0.056446,0.249030,-0.032549,-0.158809,-0.127833,0.163790,0.011161,...,0.136809,-0.231609,0.092972,0.141649,0.111225,-0.048842,True,False,False,False
1,23437,1.25,-0.095109,-0.093896,0.292563,0.159640,0.184952,-0.280693,0.029589,0.205439,...,0.163814,-0.073782,-0.067937,0.117935,0.061918,-0.344338,False,True,False,False
2,23345,1.25,-0.077112,-0.108013,-0.089808,0.056201,0.164547,-0.068288,0.344739,0.275575,...,0.161297,-0.340114,-0.127146,0.194883,0.213920,-0.172648,False,True,False,False
3,23391,4.15,0.459742,-0.196889,0.175754,0.016932,0.261984,-0.314756,0.153665,0.263018,...,0.260515,-0.058466,-0.067980,-0.161584,-0.017928,-0.082257,False,False,True,False
4,23391,4.15,0.340728,-0.153522,0.152621,0.002055,0.187530,-0.195071,-0.053574,0.165981,...,0.287337,-0.157290,-0.179086,0.060895,-0.025161,-0.087103,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3892,23144,0.83,0.185700,0.157623,0.158833,0.001247,0.037398,-0.116408,-0.183034,0.009217,...,0.139112,-0.219093,0.253752,0.037796,0.051023,-0.112780,True,False,False,False
3893,21275,16.95,0.075655,0.039583,0.010623,0.173341,0.518097,-0.153319,-0.183385,0.014145,...,0.044075,-0.134866,-0.100747,0.042468,-0.064806,-0.320927,False,False,False,True
3894,84832,0.85,0.138023,0.258403,0.063173,0.196025,0.248309,-0.272928,0.253204,0.128193,...,0.301988,-0.246873,-0.081405,0.156366,0.185138,-0.087839,True,False,False,False
3895,23143,3.95,0.330232,0.027210,-0.110568,-0.065304,0.199900,0.049254,-0.087521,-0.087634,...,0.428982,-0.180068,-0.183402,0.004716,-0.111648,-0.030423,False,False,True,False


In [None]:
# Fraction of items whose unit price is strictly below 1.
df_items["UnitPrice"].lt(1).mean()

In [None]:
# (df_items["UnitPrice"].between(1,2)).mean()

In [None]:
# Fraction of items priced from 2 to 5, with both endpoints included.
df_items["UnitPrice"].between(left=2, right=5).mean()

In [None]:
# Fraction of items priced at 5 or more. The comparison is `>=` rather than `>`
# so it matches the top price bin, which is defined as UnitPrice >= 5.0;
# with a strict `>` items priced exactly 5.0 were left out of this share.
(df_items["UnitPrice"] >= 5).mean()