In [27]:
import torch
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR  #
# from torch.utils.data import TensorDataset, \
#     DataLoader  # Own stuff set of data quality checks, tensor shapes might be different dataloader loads the tensor,
import pytorch_lightning as pl  #
from pytorch_lightning.callbacks.early_stopping import \
    EarlyStopping  # early stop when you reach optimum loss, 3 times in a row gradient descent
from pytorch_lightning.callbacks import LearningRateMonitor  # delta (loss / accuracy)
from pytorch_lightning.loggers import MLFlowLogger  # Model tracking
import torch
import torch.nn as nn
from transformers import DistilBertModel
from data_preprocessing import injestDataset, downSample
from data_cleaning import feature_cleaner, cleaningPreprocess
from torch.utils.data import Dataset, DataLoader

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, LabelBinarizer

import torchmetrics
import pytorch_lightning as pl
from transformers import DistilBertModel, DistilBertTokenizer
import numpy as np
# from transformers import DistilBertModel

### Ingest data (see `injestDataset`) and down-sample so that the levels/classes are the same size
* the ingested file is Kaggle data: https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews
* the goal is to predict the rating, or the recommended flag (1 positive, 0 negative), from the review text
* would also like to explore a multimodal approach: given the recommendation, can I predict rating or recommendation?
* some sort of UI, e.g. Gradio
* MLOps: serialize or save the weights and broadcast them; need to explore that


In [28]:
# Make float32 the default dtype for floating-point tensors created from here on.
torch.set_default_dtype(torch.float32)

In [29]:
# Load the raw review data through the project helper imported from data_preprocessing.
data = injestDataset()

In [30]:
# Down-sample on the "Recommended IND" column.
# NOTE(review): 1000 is presumably the target row count — confirm against downSample.
df = downSample(data,"Recommended IND",1000)

In [31]:
# Column dtypes of the down-sampled frame.
df.dtypes

trueIndex           int64
Review Text        object
Rating              int64
Recommended IND     int64
dtype: object

In [32]:
# Show the down-sampled frame before any text cleaning.
df

Unnamed: 0,trueIndex,Review Text,Rating,Recommended IND
0,12849,I've been looking for sometime for a spring/su...,5,1
1,4578,Great spring time dress that looks good with a...,5,1
2,6169,This material of this shirt is soft and appeal...,4,1
3,2942,This dress is so light and twirly and the fabr...,4,1
4,2769,I bought this shirt for my 16 year old daughte...,4,1
...,...,...,...,...
995,12936,"This is a beautiful dress. however, it looked ...",1,0
996,258,"Fits well through the shoulders and arms, but ...",3,0
997,19148,I love maeve dresses but there are several iss...,3,0
998,20477,"Okay, so this top has its redeeming qualities,...",3,0


### Applying NLTK to clean the data (see data_cleaning.py)

In [33]:
# Clean the raw review text with the project helper, then give the columns
# generic names: the review becomes "feature", the recommended flag "targetOne"
# and the rating "targetTwo".
# NOTE(review): `df` is overwritten, and after the rename "Review Text" no longer
# exists, so re-running this cell on its own presumably fails — confirm.
df = cleaningPreprocess(df, "Review Text") #  Apply cleaner 
df = df.rename(columns={"Recommended IND": "targetOne", "Review Text": "feature", "Rating": "targetTwo"})

In [34]:
# Show the frame after cleaning and renaming.
df

Unnamed: 0,trueIndex,feature,targetTwo,targetOne
0,12849,ive looking sometime springsummer jacket wear ...,5,1
1,4578,great spring time dress looks good thin top ca...,5,1
2,6169,material shirt soft appealing great fall side ...,4,1
3,2942,dress light twirly fabric airy pretty sadly fi...,4,1
4,2769,bought shirt 16 year old daughter looked adora...,4,1
...,...,...,...,...
995,12936,beautiful dress however looked awful ordered m...,1,0
996,258,fits well shoulders arms zero waist looks like...,3,0
997,19148,love maeve dresses several issues one taste 1 ...,3,0
998,20477,okay top redeeming qualities promise gave 3 st...,3,0


### Using the Dataset class
* I really like how Spark does data transformations, adding a column after every transformation in a pipeline; attempting to do the same here
* We are going to have two paths: the normal `__getitem__`, meaning the DataLoader should be able to get the input and output tensors, plus an added bonus of viewing the transformations in DataFrame format
* try something different: use `self` to execute the transformations
* use decorators and also `self` to activate or deactivate the instance for viewing the DataFrame or the tensors
* also fix `__len__` so that when train is on (inside the decorator) it shows the train length, or the test length, etc.

In [35]:
# Here is your dataset
class MyDataset(Dataset):
    """Review dataset that keeps every preprocessing stage as a DataFrame column.

    The constructor tokenizes the ``feature`` column with the DistilBERT
    tokenizer, builds train/val/test index lists and encodes both targets.
    Every result is stored as an extra column of the frame passed in (the
    caller's DataFrame is extended in place).

    The object can be read in two ways:

    * ``dataset.all`` (or the dataset itself) indexes the underlying DataFrame
      positionally, which is handy for inspecting the transformations.
    * ``dataset.train`` / ``dataset.val`` / ``dataset.test`` return
      *independent views* restricted to one split. Indexing a view yields a
      dict of tensors suitable for a ``DataLoader``.

    Because each split accessor returns its own lightweight view (sharing the
    same DataFrame and tokenizer), a ``DataLoader`` built on ``dataset.train``
    keeps serving training rows even after ``dataset.val`` or ``dataset.test``
    is accessed.

    Args:
        df: DataFrame with ``feature``, ``targetOne`` and ``targetTwo`` columns.
        max_length: Optional token length to pad/truncate to. ``None`` keeps
            the tokenizer's model maximum (the previous behaviour).
        device: Device on which target tensors are created. ``None`` selects
            ``'mps:0'`` when the MPS backend is available, otherwise ``'cpu'``.
    """

    # Maps a split name to the attribute that stores its positional row indices.
    _SPLIT_INDEX_ATTRS = {'train': 'train_idx', 'val': 'val_idx', 'test': 'test_idx'}

    def __init__(self, df, max_length=None, device=None):
        self.input_data = df
        self.max_length = max_length
        self.device = device if device is not None else self._default_device()
        self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.tokenize_dataframe()
        self.train_val_test_idx()
        self.data_split = None
        self.split_columns()
        self.targertOne_labelEncoder()
        self.targertTwo_labelEncoder()

    @staticmethod
    def _default_device():
        """Return ``'mps:0'`` when the MPS backend is usable, else ``'cpu'``."""
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return 'mps:0'
        return 'cpu'

    def tokenize_dataframe(self):
        """Tokenize every review and store the encodings as new columns.

        Adds ``tokenizedFeature`` (the full tokenizer output), ``input_ids``
        and ``attention_mask``. Each text is encoded on its own with
        ``return_tensors='pt'``, so every stored tensor carries the leading
        singleton dimension produced by the tokenizer.
        """
        tokenized_texts = []
        input_ids = []
        attention_masks = []

        for text in self.input_data['feature']:
            encoded_inputs = self.tokenizer(
                text,
                add_special_tokens=True,
                # None falls back to the tokenizer's model maximum length.
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            tokenized_texts.append(encoded_inputs)
            input_ids.append(encoded_inputs['input_ids'])
            attention_masks.append(encoded_inputs['attention_mask'])

        self.input_data['tokenizedFeature'] = tokenized_texts
        self.input_data['input_ids'] = input_ids
        self.input_data['attention_mask'] = attention_masks

    def split_columns(self):
        """Expose the feature and both target columns as attributes."""
        self.features = self.input_data["feature"]
        self.targetOne = self.input_data["targetOne"]
        self.targetTwo = self.input_data["targetTwo"]

    def train_val_test_idx(self):
        """Create positional index lists for an 80/10/10 train/test/val split.

        The first split is stratified on ``targetOne``; the remaining 20% is
        then halved without stratification. Both use ``random_state=500``.
        """
        idx = list(range(len(self.input_data)))
        train_idx, val_test_idx = train_test_split(
                                idx,
                                train_size=0.8,
                                stratify=self.input_data["targetOne"],
                                random_state=500)

        test_idx, val_idx = train_test_split(
                                val_test_idx,
                                train_size=0.5,
                                random_state=500)

        self.train_idx = train_idx
        self.test_idx = test_idx
        self.val_idx = val_idx

    def targertOne_labelEncoder(self):
        """Integer-encode ``targetOne`` into the ``targetOne_binirized`` column."""
        self.targetOne_label_binarizer = LabelEncoder()
        transformed = self.targetOne_label_binarizer.fit_transform(self.input_data["targetOne"].astype("str"))
        # One encoded scalar per row.
        self.input_data['targetOne_binirized'] = list(transformed)

    def targertTwo_labelEncoder(self):
        """Encode ``targetTwo`` as indicator rows in ``targetTwo_binirized``."""
        self.targetTwo_label_binarizer = MultiLabelBinarizer()
        labels = self.input_data["targetTwo"].astype("str")
        # MultiLabelBinarizer expects an iterable of labels per sample. Wrapping
        # each label in a list keeps a multi-character label (e.g. "10") from
        # being split into its characters.
        transformed = self.targetTwo_label_binarizer.fit_transform([[label] for label in labels])
        # One indicator row per sample.
        self.input_data['targetTwo_binirized'] = list(transformed)

    def _shows_dataframe(self):
        """True when the instance serves raw DataFrame rows (no split selected)."""
        return self.data_split is None or self.data_split == 'all'

    def _split_indices(self):
        """Return the positional indices of the active split.

        Raises:
            ValueError: If ``data_split`` is not a known split name.
        """
        attr_name = self._SPLIT_INDEX_ATTRS.get(self.data_split)
        if attr_name is None:
            raise ValueError("Invalid data_split argument. Use 'train', 'val', 'test', or 'all'.")
        return getattr(self, attr_name)

    def __len__(self):
        """Number of rows in the active split (the whole frame when none is set)."""
        if self._shows_dataframe():
            return len(self.input_data)
        return len(self._split_indices())

    def __getitem__(self, index):
        """Return DataFrame rows (no split) or a tensor dict (split selected).

        With a split selected, ``index`` is a position inside that split. It is
        translated to a row position of the frame and read with ``.iloc``, so
        the result does not depend on the DataFrame's index labels.
        """
        if self._shows_dataframe():
            return self.input_data.iloc[index]

        row = self._split_indices()[index]
        frame = self.input_data
        targetOne = torch.tensor(frame['targetOne_binirized'].iloc[row], dtype=torch.float32, device=self.device)
        targetTwo = torch.tensor(frame['targetTwo_binirized'].iloc[row], dtype=torch.float32, device=self.device)

        return {
            'input_ids': frame['input_ids'].iloc[row],
            'attention_mask': frame['attention_mask'].iloc[row],
            'targetOne': targetOne,
            'targetTwo': targetTwo,
        }

    def _view(self, data_split):
        """Return a shallow view of this dataset bound to ``data_split``.

        The view shares the DataFrame, tokenizer and index lists with the
        original object; only ``data_split`` differs. Selecting another split
        later therefore never changes what an existing view (or a DataLoader
        wrapping it) returns.
        """
        view = self.__class__.__new__(self.__class__)
        view.__dict__.update(self.__dict__)
        view.data_split = data_split
        return view

    # Properties giving access to the different data splits.

    @property
    def train(self):
        """View restricted to the training split."""
        return self._view('train')

    @property
    def val(self):
        """View restricted to the validation split."""
        return self._view('val')

    @property
    def test(self):
        """View restricted to the test split."""
        return self._view('test')

    @property
    def all(self):
        """View over the whole DataFrame with every transformation column."""
        return self._view(None)



### Creating the dataloader and batch size, and testing whether the decorators work
* check that the data is loaded properly with the decorators and the loader


In [36]:
# Build the dataset: the constructor tokenizes the "feature" column, creates the
# train/val/test index lists and encodes both targets, adding the results to `df` as new columns.
my_dataset = MyDataset(df)

In [37]:
my_dataset.all[1:10] # .all is meant to show the entire dataset with all the transformations attached

Unnamed: 0,trueIndex,feature,targetTwo,targetOne,tokenizedFeature,input_ids,attention_mask,targetOne_binirized,targetTwo_binirized
1,4578,great spring time dress looks good thin top ca...,5,1,"[input_ids, attention_mask]","[[tensor(101), tensor(2307), tensor(3500), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 0, 1]"
2,6169,material shirt soft appealing great fall side ...,4,1,"[input_ids, attention_mask]","[[tensor(101), tensor(3430), tensor(3797), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 1, 0]"
3,2942,dress light twirly fabric airy pretty sadly fi...,4,1,"[input_ids, attention_mask]","[[tensor(101), tensor(4377), tensor(2422), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 1, 0]"
4,2769,bought shirt 16 year old daughter looked adora...,4,1,"[input_ids, attention_mask]","[[tensor(101), tensor(4149), tensor(3797), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 1, 0]"
5,11611,shirt runs true size im xl fit perfectly first...,5,1,"[input_ids, attention_mask]","[[tensor(101), tensor(3797), tensor(3216), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 0, 1]"
6,3199,got sweater neutral color versatile truly beau...,4,1,"[input_ids, attention_mask]","[[tensor(101), tensor(2288), tensor(14329), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 1, 0]"
7,19429,great fabric details constructionbut grey defi...,4,1,"[input_ids, attention_mask]","[[tensor(101), tensor(2307), tensor(8313), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 1, 0]"
8,19443,size 4 ordered 6 little big fix price nice tho...,4,1,"[input_ids, attention_mask]","[[tensor(101), tensor(2946), tensor(1018), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 1, 0]"
9,11763,love jeans summer followed advice previous rev...,5,1,"[input_ids, attention_mask]","[[tensor(101), tensor(2293), tensor(6312), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 0, 1]"


In [38]:
my_dataset.all[my_dataset.train_idx].iloc[1:5] # train split check (rows are picked via train_idx)

Unnamed: 0,trueIndex,feature,targetTwo,targetOne,tokenizedFeature,input_ids,attention_mask,targetOne_binirized,targetTwo_binirized
244,2996,love love love dress fabric high quality color...,5,1,"[input_ids, attention_mask]","[[tensor(101), tensor(2293), tensor(2293), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 0, 1]"
863,6017,look online pictures navy colored dress stripe...,2,0,"[input_ids, attention_mask]","[[tensor(101), tensor(2298), tensor(3784), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",0,"[0, 1, 0, 0, 0]"
214,10392,called jeans arent exactly jean material black...,5,1,"[input_ids, attention_mask]","[[tensor(101), tensor(2170), tensor(6312), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 0, 1]"
169,4346,bought dress white event many compliments fun ...,5,1,"[input_ids, attention_mask]","[[tensor(101), tensor(4149), tensor(4377), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",1,"[0, 0, 0, 0, 1]"


In [39]:
list(my_dataset.all[my_dataset.train_idx].iloc[[1]]["input_ids"]) # should match the input_ids of my_dataset.train[1]

[tensor([[  101,  2293,  2293,  2293,  4377,  8313,  2152,  3737,  6087,  4138,
          17026,  4906,  4257, 17989,  2330,  3300,  4179,  7130,  5808, 22979,
           3538,  5102,  8265, 11912,  5102,  4389,  2015,  6879,  5156,  2946,
          16142,  6669,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [40]:
# Second item of the train split, in the tensor-dict form returned for a selected split.
my_dataset.train[1]

{'input_ids': tensor([[  101,  2293,  2293,  2293,  4377,  8313,  2152,  3737,  6087,  4138,
          17026,  4906,  4257, 17989,  2330,  3300,  4179,  7130,  5808, 22979,
           3538,  5102,  8265, 11912,  5102,  4389,  2015,  6879,  5156,  2946,
          16142,  6669,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [41]:
# Second item of the test split, in the tensor-dict form returned for a selected split.
my_dataset.test[1]

{'input_ids': tensor([[  101,  3062,  2293, 15292,  2574,  2387,  3784,  2502,  5470, 29032,
           9515,  2140, 14003,  5617,  4310,  7782, 15292,  5791,  4906, 12465,
           6140,  9904,  4102,  3861,  2437,  2665,  2601,  9724,  2630,  2471,
           3504,  2304, 21274,  5808,  7126,  2149,  7289,  3953, 25022, 12680,
           2067,  6462,  2098,  5808,  2987,  2102,  2191,  6700,  2298,  2898,
           8313,  3730,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [42]:
# Second item of the validation split, in the tensor-dict form returned for a selected split.
my_dataset.val[1]

{'input_ids': tensor([[  101,  2327, 16142,  3835,  2028,  4929,  2134,  2102,  2130,  8871,
           8313, 18001,  2052, 16755,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

### Check the dataloader from the previous task, and the batches
* if batch size = 5 and length = 800, there should be 160 batches

In [43]:
# Wrap the training split in a shuffling DataLoader that yields batches of `batch_size` items.
batch_size=5
train_loader = DataLoader(my_dataset.train, batch_size=batch_size, shuffle=True)

In [44]:
# Walk the whole training loader once. When the loop finishes, `i` is the index
# of the last batch and the four names below hold that final batch's tensors.
for i, batch in enumerate(train_loader):
    input_ids, attention_mask, targetOne, targetTwo = (
        batch[key] for key in ('input_ids', 'attention_mask', 'targetOne', 'targetTwo')
    )

In [45]:
i # index of the last batch, so number of batches = i + 1 = len(train set) / batch_size

159

In [46]:
# Number of rows assigned to the training split.
len(my_dataset.train_idx)

800

In [47]:
# Number of samples in the last batch pulled from the loader.
len(input_ids)

5

In [48]:
# Token ids of the last batch; each sample keeps the tokenizer's singleton leading dimension.
input_ids

tensor([[[ 101, 3482, 2100,  ...,    0,    0,    0]],

        [[ 101, 2428, 7568,  ...,    0,    0,    0]],

        [[ 101, 6087, 4377,  ...,    0,    0,    0]],

        [[ 101, 2387, 3573,  ...,    0,    0,    0]],

        [[ 101, 2179, 2651,  ...,    0,    0,    0]]])

In [49]:
# targetTwo labels of the last batch, one indicator row per sample.
targetTwo

tensor([[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]], device='mps:0')

### Use torch first
* Hardware issue: might have to migrate to Databricks (but that needs some GPUs) or to Colab
* Set up the neural net with binary cross-entropy loss, or categorical cross-entropy, depending on the labels
* Make sure to send the model to the device. My assumption is that this can't be done through `self` from inside the model: the model has to be created first and then transferred, because inside `__init__` we are still building the model. Needs investigating.



In [50]:
import torch.nn as nn
# number of features fed to the head: width of DistilBERT's hidden states
# (`dim` = 768 for distilbert-base-uncased)
input_dim = 768
# number of units in the hidden layer of the classification head
hidden_layers = 758
# number of classes (unique of y)
output_dim = 2
class Network(nn.Module):
  """DistilBERT encoder followed by a small feed-forward classification head.

  The head consumes the hidden state of the first token of every sequence,
  so the network produces one row of ``output_dim`` logits per review.

  Args:
    device: Preferred device, stored on ``self.device`` for callers to use
      with ``.to(...)``. ``None`` selects ``"mps:0"`` when the MPS backend is
      available and ``"cpu"`` otherwise. Creating the module does not move
      it; ``forward`` sends its inputs to wherever the weights currently are.
  """
  def __init__(self, device=None):
    super(Network, self).__init__()
    if device is None:
      mps_backend = getattr(torch.backends, "mps", None)
      mps_ready = mps_backend is not None and mps_backend.is_available()
      device = "mps:0" if mps_ready else "cpu"
    self.device = device
    self.model = DistilBertModel.from_pretrained("distilbert-base-uncased")
    # The first linear layer must accept the encoder's hidden size (input_dim).
    self.linear1 = nn.Linear(input_dim, hidden_layers)
    self.linear2 = nn.Linear(hidden_layers, output_dim)
    self.dropout = nn.Dropout(0.1)
    self.activation = nn.ReLU()
  def forward(self, batch):
    """Return logits of shape (batch, output_dim).

    Args:
      batch: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors,
        shaped (batch, seq) or (batch, 1, seq) as produced by per-sample
        tokenization.
    """
    # Follow the weights, so the module works both before and after `.to(...)`.
    device = next(self.parameters()).device
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    # Collapse any singleton middle dimension so both tensors are (batch, seq).
    input_ids = input_ids.reshape(input_ids.shape[0], -1).to(device)
    attention_mask = attention_mask.reshape(attention_mask.shape[0], -1).to(device)
    # Only the last hidden state is needed; not requesting every layer's
    # hidden states avoids keeping them all in memory.
    embeddings = self.model(input_ids, attention_mask=attention_mask, return_dict=False)[0]
    # Use the first token's vector as the representation of the whole sequence.
    pooler = embeddings[:, 0]
    pooler = self.activation(self.linear1(pooler))
    pooler = self.dropout(pooler)
    output = self.linear2(pooler)
    return output

In [51]:
# clf = Network()
# device = torch.device("mps:0")
# clf.to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)

# epochs = 1
# for epoch in range(epochs):
#     running_loss = 0.0
#     for i, batch in enumerate(train_loader, 0):
#         inputs = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         labels = batch['targetOne']  # Assuming 'targetOne' is your target variable
        
#         # Zero the parameter gradients
#         optimizer.zero_grad()
        
#         # Forward pass
#         outputs = clf({'input_ids': inputs, 'attention_mask': attention_mask})
        
#         # Calculate the loss
#         loss = criterion(outputs, labels)
        
#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()
        
#         running_loss += loss.item()
        
#     # Display statistics
#     print(f'[{epoch + 1}, {i + 1}] loss: {running_loss / len(train_loader):.5f}')


In [52]:
#GPU too small 