In [1]:
import torch
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR  #
# from torch.utils.data import TensorDataset, \
#     DataLoader  # Own stuff set of data quality checks, tensor shapes might be different dataloader loads the tensor,
import pytorch_lightning as pl  #
from pytorch_lightning.callbacks.early_stopping import \
    EarlyStopping  # early stop when you reach optimum loss, 3 times in a row gradient descent
from pytorch_lightning.callbacks import LearningRateMonitor  # delta (loss / accuracy)
from pytorch_lightning.loggers import MLFlowLogger  # Model tracking
import torch
import torch.nn as nn
from transformers import DistilBertModel
from data_preprocessing import injestDataset, downSample
from data_cleaning import feature_cleaner, cleaningPreprocess
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer, LabelBinarizer
from sklearn.compose import ColumnTransformer

import torchmetrics
import pytorch_lightning as pl
from transformers import BertTokenizer, BertModel, DistilBertModel, DistilBertTokenizer, DistilBertForTokenClassification
import numpy as np
# from transformers import DistilBertModel

In [2]:
# Make float32 the default dtype for newly created floating-point tensors.
torch.set_default_dtype(torch.float32)
# Optional global device overrides, currently disabled:
# torch.set_default_device('mps:0')
# torch.set_default_device('cpu:0')

In [3]:
# Load the dataset through the project helper imported from data_preprocessing.
data = injestDataset()

In [4]:
# Down-sample with respect to the "Recommended IND" column.
# NOTE(review): 1000 is presumably the target number of rows — confirm against downSample.
df = downSample(data,"Recommended IND",1000)

In [5]:
# Inspect the column dtypes of the down-sampled frame.
df.dtypes

trueIndex           int64
Review Text        object
Rating              int64
Recommended IND     int64
dtype: object

In [6]:
# Clean the "Review Text" column with the project helper from data_cleaning
# (the exact cleaning steps are not visible in this notebook).
df = cleaningPreprocess(df, "Review Text")

In [7]:
# Switch to the generic column names that MyDataset reads:
# "feature" (text), "targetOne" (Recommended IND) and "targetTwo" (Rating).
df = df.rename(columns={"Recommended IND": "targetOne", "Review Text": "feature", "Rating": "targetTwo"})

In [8]:
# Display the cleaned and renamed frame.
df

Unnamed: 0,trueIndex,feature,targetTwo,targetOne
0,12849,ive looking sometime springsummer jacket wear ...,5,1
1,4578,great spring time dress looks good thin top ca...,5,1
2,6169,material shirt soft appealing great fall side ...,4,1
3,2942,dress light twirly fabric airy pretty sadly fi...,4,1
4,2769,bought shirt 16 year old daughter looked adora...,4,1
...,...,...,...,...
995,12936,beautiful dress however looked awful ordered m...,1,0
996,258,fits well shoulders arms zero waist looks like...,3,0
997,19148,love maeve dresses several issues one taste 1 ...,3,0
998,20477,okay top redeeming qualities promise gave 3 st...,3,0


In [9]:
# from transformers import AutoTokenizer, DistilBertForTokenClassification
# import torch

# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased")

# inputs = tokenizer(
#     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
# )

# with torch.no_grad():
#     logits = model(**inputs).logits

# predicted_token_class_ids = logits.argmax(-1)

# # Note that tokens are classified rather than input words which means that
# # there might be more predicted token classes than words.
# # Multiple token classes might account for the same word
# predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]

# labels = predicted_token_class_ids
# loss = model(**inputs, labels=labels).loss

In [10]:
# Dataset over the cleaned review DataFrame, tokenised for DistilBERT.
class MyDataset(Dataset):
    """Tokenised review dataset with train/val/test views.

    ``df`` must provide the columns ``feature`` (text), ``targetOne`` and
    ``targetTwo``.  Construction adds the columns ``tokenizedFeature``,
    ``input_ids``, ``attention_mask``, ``targetOne_binirized`` and
    ``targetTwo_binirized`` to ``df`` itself (the frame is not copied).

    Args:
        df: pandas DataFrame holding the columns listed above.
        device: device on which the target tensors are created in
            ``__getitem__``.  ``None`` (default) selects ``"mps:0"`` when it
            is available and the CPU otherwise.
        max_length: optional token length passed to the tokenizer.  ``None``
            (default) keeps the tokenizer's own default, i.e. every review is
            padded/truncated to the model maximum.

    The ``train``, ``val``, ``test`` and ``all`` properties return lightweight
    views that share the underlying DataFrame but carry their own
    ``data_split``, so several DataLoaders can be used side by side.
    """

    def __init__(self, df, device=None, max_length=None):
        self.input_data = df
        self.max_length = max_length
        self.device = torch.device(device) if device is not None else self._default_device()
        self.data_split = None
        self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.tokenize_dataframe()
        self.train_val_test_idx()
        self.split_columns()
        self.targertOne_labelEncoder()
        self.targertTwo_labelEncoder()

    @staticmethod
    def _default_device():
        """Return ``mps:0`` when PyTorch reports it as available, else the CPU."""
        mps = getattr(torch.backends, "mps", None)
        if mps is not None and mps.is_available():
            return torch.device("mps:0")
        return torch.device("cpu")

    def tokenize_dataframe(self):
        """Tokenise every entry of ``feature`` and store the results as new columns."""
        tokenized_texts = []
        input_ids = []
        attention_masks = []

        for text in self.input_data['feature']:
            encoded_inputs = self.tokenizer(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            tokenized_texts.append(encoded_inputs)
            input_ids.append(encoded_inputs['input_ids'])
            attention_masks.append(encoded_inputs['attention_mask'])

        self.input_data['tokenizedFeature'] = tokenized_texts
        self.input_data['input_ids'] = input_ids
        self.input_data['attention_mask'] = attention_masks

    def split_columns(self):
        """Expose the raw feature and target columns as attributes."""
        self.features = self.input_data["feature"]
        self.targetOne = self.input_data["targetOne"]
        self.targetTwo = self.input_data["targetTwo"]

    def train_val_test_idx(self):
        """Compute positional row indices for an 80/10/10 train/val/test split.

        The train split is stratified on ``targetOne``; the remaining 20% is
        halved into test and validation without stratification.  Both splits
        use ``random_state=500``.
        """
        idx = list(range(len(self.input_data)))
        train_idx, val_test_idx = train_test_split(
            idx,
            train_size=0.8,
            stratify=self.input_data["targetOne"],
            random_state=500)

        test_idx, val_idx = train_test_split(
            val_test_idx,
            train_size=0.5,
            random_state=500)

        self.train_idx = train_idx
        self.test_idx = test_idx
        self.val_idx = val_idx

    def targertOne_labelEncoder(self):
        """Encode ``targetOne`` as integer class ids in ``targetOne_binirized``."""
        self.targetOne_label_binarizer = LabelEncoder()
        transformed = self.targetOne_label_binarizer.fit_transform(self.input_data["targetOne"].astype("str"))
        # One encoded scalar per row.
        self.input_data['targetOne_binirized'] = list(transformed)

    def targertTwo_labelEncoder(self):
        """Encode ``targetTwo`` as indicator vectors in ``targetTwo_binirized``."""
        self.targetTwo_label_binarizer = MultiLabelBinarizer()
        # Each value is cast to a string, which the binarizer treats as an
        # iterable of characters; for single-character values this yields a
        # one-hot vector per row.
        transformed = self.targetTwo_label_binarizer.fit_transform(self.input_data["targetTwo"].astype("str"))
        # One indicator row per sample.
        self.input_data['targetTwo_binirized'] = list(transformed)

    def _split_positions(self):
        """Return the positional indices of the active split, or ``None`` for the full frame.

        Raises:
            ValueError: if ``data_split`` is not None, 'all', 'train', 'val' or 'test'.
        """
        if self.data_split is None or self.data_split == 'all':
            return None
        if self.data_split == 'train':
            return self.train_idx
        if self.data_split == 'val':
            return self.val_idx
        if self.data_split == 'test':
            return self.test_idx
        raise ValueError("Invalid data_split argument. Use 'train', 'val', 'test', or 'all'.")

    def __len__(self):
        """Number of rows in the active split (whole frame for None/'all')."""
        positions = self._split_positions()
        if positions is None:
            return len(self.input_data)
        return len(positions)

    def __getitem__(self, index):
        """Return one sample.

        For the 'train', 'val' and 'test' splits a dict with the keys
        ``input_ids``, ``attention_mask``, ``targetOne`` and ``targetTwo`` is
        returned; the two targets are float32 tensors on ``self.device`` and
        the token tensors are returned exactly as the tokenizer produced them.
        With no split (None/'all') the raw DataFrame row is returned instead.
        """
        positions = self._split_positions()
        if positions is None:
            return self.input_data.iloc[index]

        # The split lists hold positions, so rows are fetched with .iloc; this
        # stays correct even when the DataFrame index is not 0..n-1.
        position = positions[index]
        frame = self.input_data
        return {
            'input_ids': frame['input_ids'].iloc[position],
            'attention_mask': frame['attention_mask'].iloc[position],
            'targetOne': torch.tensor(frame['targetOne_binirized'].iloc[position],
                                      dtype=torch.float32, device=self.device),
            'targetTwo': torch.tensor(frame['targetTwo_binirized'].iloc[position],
                                      dtype=torch.float32, device=self.device),
        }

    def _view(self, data_split):
        """Return a shallow view of this dataset bound to ``data_split``.

        The view shares every attribute (including the DataFrame) with the
        original; only ``data_split`` differs, so the original is left untouched.
        """
        view = self.__class__.__new__(self.__class__)
        view.__dict__.update(self.__dict__)
        view.data_split = data_split
        return view

    # Properties to access the different data splits.

    @property
    def train(self):
        """View restricted to the training split."""
        return self._view('train')

    @property
    def val(self):
        """View restricted to the validation split."""
        return self._view('val')

    @property
    def test(self):
        """View restricted to the test split."""
        return self._view('test')

    @property
    def all(self):
        """View over the whole DataFrame (items are raw rows)."""
        return self._view(None)



In [11]:
# Build the dataset (this tokenises every review and computes the
# train/val/test indices) and a shuffled DataLoader over the training split.
my_dataset = MyDataset(df)
batch_size=5
train_loader = DataLoader(my_dataset.train, batch_size=batch_size, shuffle=True)

In [12]:
import torch.nn as nn
# Width of the encoder output fed to the classification head.  This must equal
# the hidden size of distilbert-base-uncased (DistilBertConfig.dim == 768);
# any other value makes linear1 fail with a shape mismatch.
input_dim = 768
# Width of the hidden layer in the classification head.
hidden_layers = 758
# Number of classes (unique values of the target).
output_dim = 2


class Network(nn.Module):
  """DistilBERT encoder followed by a two-layer classification head.

  The head is applied to the hidden state of the first token of every
  sequence, so ``forward`` returns one row of ``output_dim`` logits per review.
  """

  def __init__(self):
    super(Network, self).__init__()
    self.model = DistilBertModel.from_pretrained("distilbert-base-uncased")
    self.linear1 = nn.Linear(input_dim, hidden_layers)
    self.linear2 = nn.Linear(hidden_layers, output_dim)
    self.dropout = nn.Dropout(0.1)
    self.activation = nn.ReLU()

  @property
  def device(self):
    """Device the module's parameters currently live on (follows ``.to()``)."""
    return next(self.parameters()).device

  @staticmethod
  def _as_2d(tensor):
    """Drop a singleton middle dimension: (batch, 1, seq) -> (batch, seq)."""
    return tensor.squeeze(1) if tensor.dim() == 3 else tensor

  def forward(self, batch):
    """Compute class logits for a batch.

    Args:
      batch: mapping with ``input_ids`` and ``attention_mask`` tensors of
        shape (batch, seq) or (batch, 1, seq).

    Returns:
      Tensor of shape (batch, output_dim) holding unnormalised logits.
    """
    target_device = self.device
    input_ids = self._as_2d(batch['input_ids']).to(target_device)
    attention_mask = self._as_2d(batch['attention_mask']).to(target_device)
    # Element 0 of the tuple output is the last hidden state, (batch, seq, dim).
    # Per-layer hidden states are not requested: they are unused here and
    # would only increase memory use.
    embeddings = self.model(input_ids, attention_mask=attention_mask, return_dict=False)[0]
    # Pool by taking the first ([CLS]) token so there is one vector per review.
    pooler = self.activation(self.linear1(embeddings[:, 0]))
    pooler = self.dropout(pooler)
    output = self.linear2(pooler)
    return output

In [13]:
# Instantiate the classifier together with its loss function and optimiser.
# NOTE(review): the training cell further down rebuilds all three objects, so
# this cell looks redundant — confirm before removing.
clf = Network()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)

In [14]:
# Peek at a single batch to inspect its fields.  Fetching just one batch avoids
# iterating over the whole loader only to keep the last one.
batch = next(iter(train_loader))
input_ids = batch['input_ids']  # Access input IDs from the batch
attention_mask = batch['attention_mask']  # Access attention masks from the batch
targetOne = batch['targetOne']  # Access targetOne from the batch
targetTwo = batch['targetTwo']  # Access targetTwo from the batch

In [15]:
# Fine-tune the classifier on targetOne.
clf = Network()
# Use the Apple-silicon GPU when PyTorch reports it, otherwise fall back to CPU.
device = torch.device("mps:0" if torch.backends.mps.is_available() else "cpu")
clf.to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=0.1 is unusually high for fine-tuning a pretrained
# transformer — confirm this is intentional.
optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)

epochs = 1
# from_pretrained() returns the encoder in eval mode; switch the whole network
# to training mode so dropout is active.
clf.train()
for epoch in range(epochs):
    running_loss = 0.0
    for i, batch in enumerate(train_loader, 0):
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # CrossEntropyLoss expects int64 class indices for (batch, classes)
        # logits; the dataset yields them as float32.
        labels = batch['targetOne'].to(device).long()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = clf({'input_ids': inputs, 'attention_mask': attention_mask})

        # Calculate the loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Display statistics: mean loss per batch for this epoch
    print(f'[{epoch + 1}, {i + 1}] loss: {running_loss / len(train_loader):.5f}')


: 

: 

In [None]:
#GPU too small 