In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader, random_split
import torch
# Seed PyTorch's global random number generator with a fixed value so that
# operations drawing from it behave the same way on every fresh run.
torch.manual_seed(0)
class CustomDataset(Dataset):
    """Review texts paired with two label-encoded targets.

    The CSV is expected to contain the columns 'review' (input text),
    'drug' (first target) and 'condition' (second target). Rows with a
    missing value in any column are discarded at load time.
    """

    def __init__(self, csv_file, label_encoders):
        """Load the CSV, drop incomplete rows and encode both targets.

        Args:
            csv_file: Location of the CSV, passed straight to ``pd.read_csv``.
            label_encoders: Mapping with the keys 'target1' and 'target2'.
                Each value must expose ``transform``; 'target1' is applied
                to the 'drug' column and 'target2' to the 'condition' column.
        """
        frame = pd.read_csv(csv_file).dropna()
        self.data = frame
        self.text, self.target1, self.target2 = (
            frame[column].tolist() for column in ('review', 'drug', 'condition')
        )
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Keep references to the supplied encoders on the instance.
        self.label_encoder1 = label_encoders['target1']
        self.label_encoder2 = label_encoders['target2']

        # Integer class ids, index-aligned with self.text.
        self.target1_encoded = self.label_encoder1.transform(self.target1)
        self.target2_encoded = self.label_encoder2.transform(self.target2)

    def __len__(self):
        """Return the number of rows that survived the missing-value filter."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` and bundle it with its targets.

        Returns:
            dict with 'input_ids' and 'attention_mask' (tokenizer output with
            the leading dimension squeezed away), the encoded 'target1' and
            'target2', and the raw labels under 'original_target1' and
            'original_target2'.
        """
        review = self.text[idx]
        encoded_target1 = self.target1_encoded[idx]
        encoded_target2 = self.target2_encoded[idx]

        # Every review is padded/truncated to exactly 512 tokens.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        sample = {
            key: encoding[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['target1'] = encoded_target1
        sample['target2'] = encoded_target2
        sample['original_target1'] = self.target1[idx]
        sample['original_target2'] = self.target2[idx]
        return sample

# Training data location (relative to the notebook's working directory).
csv_file = r'df_train_20.csv'  # Replace 'your_data.csv' with the path to your CSV file
# csv_file = r'D:\Downloads\fdf.csv'  # Replace 'your_data.csv' with the path to your CSV file

# One encoder per target: 'target1' handles 'drug', 'target2' handles 'condition'.
label_encoders = dict(target1=LabelEncoder(), target2=LabelEncoder())

# The encoders learn their classes from the raw CSV columns.
data = pd.read_csv(csv_file)
label_encoders['target1'].fit(data['drug'])
label_encoders['target2'].fit(data['condition'])

# The dataset re-reads the same CSV and applies the fitted encoders.
dataset = CustomDataset(csv_file, label_encoders)


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Hold out 20% of the samples for validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset into training and validation sets.
# A dedicated, explicitly seeded generator makes the split independent of how
# much of the global RNG stream earlier cells have consumed, so re-running
# this cell (or running cells out of order) always yields the same partition.
split_generator = torch.Generator().manual_seed(0)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Training batches are reshuffled every epoch; validation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [12]:
# Display the DataFrame that was read from csv_file to fit the label encoders.
data

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,drug
0,1,1,1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,Guanfacine
1,2,2,2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"levonorgestrel, ethinyl estradiol"
2,3,3,3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,norelgestromin
3,5,5,5,165907,Levonorgestrel,Birth Control,"""He pulled out, but he cummed a bit in me. I t...",1,7-Mar-17,5,Levonorgestrel
4,6,6,6,102654,Aripiprazole,Bipolar Disorder,"""Abilify changed my life. There is hope. I was...",10,14-Mar-15,32,Aripiprazole
...,...,...,...,...,...,...,...,...,...,...,...
84645,126067,131680,131680,228492,Geodon,Bipolar Disorder,"""I was in a very bad place at the time I start...",3,25-Jul-16,5,ziprasidone
84646,126069,131682,131682,93069,Vortioxetine,Depression,"""This is the third med I&#039;ve tried for anx...",2,17-Jul-16,33,Vortioxetine
84647,126070,131683,131683,132177,Ativan,Anxiety,"""I was super against taking medication. I&#039...",9,16-Aug-16,61,lorazepam
84648,126073,131686,131686,103458,Tekturna,High Blood Pressure,"""I have only been on Tekturna for 9 days. The ...",7,7-Feb-10,18,aliskiren


In [13]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """Pretrained BERT encoder feeding two independent linear heads.

    Both heads read the same dropout-regularised pooled output, so a single
    forward pass produces logits for both classification targets.
    """

    def __init__(self, num_classes1, num_classes2):
        """Build the encoder and one linear head per target.

        Args:
            num_classes1: Output width of the first head (``fc1``).
            num_classes2: Output width of the second head (``fc2``).
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc1 = nn.Linear(hidden_size, num_classes1)
        self.fc2 = nn.Linear(hidden_size, num_classes2)

    def forward(self, input_ids, attention_mask):
        """Return ``(logits1, logits2)`` for a batch of tokenized inputs."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        # The encoder's pooled output is the shared feature vector for both heads.
        features = self.dropout(encoded.pooler_output)
        return self.fc1(features), self.fc2(features)

# Size each classification head from the number of classes its fitted
# label encoder knows about.
num_classes1, num_classes2 = (
    len(label_encoders[key].classes_) for key in ('target1', 'target2')
)
model = BertClassifier(num_classes1, num_classes2)


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

# Adam over every parameter of the model (encoder and both heads).
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# The same cross-entropy criterion is applied to each head's logits.
criterion = nn.CrossEntropyLoss()


In [15]:
# NOTE(review): this cell repeats the optimizer/criterion setup of the
# preceding cell line for line; running it replaces `optimizer` with a fresh
# Adam instance and `criterion` with a fresh loss object.
# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define loss function
criterion = nn.CrossEntropyLoss()


In [16]:
def calculate_metrics(preds, targets):
    """Compute accuracy and macro-averaged F1 for one target.

    Args:
        preds: Tensor of predictions. A 2-D tensor is reduced with ``argmax``
            along dim 1; a 1-D tensor is used as-is.
        targets: Tensor of ground-truth values compared against the
            (possibly reduced) predictions.

    Returns:
        Tuple ``(accuracy, f1)``.

    Raises:
        ValueError: If ``preds`` is neither 1-D nor 2-D.
    """
    rank = preds.dim()
    if rank not in (1, 2):
        raise ValueError("Unsupported shape for preds tensor")

    # Only the 2-D form still needs to be collapsed to one value per row.
    labels = preds.argmax(dim=1) if rank == 2 else preds
    predicted = labels.cpu().numpy()
    expected = targets.cpu().numpy()

    accuracy = accuracy_score(expected, predicted)
    f1 = f1_score(expected, predicted, average='macro')
    return accuracy, f1

In [17]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    For every batch the two heads' losses are summed, back-propagated and
    followed by one optimizer step. Predictions and targets are accumulated
    so that metrics are computed once over the whole epoch.

    Args:
        model: Callable returning ``(logits1, logits2)`` when given
            ``(input_ids, attention_mask)``.
        dataloader: Sized iterable of batches holding the tensor entries
            'input_ids', 'attention_mask', 'target1' and 'target2'.
        optimizer: Optimizer whose gradients are reset before each batch.
        criterion: Loss callable applied to each ``(logits, target)`` pair.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy1, f1_1, accuracy2, f1_2)`` where
        ``mean_loss`` is the summed batch loss divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0.0
    preds1, labels1 = [], []
    preds2, labels2 = [], []

    num_batches = len(dataloader)
    progress_bar = tqdm(enumerate(dataloader), total=num_batches, desc='Training')
    for step, batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The loss is fed integer (long) class indices.
        target1 = batch['target1'].long().to(device)
        target2 = batch['target2'].long().to(device)

        optimizer.zero_grad()
        logits1, logits2 = model(input_ids, attention_mask)
        loss = criterion(logits1, target1) + criterion(logits2, target2)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Accumulate predicted and true class ids for the epoch-level metrics.
        preds1.extend(logits1.argmax(dim=1).cpu().tolist())
        labels1.extend(target1.cpu().tolist())
        preds2.extend(logits2.argmax(dim=1).cpu().tolist())
        labels2.extend(target2.cpu().tolist())

        # Show the running mean loss on the progress bar.
        progress_bar.set_postfix({'loss': running_loss / (step + 1)})

    accuracy1, f1_1 = calculate_metrics(torch.tensor(preds1), torch.tensor(labels1))
    accuracy2, f1_2 = calculate_metrics(torch.tensor(preds2), torch.tensor(labels2))

    return running_loss / num_batches, accuracy1, f1_1, accuracy2, f1_2


In [18]:
def evaluate(model, dataloader, criterion, device):
    """Compute loss and metrics for both targets without updating the model.

    The model is switched to eval mode and the whole pass runs under
    ``torch.no_grad()`` so no autograd graph is built for validation batches.

    Args:
        model: Callable returning ``(logits1, logits2)`` when given
            ``(input_ids, attention_mask)``.
        dataloader: Sized iterable of batches holding the tensor entries
            'input_ids', 'attention_mask', 'target1' and 'target2'.
        criterion: Loss callable applied to each ``(logits, target)`` pair.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy1, f1_1, accuracy2, f1_2)`` where
        ``mean_loss`` is the summed batch loss divided by ``len(dataloader)``.
    """
    model.eval()
    epoch_loss = 0.0
    all_preds1 = []
    all_targets1 = []
    all_preds2 = []
    all_targets2 = []

    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc='Validation')
    # Gradients are never used during evaluation; disabling them avoids
    # keeping activations alive for a backward pass that will not happen.
    with torch.no_grad():
        for batch_idx, batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target1 = batch['target1'].to(device=device, dtype=torch.long)
            target2 = batch['target2'].to(device=device, dtype=torch.long)

            logits1, logits2 = model(input_ids, attention_mask)
            loss1 = criterion(logits1, target1)
            loss2 = criterion(logits2, target2)
            loss = loss1 + loss2

            epoch_loss += loss.item()

            # Accumulate predictions/targets for target1
            all_preds1.extend(logits1.argmax(dim=1).cpu().numpy())
            all_targets1.extend(target1.cpu().numpy())

            # Accumulate predictions/targets for target2
            all_preds2.extend(logits2.argmax(dim=1).cpu().numpy())
            all_targets2.extend(target2.cpu().numpy())

            # Show the running mean loss on the progress bar.
            progress_bar.set_postfix({'loss': epoch_loss / (batch_idx + 1)})

    # Calculate metrics for target1
    accuracy1, f1_1 = calculate_metrics(torch.tensor(all_preds1), torch.tensor(all_targets1))

    # Calculate metrics for target2
    accuracy2, f1_2 = calculate_metrics(torch.tensor(all_preds2), torch.tensor(all_targets2))

    return epoch_loss / len(dataloader), accuracy1, f1_1, accuracy2, f1_2


In [19]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

num_epochs = 3  # Define the number of epochs
model = model.to(device)

# Each epoch: one training pass, one validation pass, then a metrics report.
for epoch in range(num_epochs):
    (
        train_loss,
        train_accuracy1,
        train_f1_1,
        train_accuracy2,
        train_f1_2,
    ) = train_epoch(model, train_dataloader, optimizer, criterion, device)
    (
        val_loss,
        val_accuracy1,
        val_f1_1,
        val_accuracy2,
        val_f1_2,
    ) = evaluate(model, val_dataloader, criterion, device)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy Target 1: {train_accuracy1:.4f}, Train F1 Score Target 1: {train_f1_1:.4f}, Train Accuracy Target 2: {train_accuracy2:.4f}, Train F1 Score Target 2: {train_f1_2:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy Target 1: {val_accuracy1:.4f}, Val F1 Score Target 1: {val_f1_1:.4f}, Val Accuracy Target 2: {val_accuracy2:.4f}, Val F1 Score Target 2: {val_f1_2:.4f}')


Training: 100%|██████████| 4233/4233 [26:31<00:00,  2.66it/s, loss=4.45]
Validation: 100%|██████████| 1059/1059 [02:36<00:00,  6.75it/s, loss=3.12]


Epoch 1/3:
Train Loss: 4.4544, Train Accuracy Target 1: 0.2129, Train F1 Score Target 1: 0.0183, Train Accuracy Target 2: 0.8098, Train F1 Score Target 2: 0.7215
Val Loss: 3.1222, Val Accuracy Target 1: 0.3271, Val F1 Score Target 1: 0.0392, Val Accuracy Target 2: 0.8825, Val F1 Score Target 2: 0.8459


Training: 100%|██████████| 4233/4233 [26:28<00:00,  2.67it/s, loss=2.73]
Validation: 100%|██████████| 1059/1059 [02:37<00:00,  6.74it/s, loss=2.5]


Epoch 2/3:
Train Loss: 2.7323, Train Accuracy Target 1: 0.3546, Train F1 Score Target 1: 0.0529, Train Accuracy Target 2: 0.9017, Train F1 Score Target 2: 0.8707
Val Loss: 2.4960, Val Accuracy Target 1: 0.3813, Val F1 Score Target 1: 0.0779, Val Accuracy Target 2: 0.8995, Val F1 Score Target 2: 0.8708


Training: 100%|██████████| 4233/4233 [26:31<00:00,  2.66it/s, loss=2.19]
Validation: 100%|██████████| 1059/1059 [02:36<00:00,  6.75it/s, loss=2.23]


Epoch 3/3:
Train Loss: 2.1899, Train Accuracy Target 1: 0.4100, Train F1 Score Target 1: 0.0864, Train Accuracy Target 2: 0.9279, Train F1 Score Target 2: 0.9087
Val Loss: 2.2336, Val Accuracy Target 1: 0.4069, Val F1 Score Target 1: 0.1109, Val Accuracy Target 2: 0.9113, Val F1 Score Target 2: 0.8882


In [20]:
# Persist only the model's state_dict (its parameters and buffers) to
# model_path; restoring it requires constructing the model first and then
# calling load_state_dict, as sketched in the commented lines below.
model_path = "model.pth"
torch.save(model.state_dict(), model_path)

print("Model saved successfully!")
# model_path = "m1odel.pth"

# Load the saved model state dictionary
# model.load_state_dict(torch.load(model_path))

Model saved successfully!


In [10]:
# NOTE(review): this cell repeats the training loop of the earlier cell with
# num_epochs = 1. Its execution count (10) is lower than that of the cells
# above it, so its recorded output comes from a different run order than the
# notebook's top-to-bottom order; confirm whether it is still needed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 1  # Define the number of epochs
model = model.to(device)
for epoch in range(num_epochs):
    # One training pass followed by one validation pass per epoch.
    train_loss, train_accuracy1, train_f1_1, train_accuracy2, train_f1_2 = train_epoch(model, train_dataloader, optimizer, criterion, device)
    val_loss, val_accuracy1, val_f1_1, val_accuracy2, val_f1_2 = evaluate(model, val_dataloader, criterion, device)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy Target 1: {train_accuracy1:.4f}, Train F1 Score Target 1: {train_f1_1:.4f}, Train Accuracy Target 2: {train_accuracy2:.4f}, Train F1 Score Target 2: {train_f1_2:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy Target 1: {val_accuracy1:.4f}, Val F1 Score Target 1: {val_f1_1:.4f}, Val Accuracy Target 2: {val_accuracy2:.4f}, Val F1 Score Target 2: {val_f1_2:.4f}')


Training: 100%|██████████| 50431/50431 [3:10:55<00:00,  4.40it/s, loss=3.47]  
Validation: 100%|██████████| 12608/12608 [29:57<00:00,  7.02it/s, loss=3.67]

Epoch 1/1:
Train Loss: 3.4695, Train Accuracy Target 1: 0.4033, Train F1 Score Target 1: 0.0700, Train Accuracy Target 2: 0.7766, Train F1 Score Target 2: 0.1296
Val Loss: 3.6711, Val Accuracy Target 1: 0.3924, Val F1 Score Target 1: 0.0965, Val Accuracy Target 2: 0.7559, Val F1 Score Target 2: 0.1843





In [11]:
# model_path = "model1.pth"
# torch.save(model.state_dict(), model_path)

# # print("Model saved successfully!")
# model_path = "m1odel1.pth"

# # Load the saved model state dictionary
# model.load_state_dict(torch.load(model_path))

In [None]:
from torch.utils.data import Dataset, DataLoader

import torch

# Define the data class
# NOTE(review): no `TestDataset` class is defined anywhere in the visible
# notebook, `test_data` is only assigned in a later cell, and `inference` is
# also defined further down. As written this cell cannot run in top-to-bottom
# order; the later cell that builds `test_dataset` from `CustomDataset`
# appears to supersede it. Confirm before relying on this cell.

# Assuming you have a list of dictionaries `test_data` containing input data and target labels
# Instantiate the test dataset
test_dataset = TestDataset(test_data)

# Create a data loader for the test dataset
batch_size = 4  # Adjust batch size as needed
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Now you can use `test_dataloader` for inference
# For example:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
preds1, preds2, targets1, targets2 = inference(model, test_dataloader, label_encoders, device)

# Analysis of predictions: print each decoded prediction next to its decoded target.
for pred1, pred2, target1, target2 in zip(preds1, preds2, targets1, targets2):
    print(f'Predicted target1: {pred1}, Original target1: {target1}')
    print(f'Predicted target2: {pred2}, Original target2: {target2}')
    print()  # Add a newline for readability


In [None]:
# NOTE(review): a class with this same name is already defined near the top
# of the notebook; running this cell rebinds the name to this definition.
class CustomDataset(Dataset):
    """Review texts paired with two label-encoded targets.

    The CSV is expected to contain the columns 'review' (input text),
    'drug' (first target) and 'condition' (second target). Rows with a
    missing value in any column are discarded at load time.
    """

    def __init__(self, csv_file, label_encoders):
        """Load the CSV, drop incomplete rows and encode both targets.

        Args:
            csv_file: Location of the CSV, passed straight to ``pd.read_csv``.
            label_encoders: Mapping with the keys 'target1' and 'target2'.
                Each value must expose ``transform``; 'target1' is applied
                to the 'drug' column and 'target2' to the 'condition' column.
        """
        frame = pd.read_csv(csv_file).dropna()
        self.data = frame
        self.text, self.target1, self.target2 = (
            frame[column].tolist() for column in ('review', 'drug', 'condition')
        )
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Keep references to the supplied encoders on the instance.
        self.label_encoder1 = label_encoders['target1']
        self.label_encoder2 = label_encoders['target2']

        # Integer class ids, index-aligned with self.text.
        self.target1_encoded = self.label_encoder1.transform(self.target1)
        self.target2_encoded = self.label_encoder2.transform(self.target2)

    def __len__(self):
        """Return the number of rows that survived the missing-value filter."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` and bundle it with its targets.

        Returns:
            dict with 'input_ids' and 'attention_mask' (tokenizer output with
            the leading dimension squeezed away), the encoded 'target1' and
            'target2', and the raw labels under 'original_target1' and
            'original_target2'.
        """
        review = self.text[idx]
        encoded_target1 = self.target1_encoded[idx]
        encoded_target2 = self.target2_encoded[idx]

        # Every review is padded/truncated to exactly 512 tokens.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        sample = {
            key: encoding[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['target1'] = encoded_target1
        sample['target2'] = encoded_target2
        sample['original_target1'] = self.target1[idx]
        sample['original_target2'] = self.target2[idx]
        return sample

# NOTE(review): absolute, machine-specific Windows path; this cell otherwise
# mirrors the setup cell near the top of the notebook and, when run, rebinds
# csv_file, label_encoders, data and dataset.
csv_file = r'D:\Downloads\fdf.csv'  # Replace 'your_data.csv' with the path to your CSV file

# One encoder per target: 'target1' handles 'drug', 'target2' handles 'condition'.
label_encoders = dict(target1=LabelEncoder(), target2=LabelEncoder())

# The encoders learn their classes from the raw CSV columns.
data = pd.read_csv(csv_file)
label_encoders['target1'].fit(data['drug'])
label_encoders['target2'].fit(data['condition'])

# The dataset re-reads the same CSV and applies the fitted encoders.
dataset = CustomDataset(csv_file, label_encoders)


In [9]:
from tqdm import tqdm

def inference(model, dataloader, label_encoders, device):
    """Predict both targets for every batch and decode ids back to labels.

    The model is moved to ``device``, put in eval mode and run under
    ``torch.no_grad()`` so no autograd graph is built during prediction.

    Args:
        model: Module returning ``(logits1, logits2)`` when given
            ``(input_ids, attention_mask)``.
        dataloader: Iterable of batches holding the tensor entries
            'input_ids', 'attention_mask', 'target1' and 'target2'.
        label_encoders: Mapping with the keys 'target1' and 'target2' whose
            values expose ``inverse_transform``.
        device: Device the model and its inputs are moved to.

    Returns:
        Tuple ``(preds1, preds2, targets1, targets2)``: the outputs of
        ``inverse_transform`` for the predicted and the true class ids of
        each target, in dataloader order.
    """
    model = model.to(device)
    model.eval()
    all_preds1 = []
    all_preds2 = []
    all_targets1 = []
    all_targets2 = []

    # Gradients are never needed for prediction; disabling them avoids
    # retaining activations for a backward pass that will not happen.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits1, logits2 = model(input_ids, attention_mask)

            all_preds1.extend(logits1.argmax(dim=1).cpu().numpy())
            all_preds2.extend(logits2.argmax(dim=1).cpu().numpy())
            # Targets are only collected, never fed to the model, so they are
            # read directly instead of being copied to the device and back.
            all_targets1.extend(batch['target1'].cpu().numpy())
            all_targets2.extend(batch['target2'].cpu().numpy())

    # Reverse label encoding
    label_encoder1 = label_encoders['target1']
    label_encoder2 = label_encoders['target2']
    original_preds1 = label_encoder1.inverse_transform(all_preds1)
    original_preds2 = label_encoder2.inverse_transform(all_preds2)
    original_targets1 = label_encoder1.inverse_transform(all_targets1)
    original_targets2 = label_encoder2.inverse_transform(all_targets2)

    return original_preds1, original_preds2, original_targets1, original_targets2

# CustomDataset()
# # Perform inference
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# preds1, preds2, targets1, targets2 = inference(model, val_dataloader, label_encoders, device)

# # Analysis of predictions
# for pred1, pred2, target1, target2 in zip(preds1, preds2, targets1, targets2):
#     print(f'Predicted target1: {pred1}, Original target1: {target1}')
#     print(f'Predicted target2: {pred2}, Original target2: {target2}')
#     print()  # Add a newline for readability


{'0</span> users found this comment helpful.': 0, '10</span> users found this comment helpful.': 1, '110</span> users found this comment helpful.': 2, '11</span> users found this comment helpful.': 3, '121</span> users found this comment helpful.': 4, '123</span> users found this comment helpful.': 5, '12</span> users found this comment helpful.': 6, '13</span> users found this comment helpful.': 7, '142</span> users found this comment helpful.': 8, '145</span> users found this comment helpful.': 9, '146</span> users found this comment helpful.': 10, '14</span> users found this comment helpful.': 11, '15</span> users found this comment helpful.': 12, '16</span> users found this comment helpful.': 13, '17</span> users found this comment helpful.': 14, '18</span> users found this comment helpful.': 15, '19</span> users found this comment helpful.': 16, '1</span> users found this comment helpful.': 17, '20</span> users found this comment helpful.': 18, '21</span> users found this comment 

In [10]:
# Test split; it is read here into test_data and read again by CustomDataset.
test_csv_file = r"df_test_20.csv"
test_data = pd.read_csv(test_csv_file)

# The test dataset reuses the label encoders already held in
# label_encoders['target1'] and label_encoders['target2'].
test_dataset = CustomDataset(test_csv_file, label_encoders)

# Fixed-order loader over the test samples.
batch_size = 4  # Adjust batch size as needed
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
preds1, preds2, targets1, targets2 = inference(model, test_dataloader, label_encoders, device)

# Print every decoded prediction next to its decoded ground-truth label.
for pred1, pred2, target1, target2 in zip(preds1, preds2, targets1, targets2):
    print(f'Predicted target1: {pred1}, Original target1: {target1}')
    print(f'Predicted target2: {pred2}, Original target2: {target2}')
    print()  # Add a newline for readability


 62%|██████▏   | 3614/5784 [18:03<12:11,  2.96it/s]