<a href="https://colab.research.google.com/github/vrtejus/AgentChain/blob/main/Nooks_ML_Takehome_Solved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install transformers
!pip3 install torchsampler
!pip3 install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
# Mount Google Drive into the Colab VM so files stored there are readable from the notebook.
from google.colab import drive
drive.mount('/content/drive')
# Root folder on Drive holding the take-home data; later cells read
# call_data.csv and the transcripts/ directory beneath it.
BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/takehome_data"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

import nlpaug.augmenter.word as naw
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from torchsampler import ImbalancedDatasetSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

In [None]:
# Load the pretrained tokenizer for the 'roberta-base' checkpoint.
# This is the same checkpoint name the classification model is loaded from,
# so the token ids produced here match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
# Load the call-level table from BASE_PATH. Later cells rely on two of its columns:
#   'SID'                  - call id, used to locate the transcript file for each call
#   'Had Timing Objection' - the label the classifier is trained to predict
# NOTE(review): the displayed output shows an 'Unnamed: 0' column, which looks like
# a saved index column — confirm whether `index_col=0` is wanted here.
df = pd.read_csv(f"{BASE_PATH}/call_data.csv")

# Display the first few rows of the dataframe
df.head()


Unnamed: 0,SID,Had Timing Objection,Timing Objection Index
0,CAb91afa254ef65b0d75c24e16edd9c4b7,False,
1,CA3b8e6bbc08808390ddc4c8c36fb8ba5f,False,
2,CA43cdd0c490f4e49474dda60800caba77,True,43.0
3,CA535b77a335e9d2cd49c0158215b96133,False,
4,CAfd5dd1a208651485bc5bc0b85f0555da,False,


In [None]:
# Folder containing one "<SID>.txt" transcript file per call
transcript_folder = f"{BASE_PATH}/transcripts"


def _read_transcript(sid):
    """Return the transcript text for call `sid`, or None when its file is missing.

    A missing file is reported on stdout rather than raised, so one absent
    transcript does not abort loading; rows left as None are dropped later
    before the train/test split.
    """
    transcript_path = os.path.join(transcript_folder, f"{sid}.txt")
    if not os.path.exists(transcript_path):
        print(f"Transcript not found for SID: {sid}")
        return None
    # Explicit encoding so decoding does not depend on the platform's locale default.
    with open(transcript_path, 'r', encoding='utf-8') as f:
        return f.read()


# Attach the transcript text to every call (None where no file exists)
df['Transcript'] = df['SID'].map(_read_transcript)

# Check the first few rows to ensure transcripts are loaded
df.head()


                                  SID  Had Timing Objection  \
0  CAb91afa254ef65b0d75c24e16edd9c4b7                 False   
1  CA3b8e6bbc08808390ddc4c8c36fb8ba5f                 False   
2  CA43cdd0c490f4e49474dda60800caba77                  True   
3  CA535b77a335e9d2cd49c0158215b96133                 False   
4  CAfd5dd1a208651485bc5bc0b85f0555da                 False   

  Timing Objection Index                                         Transcript  
0                   None  1. [Sales Rep] Hello?\n2. [Prospect] Hey, Jane...  
1                   None  1. [Sales Rep] Hello?\n2. [Prospect] Hey, Eric...  
2                     43  1. [Sales Rep] Hi. It's Alan.\n2. [Prospect] A...  
3                   None  1. [Prospect] Hello?\n2. [Sales Rep] Yeah.\n3....  
4                   None  1. [Sales Rep] Hello?\n2. [Prospect] Peter, th...  


In [None]:
def _encode_transcript(text):
    """Tokenize one transcript into fixed-length PyTorch tensors, or return None if it is missing.

    Every encoding is truncated/padded to 512 tokens so the per-row tensors can
    later be concatenated into a single batch tensor.
    """
    if pd.isnull(text):
        return None
    return tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')


# Tokenize every transcript; rows without a transcript keep None
df['tokenized_text'] = df['Transcript'].apply(_encode_transcript)


In [None]:
# Load the model: pretrained 'roberta-base' with a sequence-classification head.
# num_labels=1 gives a single output logit per transcript; the training cells feed
# that logit to BCEWithLogitsLoss and the evaluation cell applies a sigmoid to it.
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

# Verify if a GPU is available and if so, use it
# (`device` is reused by later cells to move batches and loss weights to the same device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data Splitting

In [None]:
# Drop rows where transcripts are missing
df = df.dropna(subset=['Transcript'])

# Hold out 20% of the calls for testing. The label is imbalanced, so the split is
# stratified on it: train and test then keep the same positive/negative ratio
# instead of depending on how a purely random draw happens to fall.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Had Timing Objection'],
)


In [None]:
# Preview the training split (bare expression so the notebook renders it as a table
# instead of the wrapped plain-text output that print() produces)
train_df.head()

                                    SID  Had Timing Objection  \
79   CA714dba08054505f4063d5378f173ffea                 False   
197  CAe2e02e272554e568be0ce0fbd9e8a7cd                 False   
38   CA56b95700fb1660ae63b5f58b26efed9d                 False   
24   CA8fa65fe9b916f14b0985ed52afd96175                 False   
122  CAf51ce847e20f4bbac4dfd2468929b704                 False   

    Timing Objection Index                                         Transcript  \
79                    None  1. [Sales Rep] Hello?\n2. [Prospect] Hey. Is t...   
197                      1  1. [Sales Rep] Please leave your message for e...   
38                    None  1. [Prospect] Hi. Is this Patrick? Hey. This i...   
24                    None  1. [Sales Rep] This is Jennifer. Okay. Okay.\n...   
122                      1                     1. [Sales Rep] Hello? Hello?\n   

                  tokenized_text  
79   [input_ids, attention_mask]  
197  [input_ids, attention_mask]  
38   [input_ids, 

In [None]:
# Balance the training set by adding minority-class transcripts until both classes
# have the same number of rows. New rows are synonym-augmented copies first and,
# if that is not enough, plain duplicates sampled with replacement.

# Identify the Minority Class
minority_class = train_df['Had Timing Objection'].value_counts().idxmin()
minority_samples = train_df[train_df['Had Timing Objection'] == minority_class]

# Calculate the Oversampling Amount
minority_count = minority_samples.shape[0]
majority_count = train_df[train_df['Had Timing Objection'] != minority_class].shape[0]

# Never negative: if the classes are already balanced there is nothing to add.
samples_to_generate = max(majority_count - minority_count, 0)

# Use Text Data Augmentation (WordNet synonym replacement).
# At most `samples_to_generate` transcripts are augmented so the number of texts can
# never exceed the number of labels built below (the two lists must be equal length).
text_augmenter = naw.SynonymAug(aug_src='wordnet')
oversampled_texts = []
for text in minority_samples['Transcript'].values[:samples_to_generate]:
    augmented_text = text_augmenter.augment(text)
    if isinstance(augmented_text, str):
        oversampled_texts.append(augmented_text)
    elif isinstance(augmented_text, list):
        # The augmenter may return a list of strings; join them into one text.
        oversampled_texts.append(' '.join(augmented_text))
    else:
        raise TypeError("Unexpected output type from text augmenter")

# Make sure we have generated enough samples: fill any remaining gap with
# unmodified minority transcripts drawn with replacement (seeded for reproducibility).
deficit = samples_to_generate - len(oversampled_texts)
if deficit > 0:
    additional_samples = minority_samples.sample(deficit, replace=True, random_state=42)['Transcript'].tolist()
    oversampled_texts.extend(additional_samples)

assert len(oversampled_texts) == samples_to_generate

oversampled_labels = [minority_class] * samples_to_generate
oversampled_df = pd.DataFrame({'Transcript': oversampled_texts, 'Had Timing Objection': oversampled_labels})

# Only 'Transcript' and 'Had Timing Objection' are populated for the added rows;
# every other column of train_df is NaN for them after the concat.
train_df_oversampled = pd.concat([train_df, oversampled_df]).reset_index(drop=True)


Training with oversampled dataset

In [None]:
# Fine-tune the model on the oversampled (class-balanced) training set.

# Tokenize the text data
tokenized_texts = tokenizer(list(train_df_oversampled['Transcript']), truncation=True, padding="max_length", max_length=512, return_tensors="pt")
train_input_ids = tokenized_texts['input_ids']
train_attention_mask = tokenized_texts['attention_mask']

# Labels as a float column vector so they match the shape of the model's single logit
train_labels = torch.tensor(train_df_oversampled['Had Timing Objection'].values).unsqueeze(1).float()

# Check that all tensors have the same number of rows
assert train_input_ids.shape[0] == train_attention_mask.shape[0] == train_labels.shape[0]

# Create a TensorDataset
train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)

# Create a DataLoader without the sampler using the oversampled dataset
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)

# Everything the loop needs is defined here, before first use, so the cell runs on a
# fresh kernel instead of depending on names created by cells further down.
num_epochs = 3
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Plain (unweighted) BCE: the oversampled data is already class-balanced.
criterion = nn.BCEWithLogitsLoss()

# Linear decay of the learning rate over every optimizer step of this run
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)

# Training loop using the oversampled dataset
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for step, batch in enumerate(progress_bar, start=1):
        # Move data to GPU if available
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Forward pass. Labels are not passed to the model: the loss comes from
        # `criterion` below, so a model-internal loss would be computed and discarded.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute loss
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Step the learning rate scheduler
        scheduler.step()

        # Running mean loss over the batches seen so far in this epoch
        # (explicit step count; tqdm's `.n` attribute is only refreshed periodically)
        progress_bar.set_postfix({'loss': total_loss / step})

    # Print loss for the epoch
    print(f'Epoch {epoch + 1}/{num_epochs} | Loss: {total_loss / len(train_loader)}')



Epoch 1/3 | Loss: 0.6300374585570712




Epoch 2/3 | Loss: 0.6020110899751837


                                                                      

Epoch 3/3 | Loss: 0.5650223285862894




Training with class weights

In [None]:
# Fine-tune on the original (imbalanced) training split, compensating for the imbalance
# with a class-weighted sampler and a positive-class weight in the loss.
# NOTE(review): `model` is not re-initialised here, so this run continues from the
# weights left by any earlier training cell — confirm that is intended when
# comparing the strategies.

# train_df['tokenized_text'] holds one tokenizer output per row, with keys
# 'input_ids' and 'attention_mask'. The values are already PyTorch tensors
# (tokenized with return_tensors='pt'), so they are concatenated directly;
# re-wrapping them in torch.tensor() only triggers a copy-construct UserWarning.
train_input_ids = torch.cat(train_df['tokenized_text'].apply(lambda x: x['input_ids']).tolist(), dim=0)
train_attention_mask = torch.cat(train_df['tokenized_text'].apply(lambda x: x['attention_mask']).tolist(), dim=0)

# Integer labels (0/1) for the weight computation, float column vector for the loss
y_labels = np.asarray(train_df['Had Timing Objection'].values, dtype=np.int64)
train_labels = torch.tensor(y_labels, dtype=torch.float).unsqueeze(1)

assert train_input_ids.shape[0] == train_attention_mask.shape[0] == train_labels.shape[0]

train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)

# Calculate class weights ('balanced' => inversely proportional to class frequency).
# `classes` is passed as an ndarray, which is the type scikit-learn documents for it.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# Calculate sample weights: each row gets the weight of its class.
# Kept on the CPU because the sampler only uses them to draw dataset indices.
sample_weights = torch.tensor(class_weights[y_labels], dtype=torch.double)

# Create a WeightedRandomSampler
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Define the optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# pos_weight must be (#negatives / #positives). With balanced class weights
# w_c = n / (2 * n_c), that ratio equals w_1 / w_0 — not w_0 / w_1, which would
# down-weight the rare positive class instead of up-weighting it.
# NOTE(review): the sampler above already rebalances the batches, so combining it with
# pos_weight emphasises the positive class twice — confirm both are wanted.
pos_weight = class_weights[1] / class_weights[0]
pos_weight_tensor = torch.tensor([pos_weight], dtype=torch.float).to(device)

# Define a BCE loss with the calculated class weights
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Create a DataLoader with the sampler
train_loader = DataLoader(train_data, batch_size=8, sampler=sampler)

# Define the learning rate scheduler only after this cell's loader and epoch count
# exist, so the schedule length matches this run rather than a previous cell's loader.
num_epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for step, batch in enumerate(progress_bar, start=1):
        # Move data to GPU if available
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Forward pass. Labels are not passed to the model: the loss comes from
        # `criterion` below, so a model-internal loss would be computed and discarded.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute loss using class weights
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Step the learning rate scheduler
        scheduler.step()

        # Running mean loss over the batches seen so far in this epoch
        # (explicit step count; tqdm's `.n` attribute is only refreshed periodically)
        progress_bar.set_postfix({'loss': total_loss / step})

    # Print loss for the epoch
    print(f'Epoch {epoch + 1}/{num_epochs} | Loss: {total_loss / len(train_loader)}')

  train_input_ids = torch.cat(train_df['tokenized_text'].apply(lambda x: torch.tensor(x['input_ids'])).tolist(), dim=0)
  train_attention_mask = torch.cat(train_df['tokenized_text'].apply(lambda x: torch.tensor(x['attention_mask'])).tolist(), dim=0)


Epoch 1/3 | Loss: 0.3234663508832455




Epoch 2/3 | Loss: 0.28307280465960505


                                                                      

Epoch 3/3 | Loss: 0.25752780586481094




The training loss decreases steadily from epoch 1 to epoch 3 (0.323 → 0.283 → 0.258), which is a good sign. Small epoch-to-epoch fluctuations can still appear between runs because of the stochastic nature of gradient descent, especially since our dataset is not too large.

Training with oversampling and class weights

In [None]:
# Fine-tune on the oversampled training set while also applying class-weighted
# sampling and a positive-class weight in the loss.
# NOTE(review): `model` is not re-initialised here, so this run continues from the
# weights left by any earlier training cell — confirm that is intended when
# comparing the strategies.

# Tokenize the text data
tokenized_texts = tokenizer(list(train_df_oversampled['Transcript']), truncation=True, padding="max_length", max_length=512, return_tensors="pt")
train_input_ids = tokenized_texts['input_ids']
train_attention_mask = tokenized_texts['attention_mask']

# Ensure that class labels are integers (used to index the class weights below)
class_labels = train_df_oversampled['Had Timing Objection'].values.astype(int)

# Labels as a float column vector so they match the shape of the model's single logit
train_labels = torch.tensor(class_labels, dtype=torch.float).unsqueeze(1)

# Check that all tensors have the same number of rows
assert train_input_ids.shape[0] == train_attention_mask.shape[0] == train_labels.shape[0]

# Create a TensorDataset
train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)

# Calculate class weights ('balanced' => inversely proportional to class frequency).
# `classes` is passed as an ndarray, which is the type scikit-learn documents for it.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=class_labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# Calculate sample weights: each row gets the weight of its class.
# Kept on the CPU because the sampler only uses them to draw dataset indices.
sample_weights = torch.tensor(class_weights[class_labels], dtype=torch.double)

# Create a WeightedRandomSampler
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Define the optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# pos_weight must be (#negatives / #positives). With balanced class weights
# w_c = n / (2 * n_c), that ratio equals w_1 / w_0 — not w_0 / w_1.
# On an exactly balanced oversampled set both class weights are 1.0, so this is 1.0.
pos_weight = class_weights[1] / class_weights[0]
pos_weight_tensor = torch.tensor([pos_weight], dtype=torch.float).to(device)

# Define a BCE loss with the calculated class weights
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Create a DataLoader with the sampler
train_loader = DataLoader(train_data, batch_size=8, sampler=sampler)

# Define the learning rate scheduler; num_epochs is set first so the schedule length
# does not depend on a value left over from a previous cell.
num_epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for step, batch in enumerate(progress_bar, start=1):
        # Move data to GPU if available
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Forward pass. Labels are not passed to the model: the loss comes from
        # `criterion` below, so a model-internal loss would be computed and discarded.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute loss using class weights
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Step the learning rate scheduler
        scheduler.step()

        # Running mean loss over the batches seen so far in this epoch
        # (explicit step count; tqdm's `.n` attribute is only refreshed periodically)
        progress_bar.set_postfix({'loss': total_loss / step})

    # Print loss for the epoch
    print(f'Epoch {epoch + 1}/{num_epochs} | Loss: {total_loss / len(train_loader)}')



Epoch 1/3 | Loss: 0.5724569577159304




Epoch 2/3 | Loss: 0.5503697603037863


                                                                      

Epoch 3/3 | Loss: 0.5454881100943594




In [None]:
# Evaluate the current `model` on the held-out test split and print per-class metrics.

# test_df['tokenized_text'] holds one tokenizer output per row; stack them into batch tensors
test_encodings = test_df['tokenized_text'].tolist()
test_input_ids = torch.cat([enc['input_ids'] for enc in test_encodings], dim=0)
test_attention_mask = torch.cat([enc['attention_mask'] for enc in test_encodings], dim=0)
test_labels = torch.tensor(test_df['Had Timing Objection'].values)

# Sequential (unshuffled) loader over the test set
test_data = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_data, batch_size=8)

# Inference only: eval mode and no gradient tracking
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        logits = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        ).logits

        # Sigmoid turns the single logit into a probability; rounding gives the 0/1 prediction
        batch_preds = torch.sigmoid(logits).round().cpu().numpy()

        # Flatten the (batch, 1) predictions so each entry is a single scalar
        all_preds.extend(batch_preds.ravel())
        # Boolean labels become 0/1 integers
        all_labels.extend(labels.numpy().astype(int))

# Classification report
print(classification_report(all_labels, all_preds, target_names=['Not Had Timing Objection', 'Had Timing Objection']))


                          precision    recall  f1-score   support

Not Had Timing Objection       1.00      0.44      0.61        32
    Had Timing Objection       0.31      1.00      0.47         8

                accuracy                           0.55        40
               macro avg       0.65      0.72      0.54        40
            weighted avg       0.86      0.55      0.58        40

