# DistilBERT Model

In [None]:
#pip install transformers

In [None]:
#pip install nltk

In [None]:
#pip install torch

In [None]:
#pip install datasets

In [None]:
import logging
logging.getLogger("transformers").setLevel(logging.WARNING)
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, EarlyStoppingCallback, TrainerCallback, DistilBertConfig
import nltk
from nltk.corpus import wordnet
from transformers import get_scheduler, set_seed
from torch.optim import AdamW
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import random

In [None]:
# Fetch the WordNet corpus and the Open Multilingual WordNet data via NLTK.
# NOTE(review): no code visible in this notebook uses `wordnet`; confirm these
# downloads are still needed.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
#!pip install accelerate -U

In [None]:
#pip install torch torchvision torchaudio

In [None]:
# Base checkpoint used for fine-tuning.
model_name = "distilbert-base-uncased"
# Tokenizer matching the checkpoint; the training cell uses it to encode the texts.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# Sequence classifier with a two-class head. The training cell loads it again
# after calling set_seed(44).
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Seed the random number generators before any stochastic step below
set_seed(44)

# ETHOS, binary configuration
dataset = load_dataset("ethos", "binary")

# Hold out 10% of the "train" split for validation
train_test_dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset, val_dataset = (
    train_test_dataset['train'],
    train_test_dataset['test'],
)

# Batch tokenizer used with Dataset.map (no text augmentation is applied)
def tokenize_function(examples):
    """Encode a batch of examples and attach their labels.

    Args:
        examples: mapping with a 'text' entry (the texts to encode) and a
            'label' entry (the matching labels).

    Returns:
        The tokenizer output for ``examples['text']``, padded/truncated to
        128 tokens, with ``examples['label']`` stored under 'labels'.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=128)
    encoded = tokenizer(examples['text'], **tokenizer_options)
    encoded['labels'] = examples['label']
    return encoded

# Encode both splits in batches; tokenize_function also copies the labels over
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Load the tokenizer and a fresh classifier again, now that set_seed has run in this cell
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [None]:
# Training arguments
# Evaluation runs every 50 steps. Checkpoints are saved on the same schedule so
# that every evaluated state can be picked by `load_best_model_at_end`; with the
# library default (save_steps=500) only every tenth evaluation would have a
# checkpoint behind it.
training_args = TrainingArguments(
    output_dir='./results_ethosR1',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,  # keep equal to eval_steps (must be a round multiple of it)
    load_best_model_at_end=True,
    metric_for_best_model='f1',  # Focus on F1 score
    save_total_limit=1,
)

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)
    # Take the class count from the logits width instead of a notebook-level
    # `num_labels`, so the function does not depend on cell execution order.
    averaging = 'binary' if logits.shape[-1] == 2 else 'macro'
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average=averaging
    )
    return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}

# Initialize the optimizer from the hyper-parameters declared in training_args,
# so the values are not duplicated (and cannot drift) between the two places.
optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Calculate the total number of optimizer steps.
# The last, partially filled batch of each epoch still produces a step, so the
# per-epoch count is rounded up; flooring it makes the scheduler horizon shorter
# than the real run and the learning rate reaches 0 before training ends.
batches_per_epoch = -(-len(train_dataset) // training_args.train_batch_size)  # ceil division
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = int(updates_per_epoch * training_args.num_train_epochs)

# Define warmup steps as a ratio of training steps (10% of training steps)
warmup_steps = int(num_training_steps * 0.1)

# Create the scheduler
scheduler = get_scheduler(
    name="cosine_with_restarts",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)

# Stop when the monitored metric has not improved for 3 consecutive evaluations
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Number of target classes for this binary task
num_labels = 2

# Initialize Trainer with the custom optimizer and scheduler
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
    optimizers=(optimizer, scheduler),
)

trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory; the
# prediction cells further down reload both from this path.
model_path = "./model_save_runEthosR1"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Persist the tokenized validation split to disk.
tokenized_val_dataset.save_to_disk("./tokenized_validation_dataset_EthosR1")

# Load Test Data

In [None]:
import pandas as pd

In [None]:
# Path of the Feather file holding the texts to classify (relative to the working directory)
file_path = 'df_2020C_06_27_to_28_testdata.feather'

# Read the Feather file into a DataFrame
df_2020C_06_27_to_28_testdata = pd.read_feather(file_path)

In [None]:
import numpy as np
# Create the prediction column filled with NaN; the chunk cells below fill it in by index.
df_2020C_06_27_to_28_testdata['predicted_label'] = np.nan

In [None]:
# Number of contiguous parts the test DataFrame is split into
N_PARTS = 40
part_size = len(df_2020C_06_27_to_28_testdata) // N_PARTS

# Create a list to hold all the split DataFrames
dfs = []

# Split the DataFrame into N_PARTS consecutive row ranges
for i in range(N_PARTS):
    start_index = i * part_size
    # The last part runs to the end so it also picks up the remainder rows
    end_index = None if i == N_PARTS - 1 else start_index + part_size
    split_df = df_2020C_06_27_to_28_testdata.iloc[start_index:end_index]
    dfs.append(split_df)

# Every row must land in exactly one part
assert sum(len(part) for part in dfs) == len(df_2020C_06_27_to_28_testdata)

# Each part is available as dfs[0], dfs[1], ..., dfs[N_PARTS - 1]
# For example, to print the first split DataFrame:
#print(dfs[0])

In [None]:
# Select the part at position 6 of `dfs` for this prediction run.
df_627_g = dfs[6]

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Prefer the GPU when CUDA is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Reload the fine-tuned tokenizer and classifier from the directory written by the save cell
tokenizer = AutoTokenizer.from_pretrained(
    "./model_save_runEthosR1"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "./model_save_runEthosR1"
)
model.to(device)
model.eval()  # evaluation mode for inference

# Tokenizer wrapper for the prediction data (texts live under the 'body' key)
def tokenize_function(examples):
    """Encode ``examples['body']``, padded/truncated to 128 tokens.

    Args:
        examples: mapping whose 'body' entry holds the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    bodies = examples['body']
    return tokenizer(bodies, padding='max_length', truncation=True, max_length=128)

In [None]:
%%time

# Tokenize this chunk: cast the 'body' column to str, convert it to a plain
# list, and encode the whole list in a single tokenizer call.
texts = df_627_g['body'].astype(str).tolist()
tokenized_inputs = tokenize_function({"body": texts})

class PredictionDataset(Dataset):
    """Dataset over tokenizer encodings, used for inference only.

    Every entry of ``encodings`` is converted to a tensor and moved to the
    module-level ``device`` at construction time. Indexing returns detached
    copies of the stored rows.
    """

    def __init__(self, encodings):
        # One tensor per encoding field, already placed on `device`
        self.encodings = {}
        for name, values in encodings.items():
            self.encodings[name] = torch.tensor(values).to(device).detach()

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Only int and list indices are supported
        if not isinstance(idx, (int, list)):
            raise TypeError(f"Unsupported index type: {type(idx)}")

        if isinstance(idx, int):
            # Single example: one row per field
            return {name: tensor[idx].clone().detach() for name, tensor in self.encodings.items()}

        # List of indices: stack the selected rows per field
        return {
            name: torch.stack([tensor[i].clone().detach() for i in idx])
            for name, tensor in self.encodings.items()
        }

# Wrap the encodings and iterate over them in order, 16 examples at a time
# (default collate function, no shuffling so predictions stay aligned with rows)
pred_dataset = PredictionDataset(tokenized_inputs)
pred_dataloader = DataLoader(pred_dataset, batch_size=16, shuffle=False)

# Run the classifier and collect the argmax class for every example
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in pred_dataloader:
        batch = {name: tensor.clone().detach().to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        preds = outputs.logits.argmax(dim=-1)
        predictions.extend(preds.tolist())

In [None]:
# Write the chunk's predictions back into the full DataFrame, addressing the
# rows through the chunk's index labels.
# NOTE(review): assumes `predictions` has one entry per row of df_627_g, in row order.
df_2020C_06_27_to_28_testdata.loc[df_627_g.index, 'predicted_label'] = predictions

In [None]:
# Export this chunk's predictions to CSV.
# NOTE(review): only the 'predicted_label' column is selected and index=False
# drops the row labels, so the file holds the labels alone, with no text or
# identifier to join on. Confirm this is intended.
df_2020C_06_27_to_28_testdata.loc[df_627_g.index, 'predicted_label'].to_csv('df_627_g--Version_2.csv', index=False)

In [None]:
# Select the part at position 7 of `dfs` for this prediction run.
df_627_h = dfs[7]

In [None]:
%%time 

# Tokenize this chunk: cast 'body' to str and encode the whole list at once
texts = df_627_h['body'].astype(str).tolist()
tokenized_inputs = tokenize_function({"body": texts})

# Create dataset and dataloader. PredictionDataset is the class defined in the
# earlier prediction cell; the dead triple-quoted copy that used to sit here
# was an unused string expression and has been removed.
pred_dataset = PredictionDataset(tokenized_inputs)

# Default collate function, no shuffling so predictions stay aligned with rows
pred_dataloader = DataLoader(pred_dataset, batch_size=16, shuffle=False)

# Make predictions
predictions = []
for batch in pred_dataloader:
    # Same batch handling as the other prediction loops in this notebook
    batch = {k: v.clone().detach().to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    preds = outputs.logits.argmax(dim=-1)
    predictions.extend(preds.tolist())

In [None]:
# Write the chunk's predictions back into the full DataFrame, addressing the
# rows through the chunk's index labels.
# NOTE(review): assumes `predictions` has one entry per row of df_627_h, in row order.
df_2020C_06_27_to_28_testdata.loc[df_627_h.index, 'predicted_label'] = predictions

In [None]:
# Export this chunk's predictions to CSV: only the 'predicted_label' column,
# without the index (index=False).
df_2020C_06_27_to_28_testdata.loc[df_627_h.index, 'predicted_label'].to_csv('df_627_h--Version_2.csv', index=False)

# Faster process

In [4]:
%%time

from tqdm.auto import tqdm
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertForSequenceClassification

# Feather file with the texts to classify
file_path = 'df_2020C_07_01_to_02_testdata.feather'

# Load it into a DataFrame
data_frame = pd.read_feather(file_path)

# Directory holding the fine-tuned model and tokenizer
model_path = "./model_save_runEthosR1"

# Prefer the GPU when CUDA is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Reload the fine-tuned tokenizer and the two-class DistilBERT classifier
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)
model.to(device)
model.eval()  # evaluation mode for inference

# Tokenizer wrapper for the chunked pipeline (takes the texts directly)
def tokenize_function(examples):
    """Encode ``examples``, padded/truncated to 128 tokens.

    Args:
        examples: the texts to encode; passed straight to the module-level
            ``tokenizer``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(examples, **encoding_options)

# Number of texts handed to the tokenizer per call (tune to the machine's capacity)
chunk_size = 100

# All texts as plain strings, plus the accumulators for the two encoding fields
texts = data_frame['body'].astype(str).tolist()
tokenized_inputs = {'input_ids': [], 'attention_mask': []}

# Tokenize chunk by chunk and append each field's rows to its accumulator
for i in tqdm(range(0, len(texts), chunk_size), desc="Tokenizing"):
    chunk_texts = texts[i:i + chunk_size]
    chunk_tokenized = tokenize_function(chunk_texts)
    for field, collected in tokenized_inputs.items():
        collected.extend(chunk_tokenized[field])
    
# Dataset wrapper around the accumulated encodings
class PredictionDataset(Dataset):
    """Dataset over tokenizer encodings, used for inference only.

    Each field of ``encodings`` becomes a tensor on the module-level
    ``device``. Items are returned as detached copies of the stored rows.
    """

    def __init__(self, encodings):
        as_tensors = ((key, torch.tensor(val).to(device).detach()) for key, val in encodings.items())
        self.encodings = dict(as_tensors)

    def _copy_row(self, tensor, position):
        """Return a detached copy of ``tensor[position]``."""
        return tensor[position].clone().detach()

    def __getitem__(self, idx):
        if isinstance(idx, int):
            # Single example
            picked = {}
            for key, val in self.encodings.items():
                picked[key] = self._copy_row(val, idx)
            return picked
        if isinstance(idx, list):
            # Several examples: stack the selected rows per field
            picked = {}
            for key, val in self.encodings.items():
                picked[key] = torch.stack([self._copy_row(val, i) for i in idx])
            return picked
        raise TypeError(f"Unsupported index type: {type(idx)}")

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field
        return len(self.encodings['input_ids'])

# Wrap the encodings and iterate over them in order, 32 examples at a time
pred_dataset = PredictionDataset(tokenized_inputs)
pred_dataloader = DataLoader(pred_dataset, batch_size=32, shuffle=False)

# Run the classifier and collect the argmax class for every example
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in tqdm(pred_dataloader, desc="Predicting"):
        batch = {name: tensor.clone().detach().to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        preds = outputs.logits.argmax(dim=-1)
        predictions.extend(preds.tolist())
    
# Attach the predictions as a new column and write the whole DataFrame
# (all columns, no index) to CSV.
# NOTE(review): assumes `predictions` has exactly one entry per row, in row order.
data_frame['predicted_label'] = predictions
data_frame.to_csv('Data_Frame_Of_Texts_And_Predictions_By_DBHSBC_For_2020-07-01_To_2020-07-02.csv', index = False)

Tokenizing:   0%|          | 0/63134 [00:00<?, ?it/s]

Predicting:   0%|          | 0/197293 [00:00<?, ?it/s]

CPU times: user 1h 9min 59s, sys: 1min, total: 1h 10min 59s
Wall time: 59min 14s


In [None]:
# Sanity check: (rows, columns) of the DataFrame after adding the predictions.
data_frame.shape

In [None]:
# List the column names, which should now include 'predicted_label'.
data_frame.columns

In [None]:
# Preview the first rows together with their predicted labels.
data_frame.head()