In [None]:
pip install transformers



In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# One seed for both the train/test split and PyTorch. The classification head
# of the model below is newly initialised, so without seeding torch every run
# starts from different weights and reports different accuracies.
SEED = 42
torch.manual_seed(SEED)

# Load the labelled data: 'entity_title' is the model input, 'category' the target
data = pd.read_excel('training data with ID.xlsx')
X = data[['entity_id', 'entity_title']]
y = data['category']

# Encode category names as integer class ids.
# NOTE: despite its name, y_train_encoded holds the encoded labels of *all*
# rows; it is only divided into train and test parts by the split below.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y_train_encoded, test_size=0.2, random_state=SEED
)

# Load DistilBERT tokenizer and model; one output unit per encoded category
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# Tokenize the titles of each split into padded PyTorch tensors
X_train_encoded = tokenizer(list(X_train['entity_title']), padding=True, truncation=True, return_tensors='pt')
X_test_encoded = tokenizer(list(X_test['entity_title']), padding=True, truncation=True, return_tensors='pt')

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Training configuration
num_epochs = 2
batch_size = 8
eval_batch_size = 64  # forward-only passes store no gradients, so larger chunks are affordable


def predict_labels(encoded_inputs, chunk_size):
    """Predict a class id for every row of an already-tokenized input set.

    The rows are pushed through ``model`` in chunks of ``chunk_size`` so that
    peak memory depends on the chunk size rather than on the size of the set.
    The model is left in evaluation mode afterwards.

    Args:
        encoded_inputs: Mapping from tokenizer output names to tensors whose
            first dimension indexes the rows (e.g. ``X_test_encoded``). It
            must contain the key 'input_ids'.
        chunk_size: Number of rows sent through the model per forward pass.

    Returns:
        A 1-D ``torch.Tensor`` holding the arg-max class id of each row
        (empty when ``encoded_inputs`` has no rows).
    """
    model.eval()
    num_rows = encoded_inputs['input_ids'].shape[0]
    chunk_predictions = []
    with torch.no_grad():
        for start in range(0, num_rows, chunk_size):
            chunk = {key: val[start:start + chunk_size] for key, val in encoded_inputs.items()}
            chunk_predictions.append(torch.argmax(model(**chunk).logits, dim=1))
    if not chunk_predictions:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(chunk_predictions)


num_train = len(X_train)

for epoch in range(num_epochs):
    model.train()
    # range() with a step never produces an empty slice, so no emptiness check is needed
    for start in range(0, num_train, batch_size):
        end = min(start + batch_size, num_train)
        batch_X = {key: val[start:end] for key, val in X_train_encoded.items()}
        # The loss computed by the model expects integer (long) class ids
        batch_y = torch.as_tensor(y_train[start:end], dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(**batch_X, labels=batch_y)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluate on test data after each epoch
    predictions = predict_labels(X_test_encoded, eval_batch_size)
    accuracy = accuracy_score(y_test, predictions.numpy())
    print(f"Epoch {epoch+1}/{num_epochs} - Test Accuracy: {accuracy}")

# Final evaluation on test data (recomputed so it also works when num_epochs is 0)
predictions = predict_labels(X_test_encoded, eval_batch_size)
accuracy = accuracy_score(y_test, predictions.numpy())
print(f"Final Test Accuracy: {accuracy}")

# Decode the predicted and actual class ids back to their original category names
predicted_categories = label_encoder.inverse_transform(predictions.numpy())
actual_categories = label_encoder.inverse_transform(y_test)

# Collect the test titles next to their predicted and actual categories
results_df = pd.DataFrame({
    'Entity Title': X_test['entity_title'],  # source column is 'entity_title'
    'Predicted Category': predicted_categories,
    'Actual Category': actual_categories
})

# Save the DataFrame to an Excel file
results_df.to_excel('distilBERT.xlsx', index=False)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Test Accuracy: 0.764
Epoch 2/3 - Test Accuracy: 0.8066666666666666
Epoch 3/3 - Test Accuracy: 0.8013333333333333
Final Test Accuracy: 0.8013333333333333


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare predictions with the ground truth held in `results_df`, which the
# training cell builds in memory and also saves to 'distilBERT.xlsx'.
predicted_categories = results_df['Predicted Category']
actual_categories = results_df['Actual Category']

# Every category that occurs in either column, sorted, so the matrix rows and
# columns can be labelled explicitly
category_labels = sorted(set(actual_categories) | set(predicted_categories))

# Generate confusion matrix (rows: actual category, columns: predicted category)
conf_matrix = confusion_matrix(actual_categories, predicted_categories, labels=category_labels)
conf_matrix_labeled = pd.DataFrame(
    conf_matrix, index=category_labels, columns=category_labels
).rename_axis(index='Actual', columns='Predicted')

# Generate classification report. zero_division=0 reports 0.0 for a category
# that is never predicted instead of emitting an UndefinedMetricWarning.
class_report = classification_report(actual_categories, predicted_categories, zero_division=0)

# Display confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix_labeled.to_string())

# Display classification report
print("\nClassification Report:")
print(class_report)



Confusion Matrix:
[[  0   0   2   0   0   0]
 [  0  34   1   0   0   1]
 [  0   0 336   5  12  21]
 [  0   3   7  69   0   4]
 [  0   0  17   0  25   2]
 [  0   9  47  17   1 137]]

Classification Report:
                       precision    recall  f1-score   support

      Anti Corruption       0.00      0.00      0.00         2
               ESG/ET       0.74      0.94      0.83        36
           Functional       0.82      0.90      0.86       374
                  HSE       0.76      0.83      0.79        83
           Leadership       0.66      0.57      0.61        44
Technical/Engineering       0.83      0.65      0.73       211

             accuracy                           0.80       750
            macro avg       0.63      0.65      0.64       750
         weighted avg       0.80      0.80      0.80       750



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd

# Read back the prediction sheet written earlier in the notebook
df = pd.read_excel('distilBERT.xlsx')

# Flag the rows whose predicted category disagrees with the actual one
is_mismatch = df['Predicted Category'].ne(df['Actual Category'])
different_categories = df.loc[is_mismatch]

# Write only the misclassified rows to their own workbook
different_categories.to_excel('different_categories.xlsx', index=False)
