In [2]:
pip install torch

Collecting torch
  Downloading torch-2.5.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.5.0-cp311-cp311-win_amd64.whl (203.1 MB)
   ---------------------------------------- 0.0/203.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/203.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/203.1 MB 1.3 MB/s eta 0:02:39
   ---------------------------------------- 0.0/203.1 MB 1.3 MB/s eta 0:02:39
   ---------------------------------------- 0.1/203.1 MB 573.4 kB/s eta 0:05:55
   ---------------------------------------- 0.1/203.1 MB 525.1 kB/s eta 0:06:27
   ---------------------------------------- 0.1/203.1 MB 504.4 kB/s eta 0:06:43
   ---------------------------------------- 0.1/203.1 MB 568.9 kB/s eta 0:05:57
   ---------------------------------------- 0.1/203.1 MB 568.9 kB/s eta 0:05:57
   ---------------------------------------- 0.1/203.1 MB

In [6]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load dataset
# Assuming your CSV has columns: 'Question', 'Answer', 'Strength'
data = pd.read_excel('Audio_data.xlsx')

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a text
def get_bert_embeddings(text, max_length=512):
    # Tokenize text
    tokens = tokenizer(text,
                       padding='max_length',
                       truncation=True,
                       max_length=max_length,
                       return_tensors='pt')

    # Get BERT output embeddings
    with torch.no_grad():
        outputs = model(**tokens)

    # Extract [CLS] token embedding (representing the entire sequence)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding

# Create a new list to hold the features, questions, answers, and strengths
features = []

# Extract embeddings for each row in the dataset
for index, row in data.iterrows():
    question = row['Question']
    answer = row['Answer']

    # Get embeddings for both question and answer
    question_embedding = get_bert_embeddings(question)
    answer_embedding = get_bert_embeddings(answer)

    # Concatenate the question and answer embeddings into one feature vector
    combined_embedding = torch.cat((torch.tensor(question_embedding), torch.tensor(answer_embedding)))

    # Append the question, answer, strength, and embedding as a dictionary
    features.append({
        'question': question,
        'answer': answer,
        'strength': row['Strength'],
        'embedding': combined_embedding.numpy()  # BERT embedding as a feature
    })

# Convert the list of dictionaries into a DataFrame
features_df = pd.DataFrame(features)

# Convert the 'embedding' column (which contains arrays) into a format that can be saved in a CSV
# This is done by converting the array into a string representation
features_df['embedding'] = features_df['embedding'].apply(lambda x: ','.join(map(str, x)))

# Save the DataFrame with question, answer, strength, and embeddings to a new CSV
features_df.to_csv('bert_features_add_data.csv', index=False)

print("BERT feature extraction completed and saved to bert_features.csv!")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT feature extraction completed and saved to bert_features.csv!


In [1]:
# # Hyperparameter tuning - using GridSearchCV
# The main goal is to improve the model’s performance on unseen data.
# The right hyperparameters can significantly enhance a model's accuracy, reduce overfitting, and lead to a better generalization.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Load the BERT features
features = pd.read_csv('bert_features_add_data.csv')

# Convert embeddings and labels
features['embedding'] = features['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=', '))  # Convert string to list
X = np.array(features['embedding'].tolist())
y = features['strength'].values

# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Hyperparameter Tuning using Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit Grid Search
grid_search.fit(X_train.tolist(), y_train)

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Print the best parameters
print("Best Hyperparameters:", best_params)

# Step 4: Train the final model using the best parameters
final_model = best_rf.fit(X_train.tolist(), y_train)

# Step 5: Evaluate the final model
y_pred = final_model.predict(X_test.tolist())

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}
              precision    recall  f1-score   support

          -1       0.46      0.46      0.46        57
           0       0.25      0.30      0.27        47
           1       0.41      0.35      0.38        68

    accuracy                           0.37       172
   macro avg       0.37      0.37      0.37       172
weighted avg       0.38      0.37      0.37       172

Confusion Matrix:
 [[26 16 15]
 [13 14 20]
 [18 26 24]]


In [2]:
# Comparing Accuracy for all of the models
!pip install catboost xgboost scikit-learn



In [3]:
# Model Comparison based on the bert model
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate

# Load the BERT features
features = pd.read_csv('bert_features_add_data.csv')

# Convert the 'embedding' column from string to a list of floats
def convert_embedding(embedding_str):
    return np.fromstring(embedding_str.strip("[]"), sep=',')

# Apply the conversion to the 'embedding' column
features['embedding'] = features['embedding'].apply(convert_embedding)

# Map strength labels from {-1, 0, 1} to {0, 1, 2}
label_mapping = {-1: 0, 0: 1, 1: 2}
features['mapped_strength'] = features['strength'].map(label_mapping)

# Split the features and labels
X = np.array(features['embedding'].tolist())  # Convert to numpy array for model training
y = features['mapped_strength'].values  # Use the mapped strength column

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(eval_metric='mlogloss', use_label_encoder=False),
    'CatBoost': CatBoostClassifier(silent=True),
    'Naive Bayes': GaussianNB()
}

# Dictionary to store the results
results = []

# Iterate over classifiers and store performance metrics
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Append the results
    results.append([name, accuracy, precision, recall, f1])

# Create a table to show results
headers = ["Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]
print(tabulate(results, headers=headers, tablefmt="grid"))

# If needed: Map predictions back to original labels
inverse_mapping = {0: -1, 1: 0, 2: 1}
y_test_original = np.vectorize(inverse_mapping.get)(y_test)
y_pred_original = np.vectorize(inverse_mapping.get)(y_pred)

# Print classification report for original labels
print(classification_report(y_test_original, y_pred_original))


Parameters: { "use_label_encoder" } are not used.



+---------------+------------+-------------+----------+------------+
| Classifier    |   Accuracy |   Precision |   Recall |   F1-Score |
| Random Forest |   0.366279 |    0.373745 | 0.366279 |   0.367171 |
+---------------+------------+-------------+----------+------------+
| SVM           |   0.401163 |    0.426694 | 0.401163 |   0.399129 |
+---------------+------------+-------------+----------+------------+
| Decision Tree |   0.348837 |    0.36117  | 0.348837 |   0.353903 |
+---------------+------------+-------------+----------+------------+
| AdaBoost      |   0.389535 |    0.409896 | 0.389535 |   0.392687 |
+---------------+------------+-------------+----------+------------+
| XGBoost       |   0.383721 |    0.386934 | 0.383721 |   0.384948 |
+---------------+------------+-------------+----------+------------+
| CatBoost      |   0.389535 |    0.389714 | 0.389535 |   0.38829  |
+---------------+------------+-------------+----------+------------+
| Naive Bayes   |   0.406977 |    