In [4]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load dataset
data = pd.read_excel('Audio_Data.xlsx')

# Load pre-trained GPT-2 model and tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2Model.from_pretrained('gpt2')

# Set the pad token to be the same as the EOS token
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Function to get GPT-2 embeddings for a text
def get_gpt2_embeddings(text, max_length=512):
    # Tokenize text
    tokens = gpt2_tokenizer(text, 
                            padding='max_length',
                            truncation=True,
                            max_length=max_length,
                            return_tensors='pt')
    
    # Get GPT-2 output embeddings
    with torch.no_grad():
        outputs = gpt2_model(**tokens)

    # Extract last hidden state for the last token
    last_hidden_state = outputs.last_hidden_state
    return last_hidden_state[:, -1, :].squeeze().numpy()

# Create a new list to hold the features, questions, answers, and strengths
gpt2_features = []

# Extract embeddings for each row in the dataset
for index, row in data.iterrows():
    question = row['Question']
    answer = row['Answer']

    # Get embeddings for both question and answer
    question_embedding = get_gpt2_embeddings(question)
    answer_embedding = get_gpt2_embeddings(answer)

    # Concatenate the question and answer embeddings into one feature vector
    combined_embedding = torch.cat((torch.tensor(question_embedding), torch.tensor(answer_embedding)))

    # Append the question, answer, strength, and embedding as a dictionary
    gpt2_features.append({
        'question': question,
        'answer': answer,
        'strength': row['Strength'],
        'embedding': combined_embedding.numpy()  # GPT-2 embedding as a feature
    })

# Convert the list of dictionaries into a DataFrame
gpt2_df = pd.DataFrame(gpt2_features)

# Convert the 'embedding' column (which contains arrays) into a format that can be saved in a CSV
gpt2_df['embedding'] = gpt2_df['embedding'].apply(lambda x: ','.join(map(str, x)))

# Save the DataFrame with question, answer, strength, and embeddings to a new CSV
gpt2_df.to_csv('gpt2_features.csv', index=False)

print("GPT-2 feature extraction completed and saved to gpt2_features.csv!")




GPT-2 feature extraction completed and saved to gpt2_features.csv!


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Load the GPT-2 features
features = pd.read_csv('gpt2_features.csv')

# Check if 'strength' column exists; rename if needed
if 'strength' not in features.columns:
    features.rename(columns={features.columns[2]: 'strength'}, inplace=True)

# Convert the 'embedding' column from string to a list of floats
def convert_embedding(embedding_str):
    return np.fromstring(embedding_str.strip("[]"), sep=',')

# Apply the conversion to the 'embedding' column
features['embedding'] = features['embedding'].apply(convert_embedding)

# Split the features and labels
X = np.array(features['embedding'].tolist())  # Convert to numpy array for model training
y = features['strength'].values  # Use the strength column

# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Hyperparameter Tuning using Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Print the best parameters
print("Best Hyperparameters:", best_params)

# Step 4: Train the final model using the best parameters
final_model = best_rf.fit(X_train, y_train)

# Step 5: Evaluate the final model
y_pred = final_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 150}
              precision    recall  f1-score   support

          -1       0.44      0.44      0.44        57
           0       0.23      0.28      0.25        47
           1       0.43      0.37      0.40        68

    accuracy                           0.37       172
   macro avg       0.37      0.36      0.36       172
weighted avg       0.38      0.37      0.37       172

Confusion Matrix:
 [[25 19 13]
 [14 13 20]
 [18 25 25]]


In [6]:
!pip install catboost xgboost scikit-learn



In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch
from tabulate import tabulate

# Load the GPT-2 feature dataset
features = pd.read_csv('gpt2_features.csv')

# Check if 'Strength' column exists; rename if necessary
if 'Strength' not in features.columns:
    features.rename(columns={features.columns[2]: 'Strength'}, inplace=True)

# Map strength labels from {-1, 0, 1} to {0, 1, 2}
label_mapping = {-1: 0, 0: 1, 1: 2}
features['mapped_strength'] = features['Strength'].map(label_mapping)

# Convert the embeddings from strings back to arrays
features['embedding'] = features['embedding'].apply(lambda x: np.fromstring(x, sep=','))

# Step 1: Prepare feature (X) and target (y) variables
X = np.stack(features['embedding'].values)
y = features['mapped_strength'].values

# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Define classifiers for comparison
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(eval_metric='mlogloss', use_label_encoder=False),
    'CatBoost': CatBoostClassifier(silent=True),
    'Naive Bayes': GaussianNB()
}

# Dictionary to store results
results = []

# Step 4: Model training and evaluation
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Append results
    results.append([name, accuracy, precision, recall, f1])

# Display results in a table
headers = ["Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]
print(tabulate(results, headers=headers, tablefmt="grid"))

# Step 5: Map predictions back to original labels
inverse_mapping = {0: -1, 1: 0, 2: 1}
y_test_original = np.vectorize(inverse_mapping.get)(y_test)
y_pred_original = np.vectorize(inverse_mapping.get)(y_pred)

# Print classification report for original labels
print("Classification Report for Original Labels:")
print(classification_report(y_test_original, y_pred_original))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.



+---------------+------------+-------------+----------+------------+
| Classifier    |   Accuracy |   Precision |   Recall |   F1-Score |
| Random Forest |   0.360465 |    0.371181 | 0.360465 |   0.362964 |
+---------------+------------+-------------+----------+------------+
| SVM           |   0.290698 |    0.211363 | 0.290698 |   0.208877 |
+---------------+------------+-------------+----------+------------+
| Decision Tree |   0.331395 |    0.343901 | 0.331395 |   0.334427 |
+---------------+------------+-------------+----------+------------+
| AdaBoost      |   0.348837 |    0.349193 | 0.348837 |   0.348009 |
+---------------+------------+-------------+----------+------------+
| XGBoost       |   0.372093 |    0.394686 | 0.372093 |   0.377995 |
+---------------+------------+-------------+----------+------------+
| CatBoost      |   0.348837 |    0.354834 | 0.348837 |   0.350207 |
+---------------+------------+-------------+----------+------------+
| Naive Bayes   |   0.412791 |    