In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load dataset
# Assuming your Excel file has columns: 'Question', 'Answer', 'Strength'
data = pd.read_excel('Audio_Team_FinalCleaned.xlsx')

# Combine Questions and Answers into a single text column
data['text'] = data['Question'] + " " + data['Answer']

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit and transform the text data to create TF-IDF embeddings
tfidf_embeddings = tfidf_vectorizer.fit_transform(data['text'])

# Convert the sparse matrix to a dense format (optional)
tfidf_embeddings_dense = tfidf_embeddings.todense()

# Convert the dense TF-IDF matrix to a DataFrame for easier handling
tfidf_df = pd.DataFrame(tfidf_embeddings_dense, columns=tfidf_vectorizer.get_feature_names_out())

# Create a new DataFrame to hold the features
features = []

# Iterate through the original dataset to append strength and embeddings
for index, row in data.iterrows():
    # Get the strength value
    strength = row['Strength']
    
    # Get the corresponding TF-IDF embedding for the current row
    embedding = tfidf_df.iloc[index].values  # Extract the embedding for this row
    
    # Append the strength and embedding as a dictionary
    features.append({
        'strength': strength,
        'embedding': embedding  # TF-IDF embedding as a feature
    })

# Convert the list of dictionaries into a DataFrame
features_df = pd.DataFrame(features)

# Convert the 'embedding' column (which contains arrays) into a format that can be saved in a CSV
# This is done by converting the array into a string representation
features_df['embedding'] = features_df['embedding'].apply(lambda x: ','.join(map(str, x)))

# Save the DataFrame with strength and embeddings to a new CSV
features_df.to_csv('tfidf_features_add_data_cleaned.csv', index=False)

print("TF-IDF feature extraction completed and saved to tfidf_features_add_data_cleaned.csv!")


TF-IDF feature extraction completed and saved to tfidf_features_add_data_cleaned.csv!


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate
from collections import Counter

# Load the dataset
data = pd.read_excel('Audio_Team_FinalCleaned.xlsx')

# Combine Questions and Answers into a single text column
data['text'] = data['Question'] + " " + data['Answer']

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit and transform the text data to create TF-IDF embeddings
tfidf_embeddings = tfidf_vectorizer.fit_transform(data['text']).toarray()

# Create a DataFrame for the TF-IDF embeddings
features_df = pd.DataFrame(tfidf_embeddings)

# Add strength labels to the DataFrame
features_df['strength'] = data['Strength']

# Map strength labels from {-1, 0, 1} to {0, 1, 2}
label_mapping = {-1: 0, 0: 1, 1: 2}
features_df['mapped_strength'] = features_df['strength'].map(label_mapping)

# Split the features and labels
X = features_df.drop(columns=['strength', 'mapped_strength']).values  # Use only the TF-IDF features
y = features_df['mapped_strength'].values  # Use the mapped strength column

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Verify class distribution after SMOTE
print("Original training class distribution:", Counter(y_train))
print("Resampled training class distribution:", Counter(y_resampled))

# List of classifiers with optimized hyperparameters for Random Forest
classifiers = {
    'Random Forest': RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=50, random_state=42),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(eval_metric='mlogloss', use_label_encoder=False),
    'CatBoost': CatBoostClassifier(silent=True),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000)  # Linear regression model
}

# Dictionary to store the results
results = []

# Iterate over classifiers and store performance metrics
for name, clf in classifiers.items():
    clf.fit(X_resampled, y_resampled)  # Train on resampled (balanced) data
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Append the results
    results.append([name, accuracy, precision, recall, f1])

# Create a table to show results
headers = ["Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]
print(tabulate(results, headers=headers, tablefmt="grid"))

# If needed: Map predictions back to original labels
inverse_mapping = {0: -1, 1: 0, 2: 1}
y_test_original = np.vectorize(inverse_mapping.get)(y_test)
y_pred_original = np.vectorize(inverse_mapping.get)(y_pred)

# Print classification report for original labels
print(classification_report(y_test_original, y_pred_original))


Original training class distribution: Counter({2: 164, 1: 132, 0: 110})
Resampled training class distribution: Counter({2: 164, 1: 164, 0: 164})


Parameters: { "use_label_encoder" } are not used.



+---------------------+------------+-------------+----------+------------+
| Classifier          |   Accuracy |   Precision |   Recall |   F1-Score |
| Random Forest       |   0.54902  |    0.526465 | 0.54902  |   0.52428  |
+---------------------+------------+-------------+----------+------------+
| SVM                 |   0.578431 |    0.611515 | 0.578431 |   0.559992 |
+---------------------+------------+-------------+----------+------------+
| Decision Tree       |   0.352941 |    0.363627 | 0.352941 |   0.355987 |
+---------------------+------------+-------------+----------+------------+
| AdaBoost            |   0.509804 |    0.553977 | 0.509804 |   0.514429 |
+---------------------+------------+-------------+----------+------------+
| XGBoost             |   0.441176 |    0.436439 | 0.441176 |   0.43805  |
+---------------------+------------+-------------+----------+------------+
| CatBoost            |   0.480392 |    0.478227 | 0.480392 |   0.478511 |
+---------------------+--