In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize  # Import word_tokenize

# Download NLTK data files (only needed once)
nltk.download('punkt')

# Load dataset
data = pd.read_excel('Audio_Data.xlsx')

# Check column names and ensure 'Strength' column is present
if 'Strength' not in data.columns:
    data.rename(columns={data.columns[2]: 'Strength'}, inplace=True)

# Tokenize the questions and answers
data['Question_tokens'] = data['Question'].apply(word_tokenize)
data['Answer_tokens'] = data['Answer'].apply(word_tokenize)

# Save the tokenized DataFrame, ensuring 'Strength' column is included
data.to_csv('nltk_tokenized_data.csv', index=False)

print("NLTK tokenization completed and saved to nltk_tokenized_data.csv!")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NLTK tokenization completed and saved to nltk_tokenized_data.csv!


In [7]:
!pip install catboost xgboost scikit-learn



In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data files (only needed once)
nltk.download('punkt')

# Step 1: Load the tokenized data
features = pd.read_csv('nltk_tokenized_data.csv')

# Check if 'Strength' column exists; rename if needed
if 'Strength' not in features.columns:
    features.rename(columns={features.columns[2]: 'Strength'}, inplace=True)

# Combine tokenized questions and answers to create a single text input per entry
features['combined_text'] = features['Question_tokens'].apply(eval).apply(' '.join) + ' ' + features['Answer_tokens'].apply(eval).apply(' '.join)

# Step 2: Generate TF-IDF features from the combined text
tfidf_vectorizer = TfidfVectorizer(max_features=512)
X = tfidf_vectorizer.fit_transform(features['combined_text']).toarray()
y = features['Strength'].values

# Step 3: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Hyperparameter Tuning using Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Print the best parameters
print("Best Hyperparameters:", best_params)

# Step 5: Train the final model using the best parameters
final_model = best_rf.fit(X_train, y_train)

# Step 6: Evaluate the final model
y_pred = final_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

          -1       0.42      0.49      0.46        57
           0       0.25      0.34      0.29        47
           1       0.44      0.26      0.33        68

    accuracy                           0.36       172
   macro avg       0.37      0.37      0.36       172
weighted avg       0.38      0.36      0.36       172

Confusion Matrix:
 [[28 20  9]
 [17 16 14]
 [21 29 18]]


In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
from tabulate import tabulate

# Download NLTK data files (only needed once)
nltk.download('punkt')

# Step 1: Load the tokenized data
features = pd.read_csv('nltk_tokenized_data.csv')

# Check if 'Strength' exists; rename it if necessary
if 'Strength' not in features.columns:
    features.rename(columns={features.columns[2]: 'Strength'}, inplace=True)

# Combine tokenized questions and answers into a single text entry
features['combined_text'] = features['Question_tokens'].apply(eval).apply(' '.join) + ' ' + features['Answer_tokens'].apply(eval).apply(' '.join)

# Map strength labels from {-1, 0, 1} to {0, 1, 2}
label_mapping = {-1: 0, 0: 1, 1: 2}
features['mapped_strength'] = features['Strength'].map(label_mapping)

# Step 2: Generate TF-IDF features from the combined text
tfidf_vectorizer = TfidfVectorizer(max_features=512)
X = tfidf_vectorizer.fit_transform(features['combined_text']).toarray()
y = features['mapped_strength'].values

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Define classifiers for comparison
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(eval_metric='mlogloss', use_label_encoder=False),
    'CatBoost': CatBoostClassifier(silent=True),
    'Naive Bayes': GaussianNB()
}

# Dictionary to store results
results = []

# Step 5: Model training and evaluation
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Append results
    results.append([name, accuracy, precision, recall, f1])

# Display results in a table
headers = ["Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]
print(tabulate(results, headers=headers, tablefmt="grid"))

# Step 6: Map predictions back to original labels
inverse_mapping = {0: -1, 1: 0, 2: 1}
y_test_original = np.vectorize(inverse_mapping.get)(y_test)
y_pred_original = np.vectorize(inverse_mapping.get)(y_pred)

# Print classification report for original labels
print("Classification Report for Original Labels:")
print(classification_report(y_test_original, y_pred_original))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Parameters: { "use_label_encoder" } are not used.



+---------------+------------+-------------+----------+------------+
| Classifier    |   Accuracy |   Precision |   Recall |   F1-Score |
| Random Forest |   0.348837 |    0.337306 | 0.348837 |   0.335579 |
+---------------+------------+-------------+----------+------------+
| SVM           |   0.325581 |    0.331329 | 0.325581 |   0.325326 |
+---------------+------------+-------------+----------+------------+
| Decision Tree |   0.395349 |    0.409552 | 0.395349 |   0.39948  |
+---------------+------------+-------------+----------+------------+
| AdaBoost      |   0.348837 |    0.35739  | 0.348837 |   0.349459 |
+---------------+------------+-------------+----------+------------+
| XGBoost       |   0.348837 |    0.361326 | 0.348837 |   0.353504 |
+---------------+------------+-------------+----------+------------+
| CatBoost      |   0.313953 |    0.320739 | 0.313953 |   0.311513 |
+---------------+------------+-------------+----------+------------+
| Naive Bayes   |   0.348837 |    