In [3]:
import pandas as pd

def load_dataset(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV; passed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame


data = load_dataset('train.csv')


In [4]:
def clean_missing_values(df, column):
    """Return the rows of ``df`` whose value in ``column`` is not missing.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Label of the column that must be populated.

    Returns:
        A new DataFrame without the rows where ``column`` is NaN/None.
        Missing values in other columns are left alone.
    """
    required = [column]
    return df.dropna(subset=required)

# Clean the dataset by removing rows with missing 'Answer'
data_cleaned = clean_missing_values(data, "Answer")


In [5]:
def verify_cleaning(df):
    """Print the per-column count of missing values and the frame's shape.

    Args:
        df: DataFrame to inspect.

    Returns:
        None; the report is written to stdout only.
    """
    missing_counts = df.isnull().sum()
    print("Missing values per column:")
    print(missing_counts)
    print("\nDataset shape:", df.shape)

# Verify cleaning
verify_cleaning(data_cleaned)


Missing values per column:
Id        0
Prompt    0
Answer    0
Target    0
dtype: int64

Dataset shape: (16668, 4)


In [6]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def compute_bert_embeddings(prompt, answer):
    """Generate a BERT embedding for the concatenated prompt and answer.

    Uses the module-level ``tokenizer`` and ``model`` loaded earlier in this
    cell.

    Args:
        prompt: Prompt text (formatted into the input with an f-string).
        answer: Answer text (formatted into the input with an f-string).

    Returns:
        1-D NumPy array holding the hidden state of the first ([CLS]) token.
    """
    # Combine prompt and answer into the single string the classifiers are trained on.
    text = f"{prompt} [SEP] {answer}"

    # A lone sequence needs no padding. Padding it to max_length made every
    # forward pass process 512 positions regardless of the real text length;
    # truncation alone still caps the input at 512 tokens.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    # Use the CLS token's output as the embedding.
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import numpy as np

def generate_embeddings(df, model, tokenizer, batch_size=32):
    """Embed every (Prompt, Answer) row of ``df`` with the given BERT model.

    The ``model`` and ``tokenizer`` arguments are the ones actually used, and
    rows are encoded in batches so the model runs once per ``batch_size`` rows
    instead of once per row.

    Args:
        df: DataFrame with 'Prompt' and 'Answer' columns.
        model: Callable returning an object with ``last_hidden_state`` when
            called with the tokenizer's output as keyword arguments.
        tokenizer: Callable turning a list of strings into model inputs.
        batch_size: Number of rows encoded per forward pass (must be >= 1).

    Returns:
        NumPy array with one row per input row, holding the hidden state of
        the first ([CLS]) token. An empty ``df`` yields an empty array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Same text layout as the single-row helper: "<prompt> [SEP] <answer>".
    texts = [
        f"{prompt} [SEP] {answer}"
        for prompt, answer in zip(df['Prompt'], df['Answer'])
    ]

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # padding=True pads only up to the longest text in this batch.
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        chunks.append(outputs.last_hidden_state[:, 0, :].numpy())

    if not chunks:
        # Matches the previous behaviour for an empty frame.
        return np.array([])
    return np.concatenate(chunks, axis=0)

# Generate embeddings
bert_embeddings = generate_embeddings(data_cleaned, model, tokenizer)


KeyboardInterrupt: 

In [9]:
def save_embeddings(embeddings, file_name):
    """Persist an embeddings array in NumPy's ``.npy`` format.

    Args:
        embeddings: Array to store.
        file_name: Destination handed to ``np.save``.
    """
    np.save(file=file_name, arr=embeddings)

# Save the BERT embeddings
save_embeddings(bert_embeddings, 'bert_embeddings.npy')


NameError: name 'bert_embeddings' is not defined

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [13]:
from sklearn.model_selection import train_test_split

# Load the embeddings from disk instead of recomputing them; the in-notebook
# generation cell above was interrupted, so this file must already exist.
import numpy as np
saved_embeddings_path = "bert_embeddings.npy"
bert_embeddings = np.load(saved_embeddings_path)

# Labels come from the cleaned frame.
# NOTE(review): this assumes the rows of the saved .npy file are in the same
# order and count as data_cleaned -- nothing here checks that; confirm.
target = data_cleaned['Target'].values  # NumPy array of labels

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
# NOTE(review): the split is not stratified although the classification
# reports below show a heavily imbalanced target -- consider stratifying.
X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, target, test_size=0.2, random_state=42)

print("Training and testing data split successfully.")

Training and testing data split successfully.


In [14]:
def train_logistic_regression(X_train, y_train):
    """Fit a logistic-regression classifier on the training split.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.

    Returns:
        The fitted ``LogisticRegression`` (max_iter=1000, random_state=42).
    """
    settings = {"max_iter": 1000, "random_state": 42}
    classifier = LogisticRegression(**settings)
    classifier.fit(X_train, y_train)
    return classifier

# Train the logistic regression model
logistic_model = train_logistic_regression(X_train, y_train)


In [17]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy and a classification report for a fitted model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        The unrounded accuracy; the printed value is rounded to 5 decimals.
    """
    predictions = model.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print("Accuracy:", round(score, 5))

    # Per-class precision/recall/F1 table, also shown with 5 decimal places.
    report_text = classification_report(y_test, predictions, digits=5)
    print("\nClassification Report:\n", report_text)

    return score
# Evaluate the logistic regression model
logistic_accuracy = evaluate_model(logistic_model, X_test, y_test)


Accuracy: 0.94841

Classification Report:
               precision    recall  f1-score   support

           0    0.95626   0.99118   0.97341      3176
           1    0.33333   0.08861   0.14000       158

    accuracy                        0.94841      3334
   macro avg    0.64480   0.53990   0.55670      3334
weighted avg    0.92674   0.94841   0.93391      3334



In [18]:
from sklearn.ensemble import RandomForestClassifier


In [19]:
def train_random_forest(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a random-forest classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_estimators: Number of trees in the forest.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest_options = dict(n_estimators=n_estimators, random_state=random_state)
    forest = RandomForestClassifier(**forest_options)
    forest.fit(X_train, y_train)
    return forest

# Train the Random Forest model
random_forest_model = train_random_forest(X_train, y_train)


In [20]:
# Evaluate the Random Forest model
random_forest_accuracy = evaluate_model(random_forest_model, X_test, y_test)


Accuracy: 0.95501

Classification Report:
               precision    recall  f1-score   support

           0    0.95490   1.00000   0.97693      3176
           1    1.00000   0.05063   0.09639       158

    accuracy                        0.95501      3334
   macro avg    0.97745   0.52532   0.53666      3334
weighted avg    0.95704   0.95501   0.93520      3334



In [21]:
from xgboost import XGBClassifier


In [22]:
def train_xgboost(X_train, y_train, random_state=42):
    """Train an XGBoost classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels (two classes in this notebook, per the
            classification reports).
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # `use_label_encoder` is dropped: the installed xgboost reports it as an
    # unused parameter. 'logloss' is the binary log-loss metric; 'mlogloss'
    # is its multi-class counterpart and does not match a 0/1 target.
    model = XGBClassifier(eval_metric='logloss', random_state=random_state)
    model.fit(X_train, y_train)
    return model

# Train the XGBoost model
xgboost_model = train_xgboost(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [23]:
# Evaluate the XGBoost model
xgboost_accuracy = evaluate_model(xgboost_model, X_test, y_test)


Accuracy: 0.95471

Classification Report:
               precision    recall  f1-score   support

           0    0.95571   0.99874   0.97675      3176
           1    0.73333   0.06962   0.12717       158

    accuracy                        0.95471      3334
   macro avg    0.84452   0.53418   0.55196      3334
weighted avg    0.94517   0.95471   0.93649      3334



In [24]:
from sklearn.tree import DecisionTreeClassifier


In [25]:
def train_decision_tree(X_train, y_train, random_state=42):
    """Fit a decision-tree classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(X_train, y_train)
    return tree

# Train the Decision Tree model
decision_tree_model = train_decision_tree(X_train, y_train)


In [26]:
# Evaluate the Decision Tree model
decision_tree_accuracy = evaluate_model(decision_tree_model, X_test, y_test)

Accuracy: 0.90792

Classification Report:
               precision    recall  f1-score   support

           0    0.95992   0.94270   0.95123      3176
           1    0.15349   0.20886   0.17694       158

    accuracy                        0.90792      3334
   macro avg    0.55671   0.57578   0.56409      3334
weighted avg    0.92171   0.90792   0.91454      3334



In [27]:
from sklearn.ensemble import AdaBoostClassifier


In [28]:
def train_adaboost(X_train, y_train, n_estimators=50, random_state=42):
    """Fit an AdaBoost classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_estimators: Number of boosting rounds.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``AdaBoostClassifier``.
    """
    booster_options = dict(n_estimators=n_estimators, random_state=random_state)
    booster = AdaBoostClassifier(**booster_options)
    booster.fit(X_train, y_train)
    return booster

# Train the AdaBoost model
adaboost_model = train_adaboost(X_train, y_train)




In [29]:
# Evaluate the AdaBoost model
adaboost_accuracy = evaluate_model(adaboost_model, X_test, y_test)

Accuracy: 0.95021

Classification Report:
               precision    recall  f1-score   support

           0    0.95496   0.99465   0.97440      3176
           1    0.34615   0.05696   0.09783       158

    accuracy                        0.95021      3334
   macro avg    0.65056   0.52580   0.53611      3334
weighted avg    0.92611   0.95021   0.93286      3334



In [30]:
from sklearn.naive_bayes import GaussianNB

In [31]:
def train_naive_bayes(X_train, y_train):
    """Fit a Gaussian Naive Bayes classifier with default settings.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GaussianNB``.
    """
    bayes = GaussianNB()
    bayes.fit(X_train, y_train)
    return bayes

# Train the Naive Bayes model
naive_bayes_model = train_naive_bayes(X_train, y_train)


In [32]:
# Evaluate the Naive Bayes model
naive_bayes_accuracy = evaluate_model(naive_bayes_model, X_test, y_test)

Accuracy: 0.67876

Classification Report:
               precision    recall  f1-score   support

           0    0.97218   0.68230   0.80185      3176
           1    0.08688   0.60759   0.15202       158

    accuracy                        0.67876      3334
   macro avg    0.52953   0.64495   0.47693      3334
weighted avg    0.93023   0.67876   0.77105      3334



In [33]:
from sklearn.svm import SVC

In [34]:
def train_svm(X_train, y_train, kernel='linear', random_state=42):
    """Fit a support-vector classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        kernel: Kernel name forwarded to ``SVC`` (linear by default here).
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``SVC``.
    """
    svc_options = dict(kernel=kernel, random_state=random_state)
    support_vectors = SVC(**svc_options)
    support_vectors.fit(X_train, y_train)
    return support_vectors

# Train the SVM model
svm_model = train_svm(X_train, y_train)

In [35]:
# Evaluate the SVM model
svm_accuracy = evaluate_model(svm_model, X_test, y_test)

Accuracy: 0.95411

Classification Report:
               precision    recall  f1-score   support

           0    0.95404   1.00000   0.97648      3176
           1    1.00000   0.03165   0.06135       158

    accuracy                        0.95411      3334
   macro avg    0.97702   0.51582   0.51891      3334
weighted avg    0.95622   0.95411   0.93311      3334



In [36]:
from sklearn.linear_model import Perceptron

In [37]:
def train_perceptron(X_train, y_train, max_iter=1000, random_state=42):
    """Fit a Perceptron classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        max_iter: Iteration cap forwarded to the estimator.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``Perceptron``.
    """
    perceptron_options = dict(max_iter=max_iter, random_state=random_state)
    linear_unit = Perceptron(**perceptron_options)
    linear_unit.fit(X_train, y_train)
    return linear_unit

# Train the Perceptron model
perceptron_model = train_perceptron(X_train, y_train)


In [38]:
# Evaluate the Perceptron model
perceptron_accuracy = evaluate_model(perceptron_model, X_test, y_test)


Accuracy: 0.95171

Classification Report:
               precision    recall  f1-score   support

           0    0.95284   0.99874   0.97525      3176
           1    0.20000   0.00633   0.01227       158

    accuracy                        0.95171      3334
   macro avg    0.57642   0.50253   0.49376      3334
weighted avg    0.91716   0.95171   0.92961      3334



In [39]:
from sklearn.neural_network import MLPClassifier


In [40]:
def train_mlp(X_train, y_train, hidden_layer_sizes=(100,), max_iter=300, random_state=42):
    """Fit a multi-layer perceptron classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        hidden_layer_sizes: Hidden-layer widths forwarded to the estimator.
        max_iter: Iteration cap forwarded to the estimator.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``MLPClassifier``.
    """
    network_options = dict(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        random_state=random_state,
    )
    network = MLPClassifier(**network_options)
    network.fit(X_train, y_train)
    return network

# Train the MLP model
mlp_model = train_mlp(X_train, y_train)


In [41]:
# Evaluate the MLP model
mlp_accuracy = evaluate_model(mlp_model, X_test, y_test)


Accuracy: 0.94991

Classification Report:
               precision    recall  f1-score   support

           0    0.96023   0.98835   0.97409      3176
           1    0.43077   0.17722   0.25112       158

    accuracy                        0.94991      3334
   macro avg    0.69550   0.58278   0.61260      3334
weighted avg    0.93514   0.94991   0.93983      3334



In [42]:
from sklearn.neighbors import KNeighborsClassifier

def train_knn(X_train, y_train, n_neighbors=5, metric='minkowski', p=2):
    """Fit a k-nearest-neighbours classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_neighbors: Number of neighbours consulted per prediction.
        metric: Distance metric name forwarded to the estimator.
        p: Power parameter forwarded to the estimator.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    neighbour_options = dict(n_neighbors=n_neighbors, metric=metric, p=p)
    neighbours = KNeighborsClassifier(**neighbour_options)
    neighbours.fit(X_train, y_train)
    return neighbours

# Train the kNN model
knn_model = train_knn(X_train, y_train)


In [43]:
# Evaluate the kNN model
knn_accuracy = evaluate_model(knn_model, X_test, y_test)


Accuracy: 0.95231

Classification Report:
               precision    recall  f1-score   support

           0    0.95643   0.99528   0.97547      3176
           1    0.48276   0.08861   0.14973       158

    accuracy                        0.95231      3334
   macro avg    0.71959   0.54194   0.56260      3334
weighted avg    0.93398   0.95231   0.93633      3334



In [44]:
from imblearn.over_sampling import SMOTE

In [45]:
def apply_smote(X_train, y_train):
    """Resample the training split with SMOTE (seed fixed at 42).

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        Tuple ``(features, labels)`` as returned by ``SMOTE.fit_resample``.
    """
    sampler = SMOTE(random_state=42)
    balanced_features, balanced_labels = sampler.fit_resample(X_train, y_train)
    return balanced_features, balanced_labels

# Apply SMOTE to balance the training data
X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)


In [46]:
# Retrain Logistic Regression on resampled data
logistic_model_resampled = train_logistic_regression(X_train_resampled, y_train_resampled)

# Evaluate the Logistic Regression model on resampled data
logistic_accuracy_resampled = evaluate_model(logistic_model_resampled, X_test, y_test)


Accuracy: 0.81074

Classification Report:
               precision    recall  f1-score   support

           0    0.97217   0.82494   0.89252      3176
           1    0.12989   0.52532   0.20828       158

    accuracy                        0.81074      3334
   macro avg    0.55103   0.67513   0.55040      3334
weighted avg    0.93225   0.81074   0.86010      3334



In [47]:
from sklearn.model_selection import train_test_split

def split_data(X_resampled, y_resampled, test_size=0.2, random_state=42):
    """Split features and labels into train and test parts.

    Args:
        X_resampled: Feature matrix to split.
        y_resampled: Labels aligned with ``X_resampled``.
        test_size: Share of rows placed in the test part.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X_resampled,
        y_resampled,
        test_size=test_size,
        random_state=random_state,
    )
    features_train, features_test, labels_train, labels_test = parts
    return features_train, features_test, labels_train, labels_test

# Split the resampled data.
# NOTE(review): the inputs here are the SMOTE output built from X_train, so
# the "test" part produced below is carved out of the resampled training set,
# not from the original X_test. Scores computed on X_test_resampled are
# therefore not comparable with the X_test results above (compare the two
# logistic-regression reports) -- confirm this is intended.
# NOTE(review): this cell overwrites its own inputs (X_train_resampled,
# y_train_resampled), so re-running it splits an already-split set again.
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = split_data(X_train_resampled, y_train_resampled)


In [48]:
# Retrain Logistic Regression on the resampled data
logistic_model_resampled = train_logistic_regression(X_train_resampled, y_train_resampled)

# Evaluate Logistic Regression model on the resampled test set
logistic_accuracy_resampled = evaluate_model(logistic_model_resampled, X_test_resampled, y_test_resampled)


Accuracy: 0.86142

Classification Report:
               precision    recall  f1-score   support

           0    0.90451   0.81311   0.85638      2563
           1    0.82518   0.91133   0.86612      2481

    accuracy                        0.86142      5044
   macro avg    0.86485   0.86222   0.86125      5044
weighted avg    0.86549   0.86142   0.86117      5044



In [49]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, y_train, random_state=42):
    """Fit a random-forest classifier with the library-default tree count.

    Note: this redefines ``train_random_forest`` from an earlier cell and no
    longer accepts ``n_estimators``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X_train, y_train)
    return forest

# Train the Random Forest model on resampled data
rf_model_resampled = train_random_forest(X_train_resampled, y_train_resampled)


In [50]:
# Evaluate the Random Forest model on the resampled test data
rf_accuracy_resampled = evaluate_model(rf_model_resampled, X_test_resampled, y_test_resampled)
print(f"Random Forest Accuracy on SMOTE data: {rf_accuracy_resampled}")


Accuracy: 0.99108

Classification Report:
               precision    recall  f1-score   support

           0    0.99103   0.99142   0.99122      2563
           1    0.99113   0.99073   0.99093      2481

    accuracy                        0.99108      5044
   macro avg    0.99108   0.99107   0.99108      5044
weighted avg    0.99108   0.99108   0.99108      5044

Random Forest Accuracy on SMOTE data: 0.9910785091197463


In [51]:
import xgboost as xgb

def train_xgboost(X_train, y_train, random_state=42):
    """Fit an XGBoost classifier using library defaults plus a fixed seed.

    Note: this redefines ``train_xgboost`` from an earlier cell; unlike that
    version it passes no ``eval_metric`` or ``use_label_encoder``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    booster = xgb.XGBClassifier(random_state=random_state)
    booster.fit(X_train, y_train)
    return booster

# Train the XGBoost model on resampled data
xgboost_model_resampled = train_xgboost(X_train_resampled, y_train_resampled)


In [52]:
# Evaluate the XGBoost model on the resampled test data
xgboost_accuracy_resampled = evaluate_model(xgboost_model_resampled, X_test_resampled, y_test_resampled)
print(f"XGBoost Accuracy on SMOTE data: {xgboost_accuracy_resampled}")


Accuracy: 0.98454

Classification Report:
               precision    recall  f1-score   support

           0    0.99800   0.97152   0.98458      2563
           1    0.97136   0.99798   0.98449      2481

    accuracy                        0.98454      5044
   macro avg    0.98468   0.98475   0.98454      5044
weighted avg    0.98490   0.98454   0.98454      5044

XGBoost Accuracy on SMOTE data: 0.9845360824742269


In [53]:
from sklearn.tree import DecisionTreeClassifier

def train_decision_tree(X_train, y_train, random_state=42):
    """Fit a decision-tree classifier on the given split.

    Note: this repeats the ``train_decision_tree`` definition from an earlier
    cell with the same signature and settings.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(X_train, y_train)
    return tree

# Train the Decision Tree model on resampled data
decision_tree_model_resampled = train_decision_tree(X_train_resampled, y_train_resampled)


In [54]:
# Evaluate the Decision Tree model on the resampled test data
decision_tree_accuracy_resampled = evaluate_model(decision_tree_model_resampled, X_test_resampled, y_test_resampled)
print(f"Decision Tree Accuracy on SMOTE data: {decision_tree_accuracy_resampled}")


Accuracy: 0.88204

Classification Report:
               precision    recall  f1-score   support

           0    0.91137   0.85057   0.87992      2563
           1    0.85558   0.91455   0.88408      2481

    accuracy                        0.88204      5044
   macro avg    0.88348   0.88256   0.88200      5044
weighted avg    0.88393   0.88204   0.88197      5044

Decision Tree Accuracy on SMOTE data: 0.8820380650277557


In [55]:
from sklearn.ensemble import AdaBoostClassifier

def train_adaboost(X_train, y_train, random_state=42):
    """Fit an AdaBoost classifier with the library-default number of rounds.

    Note: this redefines ``train_adaboost`` from an earlier cell and no longer
    accepts ``n_estimators``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``AdaBoostClassifier``.
    """
    booster = AdaBoostClassifier(random_state=random_state)
    booster.fit(X_train, y_train)
    return booster

# Train the AdaBoost model on resampled data
adaboost_model_resampled = train_adaboost(X_train_resampled, y_train_resampled)




In [56]:
# Evaluate the AdaBoost model on the resampled test data
adaboost_accuracy_resampled = evaluate_model(adaboost_model_resampled, X_test_resampled, y_test_resampled)
print(f"AdaBoost Accuracy on SMOTE data: {adaboost_accuracy_resampled}")


Accuracy: 0.79342

Classification Report:
               precision    recall  f1-score   support

           0    0.81155   0.77292   0.79177      2563
           1    0.77641   0.81459   0.79504      2481

    accuracy                        0.79342      5044
   macro avg    0.79398   0.79376   0.79340      5044
weighted avg    0.79427   0.79342   0.79338      5044

AdaBoost Accuracy on SMOTE data: 0.7934179222839016


In [57]:
from sklearn.naive_bayes import GaussianNB

def train_naive_bayes(X_train, y_train):
    """Fit a Gaussian Naive Bayes classifier with default settings.

    Note: this repeats the ``train_naive_bayes`` definition from an earlier
    cell.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GaussianNB``.
    """
    bayes = GaussianNB()
    bayes.fit(X_train, y_train)
    return bayes

# Train the Naive Bayes model on resampled data
naive_bayes_model_resampled = train_naive_bayes(X_train_resampled, y_train_resampled)


In [58]:
# Evaluate the Naive Bayes model on the resampled test data
naive_bayes_accuracy_resampled = evaluate_model(naive_bayes_model_resampled, X_test_resampled, y_test_resampled)
print(f"Naive Bayes Accuracy on SMOTE data: {naive_bayes_accuracy_resampled}")


Accuracy: 0.66872

Classification Report:
               precision    recall  f1-score   support

           0    0.69459   0.62115   0.65582      2563
           1    0.64717   0.71786   0.68068      2481

    accuracy                        0.66872      5044
   macro avg    0.67088   0.66950   0.66825      5044
weighted avg    0.67126   0.66872   0.66805      5044

Naive Bayes Accuracy on SMOTE data: 0.6687153053132434


In [59]:
from sklearn.svm import SVC

def train_svm(X_train, y_train, random_state=42):
    """Fit a support-vector classifier using the library-default kernel.

    Note: this redefines ``train_svm`` from an earlier cell. That version
    took a ``kernel`` argument defaulting to 'linear'; this one passes no
    kernel, so ``SVC``'s own default applies.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``SVC``.
    """
    support_vectors = SVC(random_state=random_state)
    support_vectors.fit(X_train, y_train)
    return support_vectors

# Train the SVM model on resampled data
svm_model_resampled = train_svm(X_train_resampled, y_train_resampled)


In [60]:
# Evaluate the SVM model on the resampled test data
svm_accuracy_resampled = evaluate_model(svm_model_resampled, X_test_resampled, y_test_resampled)
print(f"SVM Accuracy on SMOTE data: {svm_accuracy_resampled}")


Accuracy: 0.94508

Classification Report:
               precision    recall  f1-score   support

           0    0.95214   0.93913   0.94559      2563
           1    0.93800   0.95123   0.94457      2481

    accuracy                        0.94508      5044
   macro avg    0.94507   0.94518   0.94508      5044
weighted avg    0.94518   0.94508   0.94509      5044

SVM Accuracy on SMOTE data: 0.9450832672482157


In [61]:
from sklearn.linear_model import Perceptron

def train_perceptron(X_train, y_train, random_state=42):
    """Fit a Perceptron classifier with the library-default iteration cap.

    Note: this redefines ``train_perceptron`` from an earlier cell and no
    longer accepts ``max_iter``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``Perceptron``.
    """
    linear_unit = Perceptron(random_state=random_state)
    linear_unit.fit(X_train, y_train)
    return linear_unit

# Train the Perceptron model on resampled data
perceptron_model_resampled = train_perceptron(X_train_resampled, y_train_resampled)


In [62]:
# Evaluate the Perceptron model on the resampled test data
perceptron_accuracy_resampled = evaluate_model(perceptron_model_resampled, X_test_resampled, y_test_resampled)
print(f"Perceptron Accuracy on SMOTE data: {perceptron_accuracy_resampled}")

Accuracy: 0.81582

Classification Report:
               precision    recall  f1-score   support

           0    0.81839   0.81935   0.81887      2563
           1    0.81316   0.81217   0.81266      2481

    accuracy                        0.81582      5044
   macro avg    0.81578   0.81576   0.81577      5044
weighted avg    0.81582   0.81582   0.81582      5044

Perceptron Accuracy on SMOTE data: 0.8158207771609833


In [63]:
from sklearn.neural_network import MLPClassifier

def train_mlp(X_train, y_train, random_state=42):
    """Fit a multi-layer perceptron classifier capped at 500 iterations.

    Note: this redefines ``train_mlp`` from an earlier cell. It no longer
    accepts ``hidden_layer_sizes`` or ``max_iter``, and the iteration cap is
    fixed at 500 instead of the earlier default of 300.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``MLPClassifier``.
    """
    network_options = dict(random_state=random_state, max_iter=500)
    network = MLPClassifier(**network_options)
    network.fit(X_train, y_train)
    return network

# Train the MLP model on resampled data
mlp_model_resampled = train_mlp(X_train_resampled, y_train_resampled)


In [64]:
# Evaluate the MLP model on the resampled test data
mlp_accuracy_resampled = evaluate_model(mlp_model_resampled, X_test_resampled, y_test_resampled)
print(f"MLP Accuracy on SMOTE data: {mlp_accuracy_resampled}")


Accuracy: 0.97343

Classification Report:
               precision    recall  f1-score   support

           0    1.00000   0.94772   0.97316      2563
           1    0.94876   1.00000   0.97370      2481

    accuracy                        0.97343      5044
   macro avg    0.97438   0.97386   0.97343      5044
weighted avg    0.97480   0.97343   0.97343      5044

MLP Accuracy on SMOTE data: 0.9734337827121332


In [65]:
from sklearn.neighbors import KNeighborsClassifier

def train_knn(X_train, y_train, n_neighbors=5):
    """Fit a k-nearest-neighbours classifier on the given split.

    Note: this redefines ``train_knn`` from an earlier cell and no longer
    accepts ``metric`` or ``p``; the estimator's own defaults apply.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_neighbors: Number of neighbours consulted per prediction.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    neighbours = KNeighborsClassifier(n_neighbors=n_neighbors)
    neighbours.fit(X_train, y_train)
    return neighbours

# Train the KNN model on resampled data
knn_model_resampled = train_knn(X_train_resampled, y_train_resampled)


In [66]:
# Evaluate the KNN model on the resampled test data
knn_accuracy_resampled = evaluate_model(knn_model_resampled, X_test_resampled, y_test_resampled)
print(f"KNN Accuracy on SMOTE data: {knn_accuracy_resampled}")


Accuracy: 0.76864

Classification Report:
               precision    recall  f1-score   support

           0    1.00000   0.54467   0.70523      2563
           1    0.68010   1.00000   0.80959      2481

    accuracy                        0.76864      5044
   macro avg    0.84005   0.77234   0.75741      5044
weighted avg    0.84265   0.76864   0.75656      5044

KNN Accuracy on SMOTE data: 0.7686360031720857


In [67]:
# Checking the number of features in your dataset
num_features = X_train_resampled.shape[1]
print(f"The dataset has {num_features} features.")


The dataset has 768 features.


In [68]:
# Extract per-feature importances from the Random Forest fitted on the
# resampled training data.
rf_feature_importances = rf_model_resampled.feature_importances_

# Pair each importance with a feature label: column names when the training
# data has them, otherwise the positional index of the embedding dimension.
rf_importance_df = pd.DataFrame({
    "Feature": X_train_resampled.columns if hasattr(X_train_resampled, 'columns') else range(len(rf_feature_importances)),
    "Importance": rf_feature_importances
})

# Sort so the most important features come first
rf_importance_df = rf_importance_df.sort_values(by="Importance", ascending=False)

# Select the 15 most important features
top_15_features_rf = rf_importance_df.head(15)
print(top_15_features_rf)


     Feature  Importance
685      685    0.008952
127      127    0.008950
406      406    0.007081
46        46    0.006471
370      370    0.005836
666      666    0.005742
339      339    0.005441
109      109    0.005380
251      251    0.005369
161      161    0.005004
1          1    0.004837
588      588    0.004222
752      752    0.004136
273      273    0.003823
198      198    0.003779


In [71]:
# Extract per-feature importances from the Random Forest fitted on the
# resampled training data.
rf_feature_importances = rf_model_resampled.feature_importances_

# Pair each importance with a feature label: column names when the training
# data has them, otherwise the positional index of the embedding dimension.
rf_importance_df = pd.DataFrame({
    "Feature": X_train_resampled.columns if hasattr(X_train_resampled, 'columns') else range(len(rf_feature_importances)),
    "Importance": rf_feature_importances
})

# Sort so the most important features come first
rf_importance_df = rf_importance_df.sort_values(by="Importance", ascending=False)

# Select the 15 LEAST important features: tail() of the descending sort.
# NOTE(review): the variable is still named top_15_features_rf and this
# overwrites the value computed by the head(15) cell above -- consider a
# distinct name.
top_15_features_rf = rf_importance_df.tail(15)
print(top_15_features_rf)


     Feature  Importance
131      131    0.000714
396      396    0.000712
519      519    0.000706
14        14    0.000702
136      136    0.000691
234      234    0.000677
292      292    0.000673
188      188    0.000662
537      537    0.000659
391      391    0.000655
552      552    0.000653
718      718    0.000651
84        84    0.000640
266      266    0.000632
282      282    0.000618


In [None]:
import shap
def shap_for_tree_model(model, X_resampled):
    """Compute SHAP values for a tree-based model.

    Args:
        model: Fitted tree-based model handed to ``shap.TreeExplainer``.
        X_resampled: Samples to explain.

    Returns:
        Whatever ``TreeExplainer.shap_values`` returns for ``X_resampled``,
        computed with the additivity check turned off.
    """
    tree_explainer = shap.TreeExplainer(model)
    return tree_explainer.shap_values(X_resampled, check_additivity=False)

In [None]:
# The two highest-ranked embedding dimensions from the Random Forest
# importance table printed above.
selected_features = [685, 127]
# X_train_resampled has no `.columns` here (the importance cell fell back to
# positional indices), so it has no `.iloc` either. Wrapping it in a
# DataFrame makes positional selection work for both arrays and DataFrames.
X_selected_features = pd.DataFrame(X_train_resampled).iloc[:, selected_features]

In [None]:
# Single instance (first row) restricted to the selected features
instance_selected = X_selected_features.iloc[0:1]

# Compute SHAP values for the instance.
# NOTE(review): rf_model_resampled was fitted on all 768 features, while
# instance_selected carries only the selected columns -- the explainer will
# presumably reject or misread this input; confirm before relying on it.
shap_values_instance = shap.TreeExplainer(rf_model_resampled).shap_values(instance_selected)

# Visualize feature impact for the instance.
# NOTE(review): a second TreeExplainer is built here only to read
# expected_value; the [1] indexing assumes a per-class layout -- verify
# against the installed shap version.
shap.force_plot(
    base_value=shap.TreeExplainer(rf_model_resampled).expected_value[1],
    shap_values=shap_values_instance[1],
    features=instance_selected,
)


In [None]:
#User Input

In [75]:
def get_embeddings_for_input(prompt, answer, tokenizer, model):
    """Generate the BERT embedding for a single user-supplied prompt/answer.

    The returned vector is the hidden state of the first ([CLS]) token, the
    same feature ``compute_bert_embeddings`` produces for the training data.
    The previous mean-over-tokens pooling gave the classifiers a feature they
    were never trained on.

    Args:
        prompt: Prompt text.
        answer: Answer text.
        tokenizer: Callable turning the combined text into model inputs.
        model: Callable returning an object with ``last_hidden_state``.

    Returns:
        1-D NumPy array with the [CLS] hidden state.
    """
    # Same text layout as used for the training embeddings.
    text = f"{prompt} [SEP] {answer}"

    # Tokenize and encode
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Generate embeddings using BERT without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()

    return embeddings


In [76]:
def predict_user_input(prompt, answer, model, tokenizer, bert_model):
    """Predict the label for one prompt/answer pair.

    Note: a later cell defines ``predict_user_input`` again, which replaces
    this version once that cell has run.

    Args:
        prompt: Prompt text.
        answer: Answer text.
        model: Fitted classifier exposing ``predict``.
        tokenizer: Tokenizer forwarded to ``get_embeddings_for_input``.
        bert_model: BERT model forwarded to ``get_embeddings_for_input``.

    Returns:
        The first element of the classifier's prediction.
    """
    vector = get_embeddings_for_input(prompt, answer, tokenizer, bert_model)
    # The classifier expects a 2-D batch, so present the vector as one row.
    single_row = vector.reshape(1, -1)
    predicted = model.predict(single_row)
    return predicted[0]


In [77]:
from transformers import BertTokenizer, BertModel
import torch

# Reload BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Define the function to embed user input and predict
def predict_user_input(prompt, answer, model, tokenizer, bert_model):
    """Predict the label for a user-provided prompt and answer.

    The classifier is fed the hidden state of the first ([CLS]) token, the
    same feature ``compute_bert_embeddings`` produces for the training data.
    The previous ``pooler_output`` is a different vector, so predictions were
    made on features the classifier was never trained on.

    Args:
        prompt: Prompt text.
        answer: Answer text.
        model: Fitted classifier exposing ``predict``.
        tokenizer: Callable turning the combined text into model inputs.
        bert_model: Callable returning an object with ``last_hidden_state``.

    Returns:
        The first element of the classifier's prediction.
    """
    # Combine prompt and answer exactly as done for the training embeddings
    combined_text = f"{prompt} [SEP] {answer}"

    # Tokenize and encode the input
    inputs = tokenizer(
        combined_text, return_tensors="pt", padding="max_length", truncation=True, max_length=512
    )

    # Get the [CLS] embedding from BERT; shape (1, hidden) suits predict()
    with torch.no_grad():
        embeddings = bert_model(**inputs).last_hidden_state[:, 0, :].numpy()

    # Predict using the provided model
    prediction = model.predict(embeddings)
    return prediction[0]

# Example user input
user_prompt = input("Enter a prompt: ")
user_answer = input("Enter an answer: ")

# Predict using the Random Forest model (best model in this case)
best_prediction = predict_user_input(user_prompt, user_answer, rf_model_resampled, tokenizer, bert_model)

# Output the prediction
print(f"The predicted label for the given input is: {best_prediction}")


The predicted label for the given input is: 0
