In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Drop rows with null values from the train dataset
train = train.dropna(subset=['Id', 'Prompt', 'Answer', 'Target'])

# Drop rows with null values from the test dataset (excluding 'Target' as it doesn't exist)
test = test.dropna(subset=['Id', 'Prompt', 'Answer'])

# Display the datasets to check the changes
print("Train dataset after removing nulls:\n", train.head())
print("Test dataset after removing nulls:\n", test.head())

# Save the cleaned datasets to new CSV files
train.to_csv('cleaned_train.csv', index=False)
test.to_csv('cleaned_test.csv', index=False)

Train dataset after removing nulls:
       Id                                             Prompt  \
0  11527  [INST] You are an AI assistant that helps peop...   
1   7322  [INST] You are an AI assistant. You will be gi...   
2  11742  [INST] You are an AI assistant. You will be gi...   
3  20928  [INST] You are an AI assistant. User will you ...   
4  25830  [INST] You are an AI assistant. User will you ...   

                                              Answer  Target  
0  Step-by-step reasoning process:\n1. Randy spen...       0  
1  What is the temperature at which hypothermia b...       0  
2  Answer: c) No. \n\nThe hypothesis is false bec...       0  
3                                         Prismatoid       0  
4                                             Case B       0  
Test dataset after removing nulls:
       Id                                             Prompt  \
0  20568  [INST] You are an AI assistant. You will be gi...   
1  17686  question:Question: This article: A

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Load the cleaned datasets
train = pd.read_csv('cleaned_train.csv')
test = pd.read_csv('cleaned_test.csv')

# Drop rows with null values from the train dataset (already done)
# Drop rows with null values from the test dataset (already done)

# Define features and target variable
X = train[['Prompt', 'Answer']]  # Adjust based on your actual features
y = train['Target']

# Split into train and validation sets (if needed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)

# Since SMOTE works on numerical data, you might need to encode categorical features.
# If 'Prompt' and 'Answer' are text data, consider using a text vectorization technique (e.g., TF-IDF).
# Here’s a simple encoding as an example. Adjust as necessary for your data.

# Example: Encoding categorical text features
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train['Prompt'] + ' ' + X_train['Answer'])
X_val_vectorized = vectorizer.transform(X_val['Prompt'] + ' ' + X_val['Answer'])

# Resample the training data using SMOTE
X_resampled, y_resampled = smote.fit_resample(X_train_vectorized, y_train)

# Standardize the data (optional, but recommended for certain models)
# Here, it may not be necessary since we are using TF-IDF vectors.

# Check the balance of classes after SMOTE
print("Class distribution after SMOTE:\n", pd.Series(y_resampled).value_counts())

# Now you can train your model with the resampled data
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train a model
model = RandomForestClassifier()
model.fit(X_resampled, y_resampled)

# Evaluate on the validation set
y_val_pred = model.predict(X_val_vectorized)
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# Optionally, evaluate on the test set (if you have true labels)
# X_test_vectorized = vectorizer.transform(test['Prompt'] + ' ' + test['Answer'])
# y_test_pred = model.predict(X_test_vectorized)





Class distribution after SMOTE:
 Target
0    12610
1    12610
Name: count, dtype: int64
[[3153   23]
 [ 141   17]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      3176
           1       0.42      0.11      0.17       158

    accuracy                           0.95      3334
   macro avg       0.69      0.55      0.57      3334
weighted avg       0.93      0.95      0.94      3334



In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the cleaned datasets
train = pd.read_csv('cleaned_train.csv')

# Define features and target variable
X = train[['Prompt', 'Answer']]  # Adjust based on your actual features
y = train['Target']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings with batch processing
def get_bert_embeddings(text_list, batch_size=32):
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_text = text_list[i:i + batch_size]

        # Tokenize the batch and convert to PyTorch tensors
        inputs = tokenizer(batch_text, return_tensors='pt', padding=True, truncation=True, max_length=128)

        # Forward pass through BERT to get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            # Take the mean of the last hidden states to get the sentence embedding
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
            embeddings.append(batch_embeddings)

    # Concatenate all batches to form the final embedding matrix
    return np.concatenate(embeddings)

# Generate embeddings for training data
print("Generating BERT embeddings for prompts and answers...")
prompts_train = (X_train['Prompt'] + ' ' + X_train['Answer']).tolist()
prompt_embeddings_train = get_bert_embeddings(prompts_train)

# Generate embeddings for validation data
prompts_val = (X_val['Prompt'] + ' ' + X_val['Answer']).tolist()
prompt_embeddings_val = get_bert_embeddings(prompts_val)

# Check the shape of the embeddings
print("Train embeddings shape:", prompt_embeddings_train.shape)
print("Validation embeddings shape:", prompt_embeddings_val.shape)

# Apply SMOTE to the embeddings
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(prompt_embeddings_train, y_train)

# Check the balance of classes after SMOTE
print("Class distribution after SMOTE:\n", pd.Series(y_resampled).value_counts())

# Train a classifier on the resampled data
classifier = RandomForestClassifier()
classifier.fit(X_resampled, y_resampled)

# Evaluate on the validation set
y_val_pred = classifier.predict(prompt_embeddings_val)
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# Optionally, evaluate on the test set if you have true labels
# X_test = pd.read_csv('cleaned_test.csv')
# prompts_test = (X_test['Prompt'] + ' ' + X_test['Answer']).tolist()
# prompt_embeddings_test = get_bert_embeddings(prompts_test)
# y_test_pred = classifier.predict(prompt_embeddings_test)


  from .autonotebook import tqdm as notebook_tqdm


Generating BERT embeddings for prompts and answers...
Train embeddings shape: (13334, 768)
Validation embeddings shape: (3334, 768)
Class distribution after SMOTE:
 Target
0    12610
1    12610
Name: count, dtype: int64
[[3159   17]
 [ 144   14]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      3176
           1       0.45      0.09      0.15       158

    accuracy                           0.95      3334
   macro avg       0.70      0.54      0.56      3334
weighted avg       0.93      0.95      0.94      3334



In [5]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the cleaned datasets
train = pd.read_csv('cleaned_train.csv')

# Define features and target variable
X = train[['Prompt', 'Answer']]  # Adjust based on your actual features
y = train['Target']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)  # Move the model to GPU

# Function to get BERT embeddings with batch processing
def get_bert_embeddings(text_list, batch_size=32):
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_text = text_list[i:i + batch_size]

        # Tokenize the batch and convert to PyTorch tensors
        inputs = tokenizer(batch_text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)

        # Forward pass through BERT to get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            # Take the mean of the last hidden states to get the sentence embedding
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move to CPU for numpy conversion
            embeddings.append(batch_embeddings)

    # Concatenate all batches to form the final embedding matrix
    return np.concatenate(embeddings)

# Generate embeddings for training data
print("Generating BERT embeddings for prompts and answers...")
prompts_train = (X_train['Prompt'] + ' ' + X_train['Answer']).tolist()
prompt_embeddings_train = get_bert_embeddings(prompts_train)

# Generate embeddings for validation data
prompts_val = (X_val['Prompt'] + ' ' + X_val['Answer']).tolist()
prompt_embeddings_val = get_bert_embeddings(prompts_val)

# Check the shape of the embeddings
print("Train embeddings shape:", prompt_embeddings_train.shape)
print("Validation embeddings shape:", prompt_embeddings_val.shape)

# Apply SMOTE to the embeddings
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(prompt_embeddings_train, y_train)

# Check the balance of classes after SMOTE
print("Class distribution after SMOTE:\n", pd.Series(y_resampled).value_counts())

# Train a classifier on the resampled data
classifier = RandomForestClassifier()
classifier.fit(X_resampled, y_resampled)

# Evaluate on the validation set
y_val_pred = classifier.predict(prompt_embeddings_val)
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# Optionally, evaluate on the test set if you have true labels
# X_test = pd.read_csv('cleaned_test.csv')
# prompts_test = (X_test['Prompt'] + ' ' + X_test['Answer']).tolist()
# prompt_embeddings_test = get_bert_embeddings(prompts_test)
# y_test_pred = classifier.predict(prompt_embeddings_test)


Generating BERT embeddings for prompts and answers...
Train embeddings shape: (13334, 768)
Validation embeddings shape: (3334, 768)
Class distribution after SMOTE:
 Target
0    12610
1    12610
Name: count, dtype: int64
[[3163   13]
 [ 143   15]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3176
           1       0.54      0.09      0.16       158

    accuracy                           0.95      3334
   macro avg       0.75      0.55      0.57      3334
weighted avg       0.94      0.95      0.94      3334



In [6]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the cleaned datasets
train = pd.read_csv('cleaned_train.csv')

# Define features and target variable
X = train[['Prompt', 'Answer']]
y = train['Target']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to get BERT embeddings with batch processing
def get_bert_embeddings(text_list, batch_size=32):
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_text = text_list[i:i + batch_size]
        inputs = tokenizer(batch_text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.append(batch_embeddings)

    return np.concatenate(embeddings)

# Generate embeddings for training and validation data
print("Generating BERT embeddings for prompts and answers...")
prompts_train = (X_train['Prompt'] + ' ' + X_train['Answer']).tolist()
prompt_embeddings_train = get_bert_embeddings(prompts_train)

prompts_val = (X_val['Prompt'] + ' ' + X_val['Answer']).tolist()
prompt_embeddings_val = get_bert_embeddings(prompts_val)

# Check the shape of the embeddings
print("Train embeddings shape:", prompt_embeddings_train.shape)
print("Validation embeddings shape:", prompt_embeddings_val.shape)

# Apply SMOTE to the embeddings
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(prompt_embeddings_train, y_train)

# Check the balance of classes after SMOTE
print("Class distribution after SMOTE:\n", pd.Series(y_resampled).value_counts())

# List of classifiers to evaluate
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
}

# Initialize a DataFrame to store results
results_df = pd.DataFrame(columns=["Classifier", "Val Accuracy", "Val F1", "Val Precision", "Val Recall"])

# Function to evaluate each classifier
def evaluate_classifier(clf_name, clf, X_train, y_train, X_val, y_val):
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    
    accuracy = (y_val_pred == y_val).mean()
    f1 = f1_score(y_val, y_val_pred, average='weighted')
    precision = precision_score(y_val, y_val_pred, average='weighted')
    recall = recall_score(y_val, y_val_pred, average='weighted')
    
    results_df.loc[len(results_df)] = [clf_name, accuracy, f1, precision, recall]

# Loop through classifiers and evaluate each one
for clf_name, clf in classifiers.items():
    print(f"Evaluating {clf_name}...")
    evaluate_classifier(clf_name, clf, X_resampled, y_resampled, prompt_embeddings_val, y_val)

# Display the results
print(results_df)

# Optionally, save the results to a CSV file
results_df.to_csv('classifier_comparison_results.csv', index=False)


Generating BERT embeddings for prompts and answers...
Train embeddings shape: (13334, 768)
Validation embeddings shape: (3334, 768)
Class distribution after SMOTE:
 Target
0    12610
1    12610
Name: count, dtype: int64
Evaluating Logistic Regression...


NameError: name 'f1_score' is not defined