In [None]:
# Attach Google Drive to the Colab runtime so the CSV and model files referenced
# below under /content/drive are reachable. Only applicable inside Google Colab.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Grid Search CV

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle

# --- Data ---------------------------------------------------------------------
# Training and validation CSVs are read from the mounted Drive. The code below
# uses the 'title' column as the text input and 'label' (cast to int) as target.
train_data_path = '/content/drive/MyDrive/D1/train.csv'
val_data_path = '/content/drive/MyDrive/D1/val.csv'
train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)

X_train, y_train = train_data['title'], train_data['label'].astype(int)
X_val, y_val = val_data['title'], val_data['label'].astype(int)

# --- Model --------------------------------------------------------------------
# TF-IDF (vocabulary capped at 500 terms to keep the search fast) feeding a
# multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features=500)),
    ('nb', MultinomialNB()),
])

# Only the Naive Bayes step is tuned: the smoothing strength and whether class
# priors are learned from the data.
param_grid = dict(
    nb__alpha=[0.1, 0.5, 1.0],
    nb__fit_prior=[True, False],
)

# Exhaustive search with 3-fold cross-validation on the training split.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# --- Validation ---------------------------------------------------------------
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_val)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2%}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")

# --- Persist ------------------------------------------------------------------
# The whole pipeline (vectorizer + classifier) is pickled so raw text can be
# passed straight to predict() after loading.
model_save_path = '/content/drive/MyDrive/D1/naive/gridsearch_pipeline.pkl'
with open(model_save_path, 'wb') as model_file:
    pickle.dump(best_pipeline, model_file)

print(f"Model saved at {model_save_path}")

# --- Test ---------------------------------------------------------------------
test_data_path = '/content/drive/MyDrive/D1/test.csv'
test_data = pd.read_csv(test_data_path)
X_test, y_test = test_data['title'], test_data['label'].astype(int)

# Evaluate the pickled artefact rather than the in-memory object, so the saved
# file itself is what gets checked.
with open(model_save_path, 'rb') as model_file:
    loaded_pipeline = pickle.load(model_file)

y_test_pred = loaded_pipeline.predict(X_test)

print(f"Accuracy on test set: {accuracy_score(y_test, y_test_pred):.2%}")
print("Classification Report:", classification_report(y_test, y_test_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_test_pred), sep="\n")

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
from transformers import BertTokenizer, BertModel

In [None]:
# Read the three dataset splits from Drive and separate text from labels.
train_data_path = '/content/drive/MyDrive/D1/train.csv'
val_data_path = '/content/drive/MyDrive/D1/val.csv'
test_data_path = '/content/drive/MyDrive/D1/test.csv'

# Files are read in train -> val -> test order.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, val_data_path, test_data_path)
)

# 'title' is the text feature; 'label' is cast to int for the classifiers.
X_train, y_train = train_data['title'], train_data['label'].astype(int)
X_val, y_val = val_data['title'], val_data['label'].astype(int)
X_test, y_test = test_data['title'], test_data['label'].astype(int)

In [None]:
# TF-IDF + multinomial Naive Bayes, tuned with an exhaustive grid search over
# the Naive Bayes smoothing strength and class-prior setting.
nb_steps = [
    ('vectorizer', TfidfVectorizer(max_features=500)),
    ('nb', MultinomialNB()),
]
pipeline = Pipeline(nb_steps)

param_grid = dict(nb__alpha=[0.1, 0.5, 1.0], nb__fit_prior=[True, False])

# 3-fold cross-validation on the training split, all cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Score the refitted best pipeline on the held-out validation split.
best_pipeline = grid_search.best_estimator_
y_pred_val_nb = best_pipeline.predict(X_val)
val_accuracy_nb = accuracy_score(y_val, y_pred_val_nb)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2%}")
print(f"Accuracy on validation set: {val_accuracy_nb:.2%}")
print("Classification Report:", classification_report(y_val, y_pred_val_nb), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred_val_nb), sep="\n")

# Persist the best pipeline; the BERT ensemble cells further down load this file.
model_save_path = '/content/drive/MyDrive/D1/D1_Naive/gridsearch_pipeline.pkl'
with open(model_save_path, 'wb') as model_file:
    pickle.dump(best_pipeline, model_file)

# Bayes Optimization

In [None]:
!pip install scikit-optimize

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from skopt import BayesSearchCV
from skopt.space import Real, Categorical
import pickle

# Paths to the data splits and to the pickled result of this search
train_data_path = '/content/drive/MyDrive/D1/train.csv'
val_data_path = '/content/drive/MyDrive/D1/val.csv'
test_data_path = '/content/drive/MyDrive/D1/test.csv'
model_save_path = '/content/drive/MyDrive/D1/naive/bayes_pipeline.pkl'

# Fixed seed so the Bayesian search proposes the same candidates on every run;
# without it the reported "best parameters" change between executions.
SEED = 42

# Load data
train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)

# 'title' is the text feature; 'label' is cast to int for the classifier
X_train = train_data['title']
y_train = train_data['label'].astype(int)

X_val = val_data['title']
y_val = val_data['label'].astype(int)

X_test = test_data['title']
y_test = test_data['label'].astype(int)

# Pipeline: TF-IDF (vocabulary capped at 500 terms) -> multinomial Naive Bayes
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=500)),
    ('nb', MultinomialNB())
])

# Search space: alpha is sampled on a log scale between 1e-3 and 1.0,
# fit_prior is a categorical on/off switch.
param_space = {
    'nb__alpha': Real(1e-3, 1.0, prior='log-uniform'),
    'nb__fit_prior': Categorical([True, False])
}

# Bayesian optimisation: 20 candidate settings, each scored by 3-fold
# cross-validated accuracy on the training split.
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=3,
    n_iter=20,
    verbose=2,
    n_jobs=-1,
    random_state=SEED
)

bayes_search.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the validation split
best_pipeline = bayes_search.best_estimator_
y_pred_val = best_pipeline.predict(X_val)

print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best cross-validation score: {bayes_search.best_score_:.2%}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred_val):.2%}")
print("Classification Report (Validation Set):")
print(classification_report(y_val, y_pred_val))
print("Confusion Matrix (Validation Set):")
print(confusion_matrix(y_val, y_pred_val))

# Save the best pipeline (vectorizer + classifier) using pickle
with open(model_save_path, 'wb') as f:
    pickle.dump(best_pipeline, f)
print(f"Model saved at {model_save_path}")

# Evaluate the saved model on the test set
print("\nEvaluating on Test Set:")

# Reload from disk so the pickled artefact itself is what gets evaluated
with open(model_save_path, 'rb') as f:
    trained_model = pickle.load(f)

y_pred_test = trained_model.predict(X_test)

print(f"Accuracy on test set: {accuracy_score(y_test, y_pred_test):.2%}")
print("Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_pred_test))

# PBT

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.base import clone
from sklearn.pipeline import Pipeline
import pickle

# Load the training and validation datasets
train_data_path = '/content/drive/MyDrive/D1/train.csv'
train_data = pd.read_csv(train_data_path)

val_data_path = '/content/drive/MyDrive/D1/val.csv'
val_data = pd.read_csv(val_data_path)

# 'title' is the text feature; 'label' is cast to int for the classifier
X_train = train_data['title']
y_train = train_data['label'].astype(int)
X_val = val_data['title']
y_val = val_data['label'].astype(int)

# Vectorise once up front: every population member shares the same TF-IDF
# features, only the Naive Bayes hyperparameters differ between members.
vectorizer = TfidfVectorizer(max_features=500)  # capped vocabulary for faster computation
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Search configuration
SEED = 42                         # fixed seed so the search is reproducible
ALPHA_MIN, ALPHA_MAX = 1e-3, 1.0  # allowed range for the smoothing parameter
population_size = 10
iterations = 10
rng = np.random.default_rng(SEED)


def _train_member(alpha, fit_prior):
    """Fit one Naive Bayes model and return it as a (model, alpha, fit_prior) member."""
    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
    model.fit(X_train_tfidf, y_train)
    return model, alpha, fit_prior


def _validation_accuracy(member):
    """Accuracy of a population member's model on the validation features."""
    return accuracy_score(y_val, member[0].predict(X_val_tfidf))


# Initial population with random hyperparameters
population = [
    _train_member(float(rng.uniform(ALPHA_MIN, ALPHA_MAX)), bool(rng.integers(2)))
    for _ in range(population_size)
]

# Perform PBT: each round keeps the best half and replaces the rest with
# perturbed copies of the survivors.
n_survivors = max(1, population_size // 2)
for iteration in range(iterations):
    # Rank the whole population by validation accuracy, best first
    ranked = sorted(population, key=_validation_accuracy, reverse=True)
    survivors = ranked[:n_survivors]

    offspring = []
    for i in range(population_size - n_survivors):
        # Cycle through the survivors so odd population sizes also work
        _, parent_alpha, parent_fit_prior = survivors[i % n_survivors]

        # Explore: scale alpha by a random factor in [0.8, 1.2), kept within bounds
        new_alpha = float(np.clip(parent_alpha * rng.uniform(0.8, 1.2), ALPHA_MIN, ALPHA_MAX))
        # With probability 0.5 resample fit_prior, otherwise inherit it from the parent
        new_fit_prior = bool(rng.integers(2)) if rng.random() < 0.5 else parent_fit_prior

        offspring.append(_train_member(new_alpha, new_fit_prior))

    # Rebuild the population from the ranked survivors plus their offspring.
    # Writing replacements into fixed list positions instead would overwrite
    # whichever members happen to sit there, regardless of their score.
    population = survivors + offspring

# Select the best model from the final population
best_model, best_alpha, best_fit_prior = max(population, key=_validation_accuracy)

# Bundle vectorizer and classifier so raw text can be passed to predict()
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('nb', best_model)
])

# Refit the whole pipeline on the training data with the best hyperparameters
pipeline.fit(X_train, y_train)

# Evaluate the best model on validation data
y_pred = pipeline.predict(X_val)

# Print best hyperparameters and evaluation metrics
print(f"Best alpha: {best_alpha}")
print(f"Best fit_prior: {best_fit_prior}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Save the pipeline (including vectorizer and best model) using pickle
model_save_path = '/content/drive/MyDrive/D1/naive/pbt_pipeline.pkl'
with open(model_save_path, 'wb') as f:
    pickle.dump(pipeline, f)

print(f"Pipeline saved at {model_save_path}")

In [None]:
import pandas as pd
import pickle
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Held-out test split: 'title' is the text input, 'label' the integer target.
test_data_path = '/content/drive/MyDrive/D1/test.csv'
test_data = pd.read_csv(test_data_path)
X_test, y_test = test_data['title'], test_data['label'].astype(int)

# Restore the pickled PBT pipeline (vectorizer + Naive Bayes).
model_load_path = '/content/drive/MyDrive/D1/naive/pbt_pipeline.pkl'
with open(model_load_path, 'rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)

# The pipeline vectorises the raw titles itself before classifying them.
y_pred = pipeline.predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {test_accuracy:.2%}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Genetic

In [None]:
!pip install deap

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from deap import base, creator, tools, algorithms
import random
import pickle

# Load the training and validation datasets
train_data_path = '/content/drive/MyDrive/D1/train.csv'
train_data = pd.read_csv(train_data_path)

val_data_path = '/content/drive/MyDrive/D1/val.csv'
val_data = pd.read_csv(val_data_path)

# 'title' is the text feature; 'label' is cast to int for the classifier
X_train = train_data['title']
y_train = train_data['label'].astype(int)
X_val = val_data['title']
y_val = val_data['label'].astype(int)

# Convert text data to TF-IDF features (capped vocabulary for faster computation)
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Search configuration
SEED = 42                         # fixed seed so the evolution is reproducible
ALPHA_MIN, ALPHA_MAX = 1e-3, 1.0  # allowed range for the smoothing parameter
random.seed(SEED)                 # DEAP's operators draw from the `random` module


def _decode(individual):
    """Map a raw [alpha_gene, fit_prior_gene] genome to valid MultinomialNB arguments.

    alpha is clamped into [ALPHA_MIN, ALPHA_MAX]. The fit_prior gene is
    thresholded at 0.5 because it becomes a float once crossover or mutation
    touches it; a plain bool() would treat every non-zero value as True.
    """
    alpha = min(max(float(individual[0]), ALPHA_MIN), ALPHA_MAX)
    fit_prior = bool(individual[1] >= 0.5)
    return alpha, fit_prior


def evaluate(individual):
    """Fitness of one individual: validation accuracy, as the 1-tuple DEAP expects."""
    alpha, fit_prior = _decode(individual)
    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
    model.fit(X_train_tfidf, y_train)
    accuracy = accuracy_score(y_val, model.predict(X_val_tfidf))
    return (accuracy,)


def _mate_within_bounds(ind1, ind2):
    """Blend crossover followed by a repair step that clamps genes to their bounds.

    cxBlend may place children outside the interval spanned by the parents, so
    alpha could otherwise turn negative and make MultinomialNB reject it.
    """
    tools.cxBlend(ind1, ind2, alpha=0.5)  # modifies both individuals in place
    for child in (ind1, ind2):
        child[0] = min(max(child[0], ALPHA_MIN), ALPHA_MAX)
        child[1] = min(max(child[1], 0.0), 1.0)
    return ind1, ind2


# Set up the genetic algorithm. creator.create registers classes on the
# creator module, so guard against re-creating them when the cell is re-run.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_float", random.uniform, ALPHA_MIN, ALPHA_MAX)
toolbox.register("attr_bool", random.randint, 0, 1)
# An individual is [alpha_gene, fit_prior_gene]
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_float, toolbox.attr_bool), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("mate", _mate_within_bounds)
toolbox.register("mutate", tools.mutPolynomialBounded,
                 low=[ALPHA_MIN, 0], up=[ALPHA_MAX, 1], eta=0.1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Genetic Algorithm parameters
population_size = 20
generations = 40
cxpb, mutpb = 0.5, 0.2

# Initialize population
population = toolbox.population(n=population_size)

# eaSimple replaces the whole population each generation, so the best
# individual ever evaluated is tracked separately in a hall of fame.
hall_of_fame = tools.HallOfFame(1)

# Run the Genetic Algorithm
result_population, logbook = algorithms.eaSimple(population, toolbox, cxpb, mutpb, generations,
                                                 stats=None, halloffame=hall_of_fame, verbose=True)

# Best individual seen over the whole run, decoded exactly as evaluate() decodes it
best_individual = hall_of_fame[0]
best_alpha, best_fit_prior = _decode(best_individual)

# Train the best model on the training data
best_model = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
best_model.fit(X_train_tfidf, y_train)

# Bundle vectorizer and classifier so raw text can be passed to predict()
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('nb', best_model)
])

# Refit the whole pipeline on the training data with the best hyperparameters
pipeline.fit(X_train, y_train)

# Evaluate the best model on validation data
y_pred = pipeline.predict(X_val)

# Print best hyperparameters and evaluation metrics
print(f"Best alpha: {best_alpha}")
print(f"Best fit_prior: {best_fit_prior}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Save the pipeline (including vectorizer and best model) using pickle
model_save_path = '/content/drive/MyDrive/D1/naive/genetic_pipeline.pkl'
with open(model_save_path, 'wb') as f:
    pickle.dump(pipeline, f)

print(f"Pipeline saved at {model_save_path}")

In [None]:
# Held-out test split: 'title' is the text input, 'label' the integer target.
test_data_path = '/content/drive/MyDrive/D1/test.csv'
test_data = pd.read_csv(test_data_path)
X_test, y_test = test_data['title'], test_data['label'].astype(int)

# Restore the pickled genetic-search pipeline (vectorizer + Naive Bayes).
model_load_path = '/content/drive/MyDrive/D1/naive/genetic_pipeline.pkl'
with open(model_load_path, 'rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)

# The pipeline vectorises the raw titles itself before classifying them.
y_pred = pipeline.predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {test_accuracy:.2%}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


# Hyperband (implemented here with Hyperopt's TPE search)

In [None]:
!pip install hyperopt

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from hyperopt import hp, tpe, Trials, fmin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pickle

# --- Data ---------------------------------------------------------------------
train_data_path = '/content/drive/MyDrive/D1/train.csv'
val_data_path = '/content/drive/MyDrive/D1/val.csv'
train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)

# 'title' is the text feature; 'label' is cast to int for the classifier.
X_train, y_train = train_data['title'], train_data['label'].astype(int)
X_val, y_val = val_data['title'], val_data['label'].astype(int)

# TF-IDF features are computed once and shared by every trial
# (vocabulary capped at 500 terms for faster computation).
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)


# --- Objective ----------------------------------------------------------------
def evaluate(params):
    """Hyperopt objective: negated validation accuracy of one Naive Bayes setting.

    `params` carries 'alpha' and 'fit_prior'. fmin minimises, so the accuracy
    is negated to turn the search into a maximisation of accuracy.
    """
    candidate = MultinomialNB(alpha=params['alpha'], fit_prior=params['fit_prior'])
    candidate.fit(X_train_tfidf, y_train)
    return -accuracy_score(y_val, candidate.predict(X_val_tfidf))


# --- Search -------------------------------------------------------------------
# alpha is drawn log-uniformly from [1e-3, 1.0]; fit_prior is a two-way choice.
space = dict(
    alpha=hp.loguniform('alpha', np.log(1e-3), np.log(1.0)),
    fit_prior=hp.choice('fit_prior', [True, False]),
)

# 100 TPE-guided trials; `trials` keeps the history of every evaluation.
trials = Trials()
best = fmin(fn=evaluate, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# fmin reports an hp.choice parameter as the index of the selected option, so
# it is mapped back through the same option order used in the search space.
fit_prior_options = (True, False)
best_alpha = best['alpha']
best_fit_prior = fit_prior_options[best['fit_prior']]

# --- Final model --------------------------------------------------------------
best_model = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
best_model.fit(X_train_tfidf, y_train)

# Bundle vectorizer and classifier so raw text can be passed to predict(),
# then refit the whole pipeline on the training data.
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('nb', best_model)])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)

print(f"Best alpha: {best_alpha}")
print(f"Best fit_prior: {best_fit_prior}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")

# --- Persist ------------------------------------------------------------------
model_save_path = '/content/drive/MyDrive/D1/naive/hyperband_pipeline.pkl'
with open(model_save_path, 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

print(f"Pipeline saved at {model_save_path}")

In [None]:
# Held-out test split: 'title' is the text input, 'label' the integer target.
test_data_path = '/content/drive/MyDrive/D1/test.csv'
test_data = pd.read_csv(test_data_path)
X_test, y_test = test_data['title'], test_data['label'].astype(int)

# Restore the pickled pipeline (vectorizer + Naive Bayes) saved by the search above.
model_load_path = '/content/drive/MyDrive/D1/naive/hyperband_pipeline.pkl'
with open(model_load_path, 'rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)

# The pipeline vectorises the raw titles itself before classifying them.
y_pred = pipeline.predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {test_accuracy:.2%}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


# BERT + Grid Search CV (GSCV)

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertModel
import pickle

In [None]:
# Read the held-out test split and separate the text ('title') from the
# integer target ('label').
test_data_path = '/content/drive/MyDrive/D1/test.csv'
test_data = pd.read_csv(test_data_path)

X_test, y_test = test_data['title'], test_data['label'].astype(int)

In [None]:
# Load and prepare BERT model and tokenizer
class BERTClassifier(torch.nn.Module):
    """BERT encoder followed by a small two-layer classification head.

    The head maps the encoder's 768-dimensional pooled output through a
    512-unit hidden layer with ReLU to 2 class scores, which are normalised
    with a softmax over the class dimension.
    """

    def __init__(self, bert_model):
        """Wrap `bert_model` (the encoder) and create the classification head."""
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.bert = bert_model
        self.fc1 = torch.nn.Linear(768, 512)
        self.fc2 = torch.nn.Linear(512, 2)
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Return per-class probabilities of shape (batch, 2) for the given token ids."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = self.relu(self.fc1(encoded.pooler_output))
        return self.softmax(self.fc2(hidden))

# Load BERT model: build the classifier around the pretrained encoder, then
# restore the fine-tuned weights from Drive.
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = BERTClassifier(bert_model)
# map_location='cpu': inference in this notebook feeds CPU tensors and calls
# .numpy() on the outputs, so the weights must be placed on the CPU. Without it
# torch.load tries to restore tensors onto the device they were saved from,
# which fails on a CPU-only runtime for a checkpoint saved from a GPU session.
bert_state_dict = torch.load('/content/drive/MyDrive/D1/BERT/dhruval_state_dict.pth',
                             map_location='cpu')
bert_model.load_state_dict(bert_state_dict)
bert_model.eval()  # inference mode: disables dropout inside the encoder

# Load tokenizer matching the pretrained encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def predict_bert(texts, batch_size=32):
    """Predict a class index for every text with the fine-tuned BERT classifier.

    Uses the module-level `tokenizer` and `bert_model`.

    Args:
        texts: Iterable of strings (pandas Series, list, NumPy array, ...).
        batch_size: Number of texts tokenised and forwarded per pass; must be >= 1.

    Returns:
        NumPy array with one predicted class index per input text, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise once so any iterable can be sliced positionally
    all_texts = list(texts)
    predictions = []
    for start in range(0, len(all_texts), batch_size):
        batch_texts = all_texts[start:start + batch_size]
        # Pad to the longest text in the batch and truncate at 512 tokens
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            class_scores = bert_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Index of the highest score in each row is the predicted class
        predictions.extend(torch.argmax(class_scores, dim=1).numpy())
    return np.array(predictions)

In [None]:
# Restore the grid-searched TF-IDF + Naive Bayes pipeline from its pickle file.
model_save_path = '/content/drive/MyDrive/D1/D1_Naive/gridsearch_pipeline.pkl'
with open(model_save_path, 'rb') as pipeline_file:
    nb_pipeline = pickle.load(pipeline_file)

In [None]:
# Predict the test titles with each model independently.
nb_predictions = nb_pipeline.predict(X_test)
bert_predictions = predict_bert(X_test)

# Combine the two votes per sample. With only two voters a disagreement is a
# 1-1 tie, and bincount(...).argmax() returns the first index holding the
# maximum count, so ties resolve to the smaller class label.
votes = []
for nb_pred, bert_pred in zip(nb_predictions, bert_predictions):
    votes.append(np.bincount([nb_pred, bert_pred]).argmax())
combined_predictions = np.array(votes)

# Report accuracy, per-class metrics and the confusion matrix on the test split.
print(f"Accuracy on test set: {accuracy_score(y_test, combined_predictions):.2%}")
print("Classification Report:", classification_report(y_test, combined_predictions), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, combined_predictions), sep="\n")

# Test with Batch Size

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertModel
import pickle

# Load the Naive Bayes pipeline (TF-IDF + classifier) from its pickle file
model_save_path = '/content/drive/MyDrive/D1/D1_Naive/gridsearch_pipeline.pkl'
with open(model_save_path, 'rb') as f:
    nb_pipeline = pickle.load(f)

# Load the test dataset
test_data_path = '/content/drive/MyDrive/D1/test.csv'
test_data = pd.read_csv(test_data_path)

# Sample a subset from the test data
subset_size = 1000  # Requested size of the subset
# DataFrame.sample raises when asked for more rows than exist (sampling is
# without replacement), so cap the request at the number of available rows.
n_sampled = min(subset_size, len(test_data))
subset_data = test_data.sample(n=n_sampled, random_state=42)  # fixed seed: same subset every run

# Extract features and labels from the subset
X_test = subset_data['title']
y_test = subset_data['label'].astype(int)

# Load and prepare BERT model and tokenizer
class BERTClassifier(torch.nn.Module):
    """Classification head on top of a BERT encoder.

    Pipeline: encoder pooled output (768) -> Linear(768, 512) -> ReLU ->
    Linear(512, 2) -> softmax over the two classes.
    """

    def __init__(self, bert_model):
        """Store the encoder and build the head layers."""
        super().__init__()
        # Attribute names determine the state_dict keys; they must not change.
        self.bert = bert_model
        self.fc1 = torch.nn.Linear(768, 512)
        self.fc2 = torch.nn.Linear(512, 2)
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Run the encoder and the head; returns class probabilities per input row."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        activated = self.relu(self.fc1(pooled))
        class_scores = self.fc2(activated)
        return self.softmax(class_scores)

# Load BERT model: build the classifier around the pretrained encoder, then
# restore the fine-tuned weights from Drive.
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = BERTClassifier(bert_model)
# map_location='cpu': inference in this notebook feeds CPU tensors and calls
# .numpy() on the outputs, so the weights must be placed on the CPU. Without it
# torch.load tries to restore tensors onto the device they were saved from,
# which fails on a CPU-only runtime for a checkpoint saved from a GPU session.
bert_state_dict = torch.load('/content/drive/MyDrive/D1/BERT/dhruval_state_dict.pth',
                             map_location='cpu')
bert_model.load_state_dict(bert_state_dict)
bert_model.eval()  # inference mode: disables dropout inside the encoder

# Load tokenizer matching the pretrained encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define prediction functions
def predict_bert(texts, batch_size=32):
    """Predict a class index for every text with the fine-tuned BERT classifier.

    Texts are tokenised and forwarded in batches so that memory use is bounded
    by `batch_size` rather than by the total number of texts. Uses the
    module-level `tokenizer` and `bert_model`.

    Args:
        texts: Iterable of strings (pandas Series, list, NumPy array, ...).
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        NumPy integer array with one predicted class index per input text, in
        input order (empty when `texts` is empty).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_texts = list(texts)  # materialise once so any iterable can be sliced
    batch_predictions = []
    for start in range(0, len(all_texts), batch_size):
        batch_texts = all_texts[start:start + batch_size]
        # Pad to the longest text in the batch and truncate at 512 tokens
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            class_scores = bert_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Index of the highest score in each row is the predicted class
        batch_predictions.append(torch.argmax(class_scores, dim=1))

    if not batch_predictions:
        return np.array([], dtype=np.int64)
    return torch.cat(batch_predictions).numpy()

def predict_naive_bayes(texts):
    """Return the loaded Naive Bayes pipeline's predicted labels for `texts`."""
    nb_labels = nb_pipeline.predict(texts)
    return nb_labels

# Predict the sampled test titles with each model independently.
bert_predictions = predict_bert(X_test)
nb_predictions = predict_naive_bayes(X_test)

# Combine the two votes per sample. With only two voters a disagreement is a
# 1-1 tie, and bincount(...).argmax() returns the first index holding the
# maximum count, so ties resolve to the smaller class label.
votes = []
for nb_pred, bert_pred in zip(nb_predictions, bert_predictions):
    votes.append(np.bincount([nb_pred, bert_pred]).argmax())
combined_predictions = np.array(votes)

# Accuracy of the combined predictions on the sampled subset
accuracy = accuracy_score(y_test, combined_predictions)
print(f"Combined Accuracy: {accuracy:.2%}")

# Per-class metrics and confusion matrix
print("Classification Report:", classification_report(y_test, combined_predictions), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, combined_predictions), sep="\n")