In [None]:
# Mount Google Drive so the '/content/drive/MyDrive/...' paths used below resolve.
# Only applicable when running on Google Colab (`google.colab` is a Colab-provided module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Grid Search CV

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle

# NOTE: this section reads the *_data_split.csv files, not the dataset/{train,val,test}.csv
# files used by the later sections, so its scores are not directly comparable with theirs.

# Load the training dataset
train_data_path = '/content/drive/MyDrive/D3/train_data_split.csv'
train_data = pd.read_csv(train_data_path)

# Load the validation dataset
val_data_path = '/content/drive/MyDrive/D3/val_data_split.csv'
val_data = pd.read_csv(val_data_path)

# Extract features and labels from training data
X_train = train_data['text']  # The 'text' column holds the raw documents
y_train = train_data['label'].astype(int)

# Extract features and labels from validation data
X_val = val_data['text']  # The 'text' column holds the raw documents
y_val = val_data['label'].astype(int)

# Define the pipeline with TfidfVectorizer and MultinomialNB.
# Keeping the vectorizer inside the pipeline means it is refit on the training part
# of every CV fold, so held-out folds never influence the vocabulary or IDF weights.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=500)),  # Reduce number of features for faster computation
    ('nb', MultinomialNB())
])

# Define the parameter grid for GridSearchCV (3 x 2 = 6 candidates).
# The 'nb__' prefix routes each parameter to the pipeline step named 'nb'.
param_grid = {
    'nb__alpha': [0.1, 0.5, 1.0],  # Smoothing parameter
    'nb__fit_prior': [True, False]  # Whether to learn class prior probabilities or not
}

# Perform GridSearchCV with 3-fold cross-validation on training data.
# No `scoring` is passed, so candidates are ranked by the pipeline's own score method.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and evaluate it on validation data.
# best_estimator_ is the winning configuration refit on all of X_train
# (GridSearchCV's default refit=True); the validation set was not used in the search.
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_val)

# Print evaluation metrics
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2%}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Save the best pipeline (vectorizer + classifier) using pickle
model_save_path = '/content/drive/MyDrive/D3/naive/gridsearch_pipeline.pkl'
with open(model_save_path, 'wb') as f:
    pickle.dump(best_pipeline, f)

print(f"Model saved at {model_save_path}")

# Load the test dataset
test_data_path = '/content/drive/MyDrive/D3/test_data_split.csv'
test_data = pd.read_csv(test_data_path)

# Extract features and labels from test data
X_test = test_data['text']  # The 'text' column holds the raw documents
y_test = test_data['label'].astype(int)

# Load the saved pipeline back from disk (round-trip check of the pickle written above).
# pickle.load executes arbitrary code: only load files this notebook itself produced.
with open(model_save_path, 'rb') as f:
    loaded_pipeline = pickle.load(f)

# Make predictions on the test data
y_test_pred = loaded_pipeline.predict(X_test)

# Print evaluation metrics for the test set
print(f"Accuracy on test set: {accuracy_score(y_test, y_test_pred):.2%}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters: {'nb__alpha': 0.1, 'nb__fit_prior': True}
Best cross-validation score: 87.20%
Accuracy on validation set: 86.83%
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1189
           1       0.89      0.84      0.86      1188

    accuracy                           0.87      2377
   macro avg       0.87      0.87      0.87      2377
weighted avg       0.87      0.87      0.87      2377

Confusion Matrix:
[[1069  120]
 [ 193  995]]
Model saved at /content/drive/MyDrive/D3/naive/gridsearch_pipeline.pkl
Accuracy on test set: 87.59%
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1189
           1       0.90      0.84      0.87      1189

    accuracy                           0.88      2378
   macro avg       0.88      0.88      0.88      2378
weighted avg       0

# Bayesian Optimization

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.2


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from skopt import BayesSearchCV
from skopt.space import Real, Categorical
import pickle

# Paths to data files (train/val/test splits under dataset/, plus where the tuned model is saved)
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'
model_save_path = '/content/drive/MyDrive/D3/naive/bayes_pipeline.pkl'

# Load data
train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)

# Extract features and labels from training data ('text' is the document, 'label' the class)
X_train = train_data['text']
y_train = train_data['label'].astype(int)

# Extract features and labels from validation data
X_val = val_data['text']
y_val = val_data['label'].astype(int)

# Extract features and labels from test data
X_test = test_data['text']
y_test = test_data['label'].astype(int)

# Define the pipeline with TF-IDF vectorizer and Naive Bayes model.
# The vectorizer lives inside the pipeline so it is refit on each CV training fold.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=500)),
    ('nb', MultinomialNB())
])

# Define the parameter space for Bayesian Optimization.
# alpha is sampled log-uniformly because useful smoothing values span orders of magnitude.
param_space = {
    'nb__alpha': Real(1e-3, 1.0, prior='log-uniform'),
    'nb__fit_prior': Categorical([True, False])
}

# Perform Bayesian Optimization with cross-validation on training data.
# random_state fixes the optimizer's sampling so the 20 evaluated points (and hence
# the reported best parameters) are reproducible across runs.
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=3,
    n_iter=20,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit the Bayesian Optimization
bayes_search.fit(X_train, y_train)

# Get the best model and evaluate it on validation data.
# The validation set plays no part in the search itself (that uses 3-fold CV on the training data).
best_pipeline = bayes_search.best_estimator_
y_pred_val = best_pipeline.predict(X_val)

# Print best parameters and evaluation metrics on validation set
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best cross-validation score: {bayes_search.best_score_:.2%}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred_val):.2%}")
print("Classification Report (Validation Set):")
print(classification_report(y_val, y_pred_val))
print("Confusion Matrix (Validation Set):")
print(confusion_matrix(y_val, y_pred_val))

# Save the best model (full pipeline: vectorizer + classifier) using pickle
with open(model_save_path, 'wb') as f:
    pickle.dump(best_pipeline, f)
print(f"Model saved at {model_save_path}")

# Evaluate the saved model on the test set
print("\nEvaluating on Test Set:")

# Load the trained model back from disk (round-trip check of the pickle written above).
# pickle.load executes arbitrary code: only load files this notebook itself produced.
with open(model_save_path, 'rb') as f:
    trained_model = pickle.load(f)

# Predict using the loaded model
y_pred_test = trained_model.predict(X_test)

# Evaluate performance on test set
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred_test):.2%}")
print("Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_pred_test))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

# Population-Based Training (PBT)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.base import clone
from sklearn.pipeline import Pipeline
import pickle

# Load the training dataset
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
train_data = pd.read_csv(train_data_path)

# Load the validation dataset
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
val_data = pd.read_csv(val_data_path)

# Extract features and labels from training data
X_train = train_data['text']  # Assuming 'text' is the column containing text data
y_train = train_data['label'].astype(int)

# Extract features and labels from validation data
X_val = val_data['text']  # Assuming 'text' is the column containing text data
y_val = val_data['label'].astype(int)

# Convert text data to TF-IDF features.
# The vectorizer is fitted on the training text only; the validation text is
# transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=500)  # Reduce number of features for faster computation
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Initialize population
population_size = 10
population = []

# Seed NumPy's global RNG so the random initial hyperparameters (and any later
# draws from the same global stream) are reproducible across runs.
np.random.seed(42)

# Generate initial population with random hyperparameters.
# Each member is a (fitted model, alpha, fit_prior) tuple.
for _ in range(population_size):
    alpha = np.random.uniform(1e-3, 1.0)
    # np.random.choice returns numpy.bool_; store a plain Python bool instead
    fit_prior = bool(np.random.choice([True, False]))
    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
    model.fit(X_train_tfidf, y_train)
    population.append((model, alpha, fit_prior))

# Define number of iterations for PBT
iterations = 10

# Perform PBT: every round keeps the better-scoring half of the population and
# replaces the rest with perturbed, retrained copies of the survivors.
for iteration in range(iterations):
    # Evaluate each model in the population on the validation set
    scores = [
        (accuracy_score(y_val, model.predict(X_val_tfidf)), model, alpha, fit_prior)
        for model, alpha, fit_prior in population
    ]

    # Sort population based on score (best first). The key compares scores only,
    # because the model objects themselves are not orderable.
    scores.sort(reverse=True, key=lambda x: x[0])
    n_keep = max(1, len(scores) // 2)
    top_half = scores[:n_keep]
    n_replace = len(scores) - n_keep

    # Rebuild the population from the ranked list rather than overwriting fixed
    # index positions: the order of `population` is unrelated to score, so writing
    # to slots [size//2:] could discard top performers and keep poor ones.
    next_population = [(model, alpha, fit_prior) for _, model, alpha, fit_prior in top_half]

    for i in range(n_replace):
        # Cycle through the survivors so an odd population size cannot index past top_half
        _, top_model, top_alpha, top_fit_prior = top_half[i % n_keep]

        # Perturb the survivor's hyperparameters: scale alpha by +/-20% (kept within
        # [1e-3, 1.0]) and, with probability 0.5, redraw fit_prior at random.
        new_alpha = float(np.clip(top_alpha * np.random.uniform(0.8, 1.2), 1e-3, 1.0))
        new_fit_prior = bool(np.random.choice([True, False]) if np.random.rand() < 0.5 else top_fit_prior)

        # clone() copies hyperparameters only, so the new member is trained from scratch
        new_model = clone(top_model)
        new_model.set_params(alpha=new_alpha, fit_prior=new_fit_prior)
        new_model.fit(X_train_tfidf, y_train)

        next_population.append((new_model, new_alpha, new_fit_prior))

    population = next_population

# Select the best model from the final population (highest validation accuracy)
best_model, best_alpha, best_fit_prior = max(population, key=lambda x: accuracy_score(y_val, x[0].predict(X_val_tfidf)))

# Define a pipeline with vectorizer and best model so raw text can be passed directly
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('nb', best_model)
])

# Fit the pipeline on the entire training data with best hyperparameters.
# This refits both the vectorizer and best_model (its alpha/fit_prior are unchanged).
pipeline.fit(X_train, y_train)

# Evaluate the best model on validation data.
# NOTE: the same validation set was used to select the hyperparameters, so this
# figure is optimistic; the held-out test set is the unbiased estimate.
y_pred = pipeline.predict(X_val)

# Print best hyperparameters and evaluation metrics
print(f"Best alpha: {best_alpha}")
print(f"Best fit_prior: {best_fit_prior}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Save the pipeline (including vectorizer and best model) using pickle
model_save_path = '/content/drive/MyDrive/D3/naive/pbt_pipeline.pkl'
with open(model_save_path, 'wb') as f:
    pickle.dump(pipeline, f)

print(f"Pipeline saved at {model_save_path}")

Best alpha: 0.8439575710303715
Best fit_prior: False
Accuracy on validation set: 84.59%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       850
           1       0.91      0.82      0.86      1168

    accuracy                           0.85      2018
   macro avg       0.84      0.85      0.84      2018
weighted avg       0.85      0.85      0.85      2018

Confusion Matrix:
[[753  97]
 [214 954]]
Pipeline saved at /content/drive/MyDrive/D3/naive/pbt_pipeline.pkl


In [None]:
import pandas as pd
import pickle
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the test dataset
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'
test_data = pd.read_csv(test_data_path)

# Assuming 'text' is the column containing text data in the test dataset
X_test = test_data['text']
y_test = test_data['label'].astype(int)

# Load the saved PBT pipeline (vectorizer + classifier).
# pickle.load executes arbitrary code: only load files this notebook itself produced.
model_load_path = '/content/drive/MyDrive/D3/naive/pbt_pipeline.pkl'
with open(model_load_path, 'rb') as f:
    pipeline = pickle.load(f)

# Make predictions using the pipeline (it vectorizes the raw text itself)
y_pred = pipeline.predict(X_test)

# Evaluate the predictions
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy on test set: 85.33%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       822
           1       0.92      0.83      0.87      1196

    accuracy                           0.85      2018
   macro avg       0.85      0.86      0.85      2018
weighted avg       0.86      0.85      0.85      2018

Confusion Matrix:
[[734  88]
 [208 988]]


# Genetic Algorithm

In [None]:
!pip install deap

Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deap
Successfully installed deap-1.4.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from deap import base, creator, tools, algorithms
import random
import pickle

# Load the training dataset
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
train_data = pd.read_csv(train_data_path)

# Load the validation dataset
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
val_data = pd.read_csv(val_data_path)

# Extract features and labels from training data
X_train = train_data['text']  # Assuming 'text' is the column containing text data
y_train = train_data['label'].astype(int)  # Assuming 'label' is the column containing labels

# Extract features and labels from validation data
X_val = val_data['text']  # Assuming 'text' is the column containing text data
y_val = val_data['label'].astype(int)  # Assuming 'label' is the column containing labels

# Convert text data to TF-IDF features.
# The vectorizer is fitted on the training text only; the validation text is
# transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=500)  # Reduce number of features for faster computation
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Fitness function used by the genetic algorithm
def evaluate(individual):
    """Score one individual by the validation accuracy of the model it encodes.

    individual[0] is used as the MultinomialNB smoothing `alpha`; individual[1]
    is passed through bool() to obtain `fit_prior`. The model is trained on the
    module-level TF-IDF training matrix and scored on the validation matrix.

    Returns a one-element tuple (accuracy,), the tuple shape of a
    single-objective fitness.
    """
    smoothing = individual[0]
    use_prior = bool(individual[1])
    classifier = MultinomialNB(alpha=smoothing, fit_prior=use_prior).fit(X_train_tfidf, y_train)
    predictions = classifier.predict(X_val_tfidf)
    return (accuracy_score(y_val, predictions),)

# Set up the genetic algorithm.
# FitnessMax: single objective with weight +1.0, i.e. higher accuracy is better.
# Individual: a plain list of genes carrying a FitnessMax fitness attribute.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Gene 0: alpha drawn uniformly from [1e-3, 1.0]; gene 1: fit_prior encoded as 0 or 1
toolbox.register("attr_float", random.uniform, 1e-3, 1.0)
toolbox.register("attr_bool", random.randint, 0, 1)
# initCycle with n=1 calls each generator once, giving a 2-gene individual [alpha, fit_prior]
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_float, toolbox.attr_bool), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Custom crossover. tools.cxBlend blends *every* gene, which would turn the 0/1
# fit_prior gene into an arbitrary float (bool() of any non-zero value is True) and
# can push alpha outside [1e-3, 1.0], even negative, which MultinomialNB rejects.
def custom_mate(ind1, ind2, blend=0.5):
    """Cross two [alpha, fit_prior] individuals in place and return them.

    Gene 0 (alpha) is blended with the same formula as a blend crossover, using
    a mixing factor drawn from [-blend, 1 + blend], then clamped to [1e-3, 1.0].
    Gene 1 (fit_prior flag) is swapped between the parents with probability 0.5,
    so it always remains one of the parents' 0/1 values.
    """
    gamma = (1.0 + 2.0 * blend) * random.random() - blend
    alpha1, alpha2 = ind1[0], ind2[0]
    ind1[0] = max(1e-3, min(1.0, (1.0 - gamma) * alpha1 + gamma * alpha2))
    ind2[0] = max(1e-3, min(1.0, gamma * alpha1 + (1.0 - gamma) * alpha2))
    if random.random() < 0.5:
        ind1[1], ind2[1] = ind2[1], ind1[1]
    return ind1, ind2

toolbox.register("mate", custom_mate, blend=0.5)

# Custom mutation function to ensure alpha stays within the valid range
def custom_mutate(individual):
    """Mutate an individual in place and return it as a 1-tuple.

    alpha is shifted by a uniform step in [-0.05, 0.05] and clamped to
    [1e-3, 1.0]; the fit_prior flag is redrawn as 0 or 1.
    """
    individual[0] = max(1e-3, min(1.0, individual[0] + random.uniform(-0.05, 0.05)))
    individual[1] = random.randint(0, 1)
    return individual,

toolbox.register("mutate", custom_mutate)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Genetic Algorithm parameters
population_size = 20
generations = 40
cxpb, mutpb = 0.5, 0.2  # crossover / mutation probabilities

# Seed Python's RNG (used for the initial genes, selection, crossover and mutation)
# so the run is reproducible.
random.seed(42)

# Initialize population
population = toolbox.population(n=population_size)

# eaSimple replaces the whole population every generation (no elitism), so the best
# individual ever evaluated can be lost by the final generation. The hall of fame
# keeps a copy of it.
hall_of_fame = tools.HallOfFame(1)

# Run the Genetic Algorithm
result_population, logbook = algorithms.eaSimple(population, toolbox, cxpb, mutpb, generations,
                                                 stats=None, halloffame=hall_of_fame, verbose=True)

# Select the best individual seen across all generations
best_individual = hall_of_fame[0]
best_alpha = best_individual[0]
best_fit_prior = bool(best_individual[1])

# Train the best model on the training data
best_model = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
best_model.fit(X_train_tfidf, y_train)

# Define a pipeline with vectorizer and best model so raw text can be passed directly
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('nb', best_model)
])

# Fit the pipeline on the entire training data with best hyperparameters.
# This refits both the vectorizer and best_model, so the fit above is repeated here.
pipeline.fit(X_train, y_train)

# Evaluate the best model on validation data.
# NOTE: the same validation set drove the hyperparameter search, so this figure
# is optimistic; the held-out test set is the unbiased estimate.
y_pred = pipeline.predict(X_val)

# Print best hyperparameters and evaluation metrics
print(f"Best alpha: {best_alpha}")
print(f"Best fit_prior: {best_fit_prior}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Save the pipeline (including vectorizer and best model) using pickle
model_save_path = '/content/drive/MyDrive/D3/naive/genetic_pipeline.pkl'
with open(model_save_path, 'wb') as f:
    pickle.dump(pipeline, f)

print(f"Pipeline saved at {model_save_path}")



gen	nevals
0  	20    
1  	7     
2  	7     
3  	9     
4  	8     
5  	15    
6  	14    
7  	11    
8  	11    
9  	10    
10 	14    
11 	11    
12 	16    
13 	14    
14 	15    
15 	13    
16 	12    
17 	10    
18 	17    
19 	13    
20 	14    
21 	14    
22 	15    
23 	12    
24 	17    
25 	10    
26 	11    
27 	10    
28 	15    
29 	14    
30 	10    
31 	17    
32 	12    
33 	14    
34 	16    
35 	16    
36 	14    
37 	14    
38 	15    
39 	13    
40 	13    
Best alpha: 0.5631399429766312
Best fit_prior: False
Accuracy on validation set: 84.59%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       850
           1       0.91      0.82      0.86      1168

    accuracy                           0.85      2018
   macro avg       0.84      0.85      0.84      2018
weighted avg       0.85      0.85      0.85      2018

Confusion Matrix:
[[752  98]
 [213 955]]
Pipeline saved at /content/drive/MyDrive/D3/naive/genetic_p

In [None]:
# NOTE: this cell has no imports of its own; it relies on pd, pickle and the
# sklearn.metrics functions imported by an earlier cell in the same kernel.

# Load the test dataset
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'
test_data = pd.read_csv(test_data_path)

# Assuming 'text' is the column containing text data in the test dataset
X_test = test_data['text']
y_test = test_data['label'].astype(int)

# Load the saved genetic-algorithm pipeline (vectorizer + classifier).
# pickle.load executes arbitrary code: only load files this notebook itself produced.
model_load_path = '/content/drive/MyDrive/D3/naive/genetic_pipeline.pkl'
with open(model_load_path, 'rb') as f:
    pipeline = pickle.load(f)

# Make predictions using the pipeline (it vectorizes the raw text itself)
y_pred = pipeline.predict(X_test)

# Evaluate the predictions
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy on test set: 85.28%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       822
           1       0.92      0.83      0.87      1196

    accuracy                           0.85      2018
   macro avg       0.85      0.86      0.85      2018
weighted avg       0.86      0.85      0.85      2018

Confusion Matrix:
[[733  89]
 [208 988]]


# Hyperopt (TPE) — labelled "Hyperband" in the saved file names

In [None]:
!pip install hyperopt



In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from hyperopt import hp, tpe, Trials, fmin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pickle

# Load the training dataset
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
train_data = pd.read_csv(train_data_path)

# Load the validation dataset
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
val_data = pd.read_csv(val_data_path)

# Extract features and labels from training data
X_train = train_data['text']  # Assuming 'text' is the column containing text data
y_train = train_data['label'].astype(int)  # Assuming 'label' is the column containing labels

# Extract features and labels from validation data
X_val = val_data['text']  # Assuming 'text' is the column containing text data
y_val = val_data['label'].astype(int)  # Assuming 'label' is the column containing labels

# Convert text data to TF-IDF features.
# The vectorizer is fitted on the training text only; the validation text is
# transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=500)  # Reduce number of features for faster computation
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Objective function handed to hyperopt
def evaluate(params):
    """Return the negated validation accuracy for one hyperparameter sample.

    `params` must provide 'alpha' and 'fit_prior'. A MultinomialNB with those
    settings is trained on the module-level TF-IDF training matrix and scored
    on the validation matrix.
    """
    classifier = MultinomialNB(alpha=params['alpha'], fit_prior=params['fit_prior'])
    classifier.fit(X_train_tfidf, y_train)
    val_accuracy = accuracy_score(y_val, classifier.predict(X_val_tfidf))
    # The result is used as a loss to minimise, so accuracy is negated
    return -val_accuracy

# Define the search space.
# alpha is sampled log-uniformly over [1e-3, 1.0]; fit_prior is a two-way choice.
space = {
    'alpha': hp.loguniform('alpha', np.log(1e-3), np.log(1.0)),  # Smoothing parameter
    'fit_prior': hp.choice('fit_prior', [True, False])  # Whether to learn class prior probabilities or not
}

# Perform hyperparameter optimization with Hyperopt.
# The algorithm is TPE (tpe.suggest), not Hyperband: every trial trains on the full training set.
trials = Trials()
best = fmin(fn=evaluate,
            space=space,
            algo=tpe.suggest,
            max_evals=100,  # Number of trials
            trials=trials)

# Get the best hyperparameters.
# For hp.choice, fmin reports the *index* of the chosen option, so the list used
# here must stay identical to the one given to hp.choice in `space`.
best_alpha = best['alpha']
best_fit_prior = [True, False][best['fit_prior']]

# Train the best model on the full training data with the best hyperparameters
best_model = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
best_model.fit(X_train_tfidf, y_train)

# Define a pipeline with vectorizer and best model so raw text can be passed directly
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('nb', best_model)
])

# Fit the pipeline on the entire training data with best hyperparameters.
# This refits both the vectorizer and best_model, so the fit above is repeated here.
pipeline.fit(X_train, y_train)

# Evaluate the best model on validation data.
# NOTE: the same validation set drove the hyperparameter search, so this figure
# is optimistic; the held-out test set is the unbiased estimate.
y_pred = pipeline.predict(X_val)

# Print best hyperparameters and evaluation metrics
print(f"Best alpha: {best_alpha}")
print(f"Best fit_prior: {best_fit_prior}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Save the pipeline (including vectorizer and best model) using pickle.
# The file name says "hyperband", but the search above is hyperopt's TPE.
model_save_path = '/content/drive/MyDrive/D3/naive/hyperband_pipeline.pkl'
with open(model_save_path, 'wb') as f:
    pickle.dump(pipeline, f)

print(f"Pipeline saved at {model_save_path}")

100%|██████████| 100/100 [00:06<00:00, 16.00trial/s, best loss: -0.8458870168483648]
Best alpha: 0.32537259066901225
Best fit_prior: False
Accuracy on validation set: 84.59%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       850
           1       0.91      0.82      0.86      1168

    accuracy                           0.85      2018
   macro avg       0.84      0.85      0.84      2018
weighted avg       0.85      0.85      0.85      2018

Confusion Matrix:
[[752  98]
 [213 955]]
Pipeline saved at /content/drive/MyDrive/D3/naive/hyperband_pipeline.pkl


In [None]:
# NOTE: this cell has no imports of its own; it relies on pd, pickle and the
# sklearn.metrics functions imported by an earlier cell in the same kernel.

# Load the test dataset
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'
test_data = pd.read_csv(test_data_path)

# Assuming 'text' is the column containing text data in the test dataset
X_test = test_data['text']
y_test = test_data['label'].astype(int)

# Load the saved pipeline (vectorizer + classifier) written by the hyperopt search.
# pickle.load executes arbitrary code: only load files this notebook itself produced.
model_load_path = '/content/drive/MyDrive/D3/naive/hyperband_pipeline.pkl'
with open(model_load_path, 'rb') as f:
    pipeline = pickle.load(f)

# Make predictions using the pipeline (it vectorizes the raw text itself)
y_pred = pipeline.predict(X_test)

# Evaluate the predictions
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy on test set: 85.28%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       822
           1       0.92      0.83      0.87      1196

    accuracy                           0.85      2018
   macro avg       0.85      0.86      0.85      2018
weighted avg       0.86      0.85      0.85      2018

Confusion Matrix:
[[733  89]
 [208 988]]


# ydata profiling


In [None]:
# Install ydata_profiling if not already installed
!pip install ydata-profiling

import pandas as pd
from ydata_profiling import ProfileReport

Collecting ydata-profiling
  Downloading ydata_profiling-4.9.0-py2.py3-none-any.whl (356 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m356.2/356.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting visions[type_image_path]<0.7.7,>=0.7.5 (from ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.8/104.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (686 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m686.1/686.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl (10 kB)
Collecti

In [None]:

# Load the CSV files for profiling (the train/val/test splits under dataset/)
train_df = pd.read_csv('/content/drive/MyDrive/D3/dataset/train.csv')
val_df = pd.read_csv('/content/drive/MyDrive/D3/dataset/val.csv')
test_df = pd.read_csv('/content/drive/MyDrive/D3/dataset/test.csv')

# Each split gets its own standalone HTML report, written next to the CSV files.

# Perform ydata profiling on train dataset
train_profile = ProfileReport(train_df, title="Train Dataset Profiling Report")
train_profile.to_file("/content/drive/MyDrive/D3/dataset/train_profile_report.html")

# Perform ydata profiling on validation dataset
val_profile = ProfileReport(val_df, title="Validation Dataset Profiling Report")
val_profile.to_file("/content/drive/MyDrive/D3/dataset/val_profile_report.html")

# Perform ydata profiling on test dataset
test_profile = ProfileReport(test_df, title="Test Dataset Profiling Report")
test_profile.to_file("/content/drive/MyDrive/D3/dataset/test_profile_report.html")



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = '/content/drive/MyDrive/D3/data.csv'
df = pd.read_csv(file_path)

# Number of rows before removing duplicate rows
rows_before = df.shape[0]

# Remove duplicate rows (rows identical across every column)
df = df.drop_duplicates()

# Number of rows after removing duplicate rows
rows_after = df.shape[0]

# Split the dataset into train (70%) and temp (30%).
# Stratifying on 'label' keeps the class ratio the same in every split, so
# validation and test metrics are measured on comparable class mixes.
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42, stratify=df['label'])

# Split the temp dataset into validation (15%) and test (15%), again stratified
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42, stratify=temp_df['label'])

# Save the datasets to CSV files (these are the files the modelling sections read)
train_df.to_csv('/content/drive/MyDrive/D3/dataset/train.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/D3/dataset/val.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/D3/dataset/test.csv', index=False)

# Display the number of rows before and after removing duplicate rows
print(f'Number of rows before removing duplicates: {rows_before}')
print(f'Number of rows after removing duplicates: {rows_after}')

# Display the shapes of the datasets to verify the splits
print(f'Train set: {train_df.shape}')
print(f'Validation set: {val_df.shape}')
print(f'Test set: {test_df.shape}')

Number of rows before removing duplicates: 15848
Number of rows after removing duplicates: 13452
Train set: (9416, 5)
Validation set: (2018, 5)
Test set: (2018, 5)


In [None]:
train_df.head()

Unnamed: 0,id,title,author,text,label
1155,10724,Trumpâs FIRST Order: Anyone Burning An Ameri...,Martin Walsh,"\nPosted by Martin Walsh | Nov 11, 2016 | Libe...",1
15028,7200,Global Migration Meets Magic in Mohsin Hamidâ...,Alexandra Alter,"In an unnamed, city in the Muslim world, two...",0
6455,19242,WATCH: Gingrich Accuses Megyn Kelly Of Being â...,Davis,Hillary Howls in Laughter About Radical Muslim...,1
7917,11695,Badass Patriot Has MASSIVE Surprise For Thieve...,Amanda Shea,Badass Patriot Has MASSIVE Surprise For Thieve...,1
1728,2205,James Wesley Rawles: âDouble Up On Your Prep...,Mac Slavo,"\nEnjoy your turkey, family events and holiday...",1


In [None]:
import pandas as pd

# Locations of the three split files on Drive
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'

# Read every split in one pass, keeping the train / val / test order
train_data, val_data, test_data = (
    pd.read_csv(split_path)
    for split_path in (train_data_path, val_data_path, test_data_path)
)

# Report each split's size so the combined total can be checked against them
print(f"Number of rows in train dataset: {len(train_data)}")
print(f"Number of rows in validation dataset: {len(val_data)}")
print(f"Number of rows in test dataset: {len(test_data)}")

# Stack the three splits on top of each other under a fresh 0..n-1 index
combined_data = pd.concat(
    [train_data, val_data, test_data],
    ignore_index=True,
)
print(f"Number of rows in combined dataset: {len(combined_data)}")

# Persist the merged frame without writing the row index as a column
combined_data_path = '/content/drive/MyDrive/D3/combined_data.csv'
combined_data.to_csv(combined_data_path, index=False)

print(f"Combined dataset saved at {combined_data_path}")

Number of rows in train dataset: 9416
Number of rows in validation dataset: 2018
Number of rows in test dataset: 2018
Number of rows in combined dataset: 13452
Combined dataset saved at /content/drive/MyDrive/D3/combined_data.csv


# BERT + Naive Bayes (GridSearchCV) Ensemble

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertModel
import pickle

# Read the test split from Drive
test_data_path = '/content/drive/MyDrive/D3/test_data_split.csv'
test_data = pd.read_csv(test_data_path)

# Model inputs come from the 'text' column; targets are the 'label' column cast to int
X_test, y_test = test_data['text'], test_data['label'].astype(int)

# Two-class classification head stacked on a BERT encoder
class BERTClassifier(torch.nn.Module):
    """BERT encoder followed by a linear layer and a softmax over two classes.

    Args:
        bert_model: Encoder module called with ``input_ids`` and
            ``attention_mask`` whose result exposes ``pooler_output``. The
            linear layer expects 768 features per example.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # Adjust according to your architecture
        self.classifier = torch.nn.Linear(in_features=768, out_features=2)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Return per-class probabilities (softmax output), not raw logits."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.softmax(self.classifier(encoded.pooler_output))

# Load the pretrained BERT encoder and its matching tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Instantiate the classifier
bert_classifier = BERTClassifier(bert_model)

# Load the state dictionary.
# map_location='cpu' lets a checkpoint that was saved from a GPU session load
# on a CPU-only runtime; the model is never moved to another device in this
# cell, so inference runs on CPU either way.
state_dict = torch.load('/content/drive/MyDrive/D3/bert_model.pth', map_location='cpu')

# Rename keys if needed (optional)
# state_dict['classifier.weight'] = state_dict.pop('fc1.weight')
# state_dict['classifier.bias'] = state_dict.pop('fc1.bias')

# Load the state dictionary into the model
bert_classifier.load_state_dict(state_dict)

# Set the model to evaluation mode (disables dropout for inference)
bert_classifier.eval()

# Function to make predictions with BERT
def predict_bert(texts, batch_size=32, return_proba=False):
    """Run the BERT classifier over ``texts`` in batches.

    Uses the module-level ``tokenizer`` and ``bert_classifier``.

    Args:
        texts: Iterable of strings (a pandas Series, list, ...).
        batch_size: Number of texts tokenized and scored per forward pass.
        return_proba: When True, return the classifier's per-class outputs
            instead of class indices.

    Returns:
        A 1-D array of predicted class indices, or an ``(n_texts, n_classes)``
        array of the classifier's outputs when ``return_proba`` is True.
    """
    texts = list(texts)  # positional slicing below, whatever the input container
    batch_outputs = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = bert_classifier(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # .cpu() keeps this working if the model is ever moved to a GPU
        batch_outputs.append(outputs.cpu().numpy())

    if not batch_outputs:
        # Nothing to score: mirror the empty result shapes of the normal path
        return np.empty((0, 2)) if return_proba else np.array([])

    probabilities = np.concatenate(batch_outputs, axis=0)
    if return_proba:
        return probabilities
    return probabilities.argmax(axis=1)

# Load the saved Naive Bayes pipeline
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only unpickle files you produced yourself.
model_save_path = '/content/drive/MyDrive/D3/naive/gridsearch_pipeline.pkl'
with open(model_save_path, 'rb') as f:
    nb_pipeline = pickle.load(f)

# Get class probabilities from both models
nb_probabilities = nb_pipeline.predict_proba(X_test)
bert_probabilities = predict_bert(X_test, return_proba=True)

# Hard per-model predictions, kept for inspection
nb_predictions = nb_pipeline.classes_[nb_probabilities.argmax(axis=1)]
bert_predictions = bert_probabilities.argmax(axis=1)

# Combine the models with soft voting (mean of the class probabilities).
# Hard majority voting cannot work with only two voters: every disagreement
# is a 1-1 tie, and np.bincount(...).argmax() resolves ties to class 0, so
# the ensemble could only predict 1 when both models did.
# Assumes BERT output column i corresponds to nb_pipeline.classes_[i]
# (labels 0 and 1) — TODO confirm against the training code.
mean_probabilities = (nb_probabilities + bert_probabilities) / 2.0
combined_predictions = nb_pipeline.classes_[mean_probabilities.argmax(axis=1)]

# Print evaluation metrics for the test set
print(f"Accuracy on test set: {accuracy_score(y_test, combined_predictions):.2%}")
print("Classification Report:")
print(classification_report(y_test, combined_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, combined_predictions))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Accuracy on test set: 50.46%
Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67      1189
           1       1.00      0.01      0.02      1189

    accuracy                           0.50      2378
   macro avg       0.75      0.50      0.34      2378
weighted avg       0.75      0.50      0.34      2378

Confusion Matrix:
[[1189    0]
 [1178   11]]


# Ensemble Test on a Sampled Subset with Batched Inference

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertModel
import pickle
import gc

# Memory housekeeping helper used between inference steps
def clear_memory():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory."""
    # Order matters: collect unreachable objects first so their memory can be released
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

# Load the Naive Bayes pipeline
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only unpickle files you produced yourself.
model_save_path = '/content/drive/MyDrive/D3/naive/gridsearch_pipeline.pkl'
with open(model_save_path, 'rb') as f:
    nb_pipeline = pickle.load(f)

# Load the test dataset
test_data_path = '/content/drive/MyDrive/D3/test_data_split.csv'
test_data = pd.read_csv(test_data_path)

# Sample a subset from the test data
subset_size = 1000  # Define the size of the subset
# sample() raises ValueError when n exceeds the number of rows (sampling is
# without replacement), so cap the request at the size of the test set.
subset_data = test_data.sample(n=min(subset_size, len(test_data)), random_state=42)

# Extract features and labels from the subset
X_test = subset_data['text']
y_test = subset_data['label'].astype(int)

# BERT encoder with a two-class softmax head
class BERTClassifier(torch.nn.Module):
    """Wrap a BERT encoder with a 768 -> 2 linear layer and a softmax.

    Args:
        bert_model: Encoder module called with ``input_ids`` and
            ``attention_mask`` whose result exposes ``pooler_output``.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.classifier = torch.nn.Linear(in_features=768, out_features=2)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Return softmax-normalized class scores for each input row."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        scores = self.classifier(pooled)
        return self.softmax(scores)

# Load BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_classifier = BERTClassifier(bert_model)

# Load the state dictionary.
# map_location='cpu' lets a checkpoint that was saved from a GPU session load
# on a CPU-only runtime; the model is never moved to another device in this
# cell, so inference runs on CPU either way.
state_dict = torch.load('/content/drive/MyDrive/D3/bert_model.pth', map_location='cpu')

# Load the state dictionary into the model and switch to evaluation mode
bert_classifier.load_state_dict(state_dict)
bert_classifier.eval()

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define prediction function with batching
def predict_bert(texts, batch_size=16, return_proba=False):
    """Run the BERT classifier over ``texts`` in batches.

    Uses the module-level ``tokenizer``, ``bert_classifier`` and
    ``clear_memory``.

    Args:
        texts: Iterable of strings (a pandas Series, list, ...).
        batch_size: Number of texts tokenized and scored per forward pass.
        return_proba: When True, return the classifier's per-class outputs
            instead of class indices.

    Returns:
        A 1-D array of predicted class indices, or an ``(n_texts, n_classes)``
        array of the classifier's outputs when ``return_proba`` is True.
    """
    texts = list(texts)  # positional slicing below, whatever the input container
    batch_outputs = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = bert_classifier(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        batch_outputs.append(outputs.cpu().numpy())
        clear_memory()  # Clear memory after each batch

    if not batch_outputs:
        # Nothing to score: mirror the empty result shapes of the normal path
        return np.empty((0, 2)) if return_proba else np.array([])

    probabilities = np.concatenate(batch_outputs, axis=0)
    if return_proba:
        return probabilities
    return probabilities.argmax(axis=1)

# Define Naive Bayes prediction function
def predict_naive_bayes(texts, return_proba=False):
    """Predict with the module-level ``nb_pipeline``.

    Args:
        texts: Iterable of strings accepted by the pipeline.
        return_proba: When True, return ``predict_proba`` output (one column
            per entry of ``nb_pipeline.classes_``) instead of labels.

    Returns:
        Predicted labels, or class probabilities when ``return_proba`` is True.
    """
    if return_proba:
        return nb_pipeline.predict_proba(texts)
    return nb_pipeline.predict(texts)

# Process in subsets to manage memory
def process_in_subsets(X_test, y_test, subset_size=100):
    """Predict labels for ``X_test`` by soft-voting BERT and Naive Bayes.

    The two models' class probabilities are averaged and the most probable
    class wins. Hard majority voting cannot work with only two voters: every
    disagreement is a 1-1 tie, and ``np.bincount(...).argmax()`` resolves ties
    to class 0, so the ensemble could only predict 1 when both models did.

    Args:
        X_test: Iterable of input texts.
        y_test: Unused; kept so existing calls keep working.
        subset_size: Number of texts handled per chunk.

    Returns:
        1-D array with one predicted label per input text.
    """
    texts = list(X_test)
    combined_predictions = []
    for start in range(0, len(texts), subset_size):
        X_subset = texts[start:start + subset_size]

        # Get class probabilities from both models
        bert_probabilities = predict_bert(X_subset, return_proba=True)
        nb_probabilities = predict_naive_bayes(X_subset, return_proba=True)

        # Assumes BERT output column i corresponds to nb_pipeline.classes_[i]
        # (labels 0 and 1) — TODO confirm against the training code.
        mean_probabilities = (nb_probabilities + bert_probabilities) / 2.0
        combined_predictions.extend(nb_pipeline.classes_[mean_probabilities.argmax(axis=1)])

        clear_memory()  # Clear memory after each subset

    return np.array(combined_predictions)

# Run the ensemble over the sampled test set, chunk by chunk
combined_predictions = process_in_subsets(X_test, y_test)

# Headline metric first
accuracy = accuracy_score(y_test, combined_predictions)
print(f"Combined Accuracy: {accuracy:.2%}")

# Then the per-class report and the confusion matrix, each under its heading
for heading, metric in (
    ("Classification Report", classification_report),
    ("Confusion Matrix", confusion_matrix),
):
    print(heading)
    print(metric(y_test, combined_predictions))

Combined Accuracy: 52.50%
Classification Report
              precision    recall  f1-score   support

           0       0.52      1.00      0.69       520
           1       1.00      0.01      0.02       480

    accuracy                           0.53      1000
   macro avg       0.76      0.51      0.35      1000
weighted avg       0.75      0.53      0.37      1000

Confusion Matrix
[[520   0]
 [475   5]]
