In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and
# written by the cells below. Only applicable when running on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Grid Search (Random Forest on TF-IDF features)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Locations of the three dataset splits on the mounted Drive
train_path = '/content/drive/MyDrive/D1/train.csv'
val_path = '/content/drive/MyDrive/D1/val.csv'
test_path = '/content/drive/MyDrive/D1/test.csv'

# Read train / validation / test in that order
train_data, val_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Inputs come from the 'title' column and targets from the 'label' column
X_train, y_train = train_data['title'], train_data['label']
X_val, y_val = val_data['title'], val_data['label']
X_test, y_test = test_data['title'], test_data['label']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary (capped at 1000 features) on the training titles
# only, then reuse that fitted vectorizer for the validation and test titles.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split_texts) for split_texts in (X_val, X_test)
)

In [None]:
# Candidate hyperparameters: 3 * 4 * 3 * 3 = 108 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator with a fixed seed
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated exhaustive search on the training features, all cores
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2).fit(X_train_tfidf, y_train)

# Estimator for the best-scoring combination
best_rf = grid_search.best_estimator_

# Score the selected forest on the held-out validation and test splits
val_predictions = best_rf.predict(X_val_tfidf)
test_predictions = best_rf.predict(X_test_tfidf)
val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Validation Accuracy: {val_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Validation Accuracy: 0.9324424647364514
Test Accuracy: 0.939963614311704


In [None]:
# Destination files for the tuned forest and the TF-IDF vectorizer it was fitted with
rf_model_path = '/content/drive/MyDrive/D1/D1_Random/best_rf_model.pkl'
vectorizer_path = '/content/drive/MyDrive/D1/D1_Random/tfidf_vectorizer.pkl'

# Persist both objects so they can be reloaded together later
joblib.dump(best_rf, rf_model_path)
joblib.dump(vectorizer, vectorizer_path)

['/content/drive/MyDrive/D1/D1_Random/tfidf_vectorizer.pkl']

# Naive Bayes


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Load datasets
train_path = '/content/drive/MyDrive/D1/train.csv'
val_path = '/content/drive/MyDrive/D1/val.csv'
test_path = '/content/drive/MyDrive/D1/test.csv'

train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)
test_data = pd.read_csv(test_path)

# Encode the target column; the encoder is fitted on the training labels only
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
val_data['label'] = label_encoder.transform(val_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# Turn the raw 'title' text into numeric features. The classifier cannot be
# fitted on a string column, so the text is vectorized with the same TF-IDF
# settings used in the grid-search section. GaussianNB needs dense input,
# hence .toarray().
nb_vectorizer = TfidfVectorizer(max_features=1000)
X_train = nb_vectorizer.fit_transform(train_data['title']).toarray()
X_val = nb_vectorizer.transform(val_data['title']).toarray()
X_test = nb_vectorizer.transform(test_data['title']).toarray()

y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']

# Initialize the Naive Bayes classifier (GaussianNB)
nb_classifier = GaussianNB()

# Fit the model
nb_classifier.fit(X_train, y_train)

# Predict on validation set
y_val_pred = nb_classifier.predict(X_val)

# Evaluate performance
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy (Naive Bayes): {val_accuracy:.4f}")

# Predict on test set
y_test_pred = nb_classifier.predict(X_test)

# Evaluate test performance
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy (Naive Bayes): {test_accuracy:.4f}")

# Optionally, print classification report.
# The class names are converted to strings because the original labels may be
# numeric, and the report formats target names as text.
print("\nClassification Report on Test Set (Naive Bayes):")
print(classification_report(
    y_test,
    y_test_pred,
    target_names=[str(class_name) for class_name in label_encoder.classes_],
))


# Population-Based Training (PBT)

In [None]:
pip install scikit-learn ray tune optuna


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining
from sklearn.preprocessing import LabelEncoder
# OptunaSearch lives under ray.tune.search in the Ray 2.x releases that also
# provide tune.Tuner (used below).
from ray.tune.search.optuna import OptunaSearch

# Load datasets
train_path = '/content/drive/MyDrive/D1/train.csv'
val_path = '/content/drive/MyDrive/D1/val.csv'
test_path = '/content/drive/MyDrive/D1/test.csv'

train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)
test_data = pd.read_csv(test_path)

# Preprocess data: encode the labels (encoder fitted on the training split only)
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
val_data['label'] = label_encoder.transform(val_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# The forest cannot be fitted on the raw 'title' strings, so vectorize them
# with the same TF-IDF settings used in the grid-search section.
pbt_vectorizer = TfidfVectorizer(max_features=1000)
X_train = pbt_vectorizer.fit_transform(train_data['title'])
X_val = pbt_vectorizer.transform(val_data['title'])
X_test = pbt_vectorizer.transform(test_data['title'])

y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']

# Define the objective function for Ray Tune
def train_random_forest(config):
    """Train one Random Forest from `config` and report its validation accuracy.

    Args:
        config: dict with the keys "n_estimators", "max_depth",
            "min_samples_split" and "min_samples_leaf".

    Returns:
        dict: {"accuracy": <validation accuracy>}; Ray Tune records the
        returned dict as the trial's final result.

    Reads X_train, y_train, X_val and y_val from the notebook namespace.
    """
    model = RandomForestClassifier(
        n_estimators=config["n_estimators"],
        max_depth=config["max_depth"],
        min_samples_split=config["min_samples_split"],
        min_samples_leaf=config["min_samples_leaf"],
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    # Returning the metrics avoids tune.report(**kwargs), whose signature
    # differs between Ray releases.
    return {"accuracy": accuracy}

# Define search space
search_space = {
    "n_estimators": tune.randint(100, 500),
    "max_depth": tune.randint(10, 50),
    "min_samples_split": tune.randint(2, 20),
    "min_samples_leaf": tune.randint(1, 10)
}

# Setup Ray Tune reporter.
# NOTE(review): with the Tuner API a reporter is attached through
# RunConfig(progress_reporter=...); the import location of RunConfig depends
# on the installed Ray version, so it is not wired in here — confirm and attach.
reporter = CLIReporter(
    metric_columns=["accuracy", "training_iteration"]
)

# Setup Population-Based Training scheduler.
# NOTE(review): each trial reports exactly one result and saves no checkpoint,
# so PBT presumably has little opportunity to exploit/perturb — confirm this
# scheduler is the intended choice for a single-shot scikit-learn fit.
pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="accuracy",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "n_estimators": [100, 200, 300],
        "max_depth": [10, 20, 30, 40],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
)

# Start the tuning process. The Tuner receives the trainable itself (wrapped
# with its per-trial resources), not the result of a tune.run(...) call.
tuner = tune.Tuner(
    tune.with_resources(
        train_random_forest,
        resources={"cpu": 1, "gpu": 0}
    ),
    param_space=search_space,
    tune_config=tune.TuneConfig(
        num_samples=10,
        scheduler=pbt_scheduler
    )
)

results = tuner.fit()

# Print the best hyperparameters. Metric and mode are given explicitly because
# they are configured on the scheduler rather than on TuneConfig.
best_config = results.get_best_result(metric="accuracy", mode="max").config
print(f"Best hyperparameters: {best_config}")

# Train the final model with the best hyperparameters
best_rf_model = RandomForestClassifier(
    n_estimators=best_config["n_estimators"],
    max_depth=best_config["max_depth"],
    min_samples_split=best_config["min_samples_split"],
    min_samples_leaf=best_config["min_samples_leaf"],
    random_state=42
)
best_rf_model.fit(X_train, y_train)

# Evaluate on test set
y_test_pred = best_rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Optionally, print classification report (class names converted to strings
# because the report formats target names as text)
print("\nClassification Report on Test Set:")
print(classification_report(
    y_test,
    y_test_pred,
    target_names=[str(class_name) for class_name in label_encoder.classes_],
))


# Genetic Algorithm

In [None]:
pip install scikit-learn deap


In [None]:
import pandas as pd
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, algorithms
from sklearn.preprocessing import LabelEncoder

# Load datasets
train_path = '/content/drive/MyDrive/D1/train.csv'
val_path = '/content/drive/MyDrive/D1/val.csv'
test_path = '/content/drive/MyDrive/D1/test.csv'

train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)
test_data = pd.read_csv(test_path)

# Preprocess data: encode the labels (encoder fitted on the training split only)
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
val_data['label'] = label_encoder.transform(val_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# The forest cannot be fitted on the raw 'title' strings, so vectorize them
# with the same TF-IDF settings used in the grid-search section.
ga_vectorizer = TfidfVectorizer(max_features=1000)
X_train = ga_vectorizer.fit_transform(train_data['title'])
X_val = ga_vectorizer.transform(val_data['title'])
X_test = ga_vectorizer.transform(test_data['title'])

y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']

# Define the evaluation function
def evaluate_individual(individual):
    """Fitness function: validation accuracy of the forest an individual encodes.

    Args:
        individual: sequence whose first four items are, in order,
            n_estimators, max_depth, min_samples_split and min_samples_leaf.

    Returns:
        tuple: one-element tuple ``(accuracy,)`` measured on X_val / y_val.

    Reads X_train, y_train, X_val and y_val from the notebook namespace.
    """
    # Map gene positions to RandomForestClassifier keyword arguments
    forest_params = {
        "n_estimators": individual[0],
        "max_depth": individual[1],
        "min_samples_split": individual[2],
        "min_samples_leaf": individual[3],
    }

    forest = RandomForestClassifier(random_state=42, **forest_params)
    forest.fit(X_train, y_train)

    # Fitness is returned as a one-element tuple
    return (accuracy_score(y_val, forest.predict(X_val)),)

# Define Genetic Algorithm parameters
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Define attributes (hyperparameters)
toolbox.register("n_estimators", random.choice, [100, 200, 300])
toolbox.register("max_depth", random.choice, [10, 20, 30, 40])
toolbox.register("min_samples_split", random.choice, [2, 5, 10])
toolbox.register("min_samples_leaf", random.choice, [1, 2, 4])

# Define the individual and population size
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.n_estimators, toolbox.max_depth, toolbox.min_samples_split, toolbox.min_samples_leaf), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Define genetic operators
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutUniformInt, low=[100, 10, 2, 1], up=[300, 40, 10, 4], indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate_individual)

# Number of generations and population size
NGEN = 10
POP_SIZE = 10

def main():
    """Run the genetic search, then train and test a forest with the best genes.

    Evolves a population of POP_SIZE individuals for NGEN generations using the
    operators registered on `toolbox`, printing the min/max fitness of each
    generation. The best individual seen in any generation is kept in a hall of
    fame, because selection, crossover and mutation give no guarantee that it
    survives into the final population. A final RandomForestClassifier is
    trained with that individual's hyperparameters and its test accuracy is
    printed.
    """
    pop = toolbox.population(n=POP_SIZE)

    # Keeps a copy of the single best individual evaluated so far
    hall_of_fame = tools.HallOfFame(1)

    # Evaluate the entire population
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    hall_of_fame.update(pop)

    # Begin the evolution
    for g in range(NGEN):
        print(f"-- Generation {g+1} --")

        # Select the next generation individuals
        offspring = toolbox.select(pop, len(pop))
        # Clone the selected individuals so the operators below do not modify
        # the objects still held in `pop`
        offspring = list(map(toolbox.clone, offspring))

        # Apply crossover on consecutive pairs with probability 0.5
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < 0.5:
                toolbox.mate(child1, child2)
                # Genes changed, so the stored fitness is no longer valid
                del child1.fitness.values
                del child2.fitness.values

        # Apply mutation to each offspring with probability 0.2
        for mutant in offspring:
            if random.random() < 0.2:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Replace the population with the offspring
        pop[:] = offspring
        hall_of_fame.update(pop)

        # Print the statistics for the current generation
        fits = [ind.fitness.values[0] for ind in pop]
        print(f"  Min Fitness: {min(fits)}")
        print(f"  Max Fitness: {max(fits)}")

    # Best individual across all generations, not only the last one
    best_ind = hall_of_fame[0]
    print(f"\nBest individual: {best_ind}")
    print(f"Best fitness: {best_ind.fitness.values[0]}")

    # Train the final model with the best hyperparameters
    n_estimators, max_depth, min_samples_split, min_samples_leaf = best_ind
    best_rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    best_rf_model.fit(X_train, y_train)

    # Evaluate on test set
    y_test_pred = best_rf_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"\nTest Accuracy: {test_accuracy:.4f}")

if __name__ == "__main__":
    main()


# Hyperband

In [None]:
pip install scikit-learn ray[tune]


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import HyperBandForBOHB, HyperBandScheduler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load datasets
train_path = '/content/drive/MyDrive/D1/train.csv'
val_path = '/content/drive/MyDrive/D1/val.csv'
test_path = '/content/drive/MyDrive/D1/test.csv'

train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)
test_data = pd.read_csv(test_path)

# Preprocess data: encode the labels (encoder fitted on the training split only)
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
val_data['label'] = label_encoder.transform(val_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# The forest cannot be fitted on the raw 'title' strings, so vectorize them
# with the same TF-IDF settings used in the grid-search section.
hb_vectorizer = TfidfVectorizer(max_features=1000)
X_train = hb_vectorizer.fit_transform(train_data['title'])
X_val = hb_vectorizer.transform(val_data['title'])
X_test = hb_vectorizer.transform(test_data['title'])

y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']

# Define the trainable function
def train_random_forest(config):
    """Train one Random Forest from `config` and report its validation accuracy.

    Args:
        config: dict with the keys "n_estimators", "max_depth",
            "min_samples_split" and "min_samples_leaf".

    Returns:
        dict: {"accuracy": <validation accuracy>}. The metric must be reported
        under the name "accuracy" because the scheduler and the best-trial
        lookup below both refer to it by that key.

    Reads X_train, y_train, X_val and y_val from the notebook namespace.
    """
    model = RandomForestClassifier(
        n_estimators=config["n_estimators"],
        max_depth=config["max_depth"],
        min_samples_split=config["min_samples_split"],
        min_samples_leaf=config["min_samples_leaf"],
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return {"accuracy": accuracy}

# Define search space
search_space = {
    "n_estimators": tune.randint(100, 500),
    "max_depth": tune.randint(10, 50),
    "min_samples_split": tune.randint(2, 20),
    "min_samples_leaf": tune.randint(1, 10)
}

# Setup Ray Tune reporter
reporter = CLIReporter(
    metric_columns=["accuracy", "training_iteration"]
)

# Setup the HyperBand scheduler. The plain HyperBandScheduler is used because
# no BOHB searcher is configured in this cell.
# NOTE(review): HyperBandForBOHB is documented for use together with the
# TuneBOHB search algorithm — switch back only if that searcher is added.
hyperband_scheduler = HyperBandScheduler(
    time_attr="training_iteration",
    max_t=100,
    reduction_factor=3,
    metric="accuracy",
    mode="max"
)

# Start the tuning process
analysis = tune.run(
    train_random_forest,
    config=search_space,
    scheduler=hyperband_scheduler,
    resources_per_trial={"cpu": 1, "gpu": 0},
    num_samples=20,
    progress_reporter=reporter,
    verbose=1
)

# Get the best trial
best_trial = analysis.get_best_trial(metric="accuracy", mode="max")
best_config = best_trial.config
best_accuracy = best_trial.last_result["accuracy"]

print(f"Best hyperparameters: {best_config}")
print(f"Best accuracy: {best_accuracy:.4f}")

# Train the final model with the best hyperparameters
best_rf_model = RandomForestClassifier(
    n_estimators=best_config["n_estimators"],
    max_depth=best_config["max_depth"],
    min_samples_split=best_config["min_samples_split"],
    min_samples_leaf=best_config["min_samples_leaf"],
    random_state=42
)
best_rf_model.fit(X_train, y_train)

# Evaluate on test set
y_test_pred = best_rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


# BERT + Grid-Search Random Forest Ensemble

In [None]:
import torch
import numpy as np
import pandas as pd
import joblib
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score

# Tokenizer for the same 'bert-base-uncased' checkpoint as the model below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# BERT with a sequence-classification head, put into evaluation mode.
# NOTE: the recorded output of this cell warns that the classifier weights are
# newly initialized and that the model should be trained before being used for
# predictions.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
bert_model.eval()

# Define the EnsembleModel class
class EnsembleModel:
    """Average a BERT classifier and a TF-IDF Random Forest into one prediction.

    Both members contribute their probability for the class at index 1; the
    two probabilities are averaged and thresholded at 0.5.

    Args:
        bert_model: callable model; called with the tokenizer output as
            keyword arguments and expected to return an object with a
            ``logits`` tensor of shape (batch, num_classes).
        rf_model: fitted classifier exposing ``predict_proba``.
        tokenizer: callable that turns a list of strings into model inputs.
        vectorizer: fitted transformer exposing ``transform`` for the
            Random Forest features.
        batch_size: number of texts processed per step.
    """

    def __init__(self, bert_model, rf_model, tokenizer, vectorizer, batch_size=32):
        self.bert_model = bert_model
        self.rf_model = rf_model
        self.tokenizer = tokenizer
        self.vectorizer = vectorizer
        self.batch_size = batch_size

    def predict(self, texts):
        """Predict a 0/1 label for every text.

        Args:
            texts: sequence of strings.

        Returns:
            numpy.ndarray of ints with one entry per text (empty for empty
            input).
        """
        texts = list(texts)
        if not texts:
            # np.concatenate rejects an empty list, so short-circuit here
            return np.array([], dtype=int)

        bert_probs = []
        rf_probs = []

        for start in range(0, len(texts), self.batch_size):
            batch_texts = texts[start:start + self.batch_size]

            # BERT: convert logits to probabilities so they share a scale with
            # the Random Forest output before averaging
            inputs = self.tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                logits = self.bert_model(**inputs).logits
            bert_probs.append(torch.softmax(logits, dim=-1)[:, 1].cpu().numpy())

            # Random Forest: probability of the class at index 1
            X_batch = self.vectorizer.transform(batch_texts)
            rf_probs.append(self.rf_model.predict_proba(X_batch)[:, 1])

        bert_probs = np.concatenate(bert_probs, axis=0)
        rf_probs = np.concatenate(rf_probs, axis=0)

        # Combine predictions (simple averaging of the two probabilities)
        combined_outputs = (bert_probs + rf_probs) / 2
        return (combined_outputs > 0.5).astype(int)

# Load the Random Forest model and TF-IDF vectorizer saved by the grid-search
# section. joblib.load unpickles the files, so only load artifacts you trust.
rf_model_path = '/content/drive/MyDrive/D1/D1_Random/best_rf_model.pkl'
vectorizer_path = '/content/drive/MyDrive/D1/D1_Random/tfidf_vectorizer.pkl'

best_rf_model = joblib.load(rf_model_path)
vectorizer = joblib.load(vectorizer_path)

# Load test data. The test split is read here so that the figure printed as
# "Ensemble Test Accuracy" is actually measured on the test set rather than on
# the validation file.
test_data = pd.read_csv('/content/drive/MyDrive/D1/test.csv')
test_texts = test_data['title'].tolist()

# Instantiate the ensemble model
ensemble_model = EnsembleModel(bert_model, best_rf_model, tokenizer, vectorizer, batch_size=32)

# Make predictions on the test set
ensemble_predictions = ensemble_model.predict(test_texts)

# Calculate accuracy
accuracy = accuracy_score(test_data['label'], ensemble_predictions)
print(f"Ensemble Test Accuracy: {accuracy}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ensemble Test Accuracy: 0.5115070527097253


# Ensemble evaluation on a validation subset

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import joblib

# Tokenizer for the same 'bert-base-uncased' checkpoint as the model below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# BERT with a sequence-classification head, put into evaluation mode.
# NOTE: the recorded output of this cell warns that the classifier weights are
# newly initialized and that the model should be trained before being used for
# predictions.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
bert_model.eval()

# Define the EnsembleModel class
class EnsembleModel:
    """Average a BERT classifier and a TF-IDF Random Forest into one prediction.

    Both members contribute their probability for the class at index 1; the
    two probabilities are averaged and thresholded at 0.5. Optionally only the
    first ``subset_size`` texts are scored.

    Args:
        bert_model: callable model; called with the tokenizer output as
            keyword arguments and expected to return an object with a
            ``logits`` tensor of shape (batch, num_classes).
        rf_model: fitted classifier exposing ``predict_proba``.
        tokenizer: callable that turns a list of strings into model inputs.
        vectorizer: fitted transformer exposing ``transform`` for the
            Random Forest features.
        batch_size: number of texts processed per step.
        subset_size: maximum number of leading texts to score per call;
            ``None`` scores every text.
    """

    def __init__(self, bert_model, rf_model, tokenizer, vectorizer, batch_size=32, subset_size=None):
        self.bert_model = bert_model
        self.rf_model = rf_model
        self.tokenizer = tokenizer
        self.vectorizer = vectorizer
        self.batch_size = batch_size
        self.subset_size = subset_size

    def predict(self, texts):
        """Predict a 0/1 label for the first ``subset_size`` texts (or all of them).

        The configured ``subset_size`` is left untouched, so a call with a short
        input does not shrink the limit applied to later calls.

        Args:
            texts: sequence of strings.

        Returns:
            numpy.ndarray of ints with one entry per scored text (empty for
            empty input).
        """
        texts = list(texts)
        if self.subset_size is not None:
            # Slicing past the end is harmless, so no explicit min() is needed
            texts = texts[:self.subset_size]

        if not texts:
            # np.concatenate rejects an empty list, so short-circuit here
            return np.array([], dtype=int)

        bert_probs = []
        rf_probs = []

        for start in range(0, len(texts), self.batch_size):
            batch_texts = texts[start:start + self.batch_size]

            # BERT: convert logits to probabilities so they share a scale with
            # the Random Forest output before averaging
            inputs = self.tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                logits = self.bert_model(**inputs).logits
            bert_probs.append(torch.softmax(logits, dim=-1)[:, 1].cpu().numpy())

            # Random Forest: probability of the class at index 1
            X_batch = self.vectorizer.transform(batch_texts)
            rf_probs.append(self.rf_model.predict_proba(X_batch)[:, 1])

        bert_probs = np.concatenate(bert_probs, axis=0)
        rf_probs = np.concatenate(rf_probs, axis=0)

        # Combine predictions (simple averaging of the two probabilities)
        combined_outputs = (bert_probs + rf_probs) / 2
        return (combined_outputs > 0.5).astype(int)

# Paths of the artifacts written by the grid-search section
rf_model_path = '/content/drive/MyDrive/D1/D1_Random/best_rf_model.pkl'
vectorizer_path = '/content/drive/MyDrive/D1/D1_Random/tfidf_vectorizer.pkl'

# Restore the tuned forest and its TF-IDF vectorizer
best_rf_model = joblib.load(rf_model_path)
vectorizer = joblib.load(vectorizer_path)

# Validation split: labels as an array, titles as a plain list of strings
val_path = '/content/drive/MyDrive/D1/val.csv'
val_data = pd.read_csv(val_path)
y_val = val_data['label'].values
val_titles = val_data['title'].tolist()

# Ensemble limited to the first 1000 validation titles
ensemble_model = EnsembleModel(bert_model, best_rf_model, tokenizer, vectorizer, batch_size=32, subset_size=1000)
ensemble_predictions = ensemble_model.predict(val_titles)

# One prediction is returned per scored title, so trim the labels to match
ensemble_accuracy = accuracy_score(y_val[:len(ensemble_predictions)], ensemble_predictions)
print(f"Ensemble Validation Accuracy: {ensemble_accuracy}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ensemble Validation Accuracy: 0.723
