# Grid Search CV

In [None]:
# Mount Google Drive at /content/drive (Colab only). Every dataset and model
# path used by the cells below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# ---- Data ------------------------------------------------------------------
# Read the training CSV from Drive; the raw text is the input and the label
# column (cast to int) is the target.
data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
data = pd.read_csv(data_path)
X, y = data['text'], data['label'].astype(int)

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---- Features --------------------------------------------------------------
# TF-IDF capped at 500 terms to keep the SVM fits quick. The vocabulary is
# learned from the training split only and then applied to the held-out split.
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ---- Search ----------------------------------------------------------------
# Deliberately small grid (2 x 2 x 1 = 4 candidates) with 3-fold CV.
svm = SVC()
param_grid = dict(C=[1, 10], gamma=[0.1, 0.01], kernel=['rbf'])
grid_search = GridSearchCV(svm, param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# ---- Evaluation ------------------------------------------------------------
# `best_estimator_` is the winning candidate refit on the full training split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation score: 0.9137013078708662
Accuracy: 92.20%
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       766
           1       0.93      0.94      0.93      1118

    accuracy                           0.92      1884
   macro avg       0.92      0.92      0.92      1884
weighted avg       0.92      0.92      0.92      1884

Confusion Matrix:
[[ 684   82]
 [  65 1053]]


In [None]:
import joblib

# Persist the tuned SVC *and* the TF-IDF vectorizer it was trained with.
# The classifier only accepts the (at most 500-column) matrix produced by this
# exact fitted vectorizer, so a model pickle without it cannot score new text
# once the notebook session is gone.
model_path = '/content/drive/MyDrive/D3/D3_svm/hyper.pkl'
vectorizer_path = '/content/drive/MyDrive/D3/D3_svm/hyper_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_path)

# Kept as the last expression so the cell still echoes the model path.
joblib.dump(best_model, model_path)

['/content/drive/MyDrive/D3/D3_svm/hyper.pkl']

# Bayesian Optimization (BayesSearchCV)

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m102.4/107.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.2


In [None]:
import pandas as pd
from skopt import BayesSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import joblib
from sklearn.metrics import classification_report, accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer

# Locations of the pre-split dataset on the mounted Drive.
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'

train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)

# Scaler + SVC pipeline. with_mean=False is required because the TF-IDF
# features are a sparse matrix, which StandardScaler cannot centre.
model = make_pipeline(StandardScaler(with_mean=False), SVC())

# Search space; the `svc__` prefix routes each parameter to the SVC step of
# the pipeline. C and gamma are sampled on a log scale.
param_space = {
    'svc__C': (1e-6, 100.0, 'log-uniform'),
    'svc__gamma': (1e-6, 100.0, 'log-uniform'),
    'svc__kernel': ['linear', 'rbf']
}

# Bayesian optimisation over the space above.
opt = BayesSearchCV(
    model,
    param_space,
    n_iter=5,  # Adjust the number of iterations as needed
    cv=5,  # Cross-validation folds
    n_jobs=-1,  # Use all available cores
    verbose=1,  # Print optimization progress
    random_state=42  # Fixed seed so the sampled candidates are reproducible
)

# Full-vocabulary TF-IDF fitted on the training texts only; the evaluation
# cell reuses this fitted `vectorizer` for the validation and test splits.
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_data['text'])

# Fit the optimizer on the transformed training data
opt.fit(train_features, train_data['label'])

# Save the best pipeline together with the vectorizer that produced its input
# features; the pipeline alone cannot turn raw text into a feature matrix.
save_path = '/content/drive/MyDrive/D3/D3_svm/bayesiansvm.pkl'
vectorizer_save_path = '/content/drive/MyDrive/D3/D3_svm/bayesiansvm_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_save_path)

# Kept as the last expression so the cell still echoes the model path.
joblib.dump(opt.best_estimator_, save_path)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


['/content/drive/MyDrive/D3/D3_svm/bayesiansvm.pkl']

In [None]:
# ---- Validation split ------------------------------------------------------
# Featurise with the vectorizer fitted on the training texts, then score the
# refit best estimator held by `opt`.
y_val = val_data['label']
X_val = vectorizer.transform(val_data['text'])
y_val_pred = opt.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Validation Set Performance:")
print(classification_report(y_val, y_val_pred))
print(f"Validation Accuracy: {val_accuracy:.4f}")

# ---- Test split ------------------------------------------------------------
# Same procedure on the held-out test file.
y_test = test_data['label']
X_test = vectorizer.transform(test_data['text'])
y_test_pred = opt.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Test Set Performance:")
print(classification_report(y_test, y_test_pred))
print(f"Test Accuracy: {test_accuracy:.4f}")

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       850
           1       0.92      0.95      0.94      1168

    accuracy                           0.93      2018
   macro avg       0.93      0.92      0.93      2018
weighted avg       0.93      0.93      0.93      2018

Validation Accuracy: 0.9277
Test Set Performance:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92       822
           1       0.94      0.94      0.94      1196

    accuracy                           0.93      2018
   macro avg       0.93      0.93      0.93      2018
weighted avg       0.93      0.93      0.93      2018

Test Accuracy: 0.9316


# PBT (note: this section currently trains a single fixed linear SVC; no population-based search is run)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Step 1: Load the datasets
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'

train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)

# Encode labels as consecutive integers. The encoder is fitted on the training
# labels only, so an unseen label in val/test raises instead of being silently
# mis-mapped.
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
val_data['label'] = label_encoder.transform(val_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# Step 2: Feature extraction (TF-IDF, vocabulary capped at 1000 terms and
# learned from the training texts only)
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_data['text'])
X_val_tfidf = vectorizer.transform(val_data['text'])
X_test_tfidf = vectorizer.transform(test_data['text'])
y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']

# Step 3: Train the SVM model (one fixed configuration: linear kernel, C=1.0)
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Step 4: Save the trained model together with its fitted vectorizer. The SVC
# only accepts the feature matrix this vectorizer produces, so the model
# pickle alone could not score raw text in a new session.
model_save_path = '/content/drive/MyDrive/D3/D3_svm/pbt_svm.pkl'
vectorizer_save_path = '/content/drive/MyDrive/D3/D3_svm/pbt_svm_vectorizer.pkl'
joblib.dump(svm_model, model_save_path)
joblib.dump(vectorizer, vectorizer_save_path)
print(f"Model saved successfully at {model_save_path}")

# Step 5: Evaluate the model's accuracy on validation data
val_predictions = svm_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Step 6: Evaluate the model's accuracy on test data
test_predictions = svm_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")

Model saved successfully at /content/drive/MyDrive/D3/D3_svm/pbt_svm.pkl
Validation Accuracy: 0.9390
Test Accuracy: 0.9430


# Genetic Algorithm

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
# Step 1: Load the datasets
train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'

train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)

# Encode labels as consecutive integers (encoder fitted on training labels only)
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
val_data['label'] = label_encoder.transform(val_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# Step 2: Feature extraction (TF-IDF, vocabulary capped at 1000 terms)
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_data['text'])
X_val_tfidf = vectorizer.transform(val_data['text'])
X_test_tfidf = vectorizer.transform(test_data['text'])
y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']

# Genetic Algorithm Settings
population_size = 10
num_generations = 5
mutation_rate = 0.1

# Seed NumPy's global RNG so the sampled hyperparameters (here and in the
# evolution cell, which also draws from np.random) are the same on every
# Restart & Run All.
np.random.seed(42)

# Initial population: each individual is a tuple
# (fitted model, hyperparameter dict, validation accuracy).
population = []
for _ in range(population_size):
    # Randomly initialize SVM hyperparameters.
    C = np.random.uniform(0.1, 10.0)
    # np.random.choice returns a NumPy string scalar; convert to a plain str
    # so the stored/printed hyperparameters are ordinary Python values.
    kernel = str(np.random.choice(['linear', 'rbf']))
    # gamma is derived from the kernel rather than sampled.
    gamma = 'scale' if kernel == 'rbf' else 'auto'

    # Train SVM with current hyperparameters
    svm_model = SVC(kernel=kernel, C=C, gamma=gamma, random_state=42)
    svm_model.fit(X_train_tfidf, y_train)

    # Fitness = accuracy on the validation set
    val_predictions = svm_model.predict(X_val_tfidf)
    accuracy = accuracy_score(y_val, val_predictions)

    # Store model, hyperparameters and accuracy in population
    population.append((svm_model, {'kernel': kernel, 'C': C, 'gamma': gamma}, accuracy))

In [None]:
# Evolution loop
#
# Each generation: rank the population by validation accuracy, report the
# best individual, breed children from the top half, and build the next
# population from the elite plus those children.

# Number of top individuals copied unchanged into the next generation.
# Without this the whole population is replaced by offspring and the best
# configuration found so far can be lost between generations.
n_elite = min(1, population_size)

# Always keep at least one parent so parent sampling below cannot fail when
# population_size is 1.
n_parents = max(1, population_size // 2)

for generation in range(num_generations):
    # Sort population by fitness (accuracy), best first
    population.sort(key=lambda x: x[2], reverse=True)

    # Print the best model in the current generation
    best_model, best_hyperparams, best_accuracy = population[0]
    print(f"Generation {generation + 1}: Best Accuracy = {best_accuracy:.4f}, Hyperparameters = {best_hyperparams}")

    # Select top performers to produce offspring
    selected_parents = population[:n_parents]

    # Elitism: the best individuals survive as-is (already fitted and scored,
    # so they are not retrained).
    offspring_population = list(population[:n_elite])

    # Fill the remainder of the population with children.
    while len(offspring_population) < population_size:
        _, params1, _ = selected_parents[np.random.randint(len(selected_parents))]
        _, params2, _ = selected_parents[np.random.randint(len(selected_parents))]

        # Crossover: each hyperparameter is taken from one parent at random.
        child_params = {
            param_key: (params1 if np.random.rand() < 0.5 else params2)[param_key]
            for param_key in params1
        }

        # Mutation: with probability mutation_rate, resample C and/or kernel
        # from the same ranges used for the initial population.
        if np.random.rand() < mutation_rate:
            child_params['C'] = np.random.uniform(0.1, 10.0)
        if np.random.rand() < mutation_rate:
            child_params['kernel'] = str(np.random.choice(['linear', 'rbf']))

        # gamma is not an independent gene: it is derived from the kernel, as
        # in the initial population. Re-deriving it here prevents crossover
        # from pairing an rbf kernel with the 'auto' gamma of a linear parent.
        child_params['gamma'] = 'scale' if child_params['kernel'] == 'rbf' else 'auto'

        # Train SVM with the child's hyperparameters
        child_model = SVC(kernel=child_params['kernel'], C=child_params['C'], gamma=child_params['gamma'], random_state=42)
        child_model.fit(X_train_tfidf, y_train)

        # Evaluate fitness on validation set
        val_predictions = child_model.predict(X_val_tfidf)
        accuracy = accuracy_score(y_val, val_predictions)

        # Add child to offspring population
        offspring_population.append((child_model, child_params, accuracy))

    # Next generation = elite + children
    population = offspring_population

Generation 1: Best Accuracy = 0.9470, Hyperparameters = {'kernel': 'rbf', 'C': 1.8480569904918551, 'gamma': 'scale'}
Generation 2: Best Accuracy = 0.9475, Hyperparameters = {'kernel': 'rbf', 'C': 2.121405951856806, 'gamma': 'scale'}
Generation 3: Best Accuracy = 0.9475, Hyperparameters = {'kernel': 'rbf', 'C': 2.121405951856806, 'gamma': 'scale'}
Generation 4: Best Accuracy = 0.9475, Hyperparameters = {'kernel': 'rbf', 'C': 2.121405951856806, 'gamma': 'scale'}
Generation 5: Best Accuracy = 0.9475, Hyperparameters = {'kernel': 'rbf', 'C': 2.121405951856806, 'gamma': 'scale'}


In [None]:
import joblib

# Pick the fittest individual of the final population (highest validation
# accuracy). The previous version of this cell repeated the whole
# sort/print/evaluate sequence twice, which only duplicated the output.
population.sort(key=lambda x: x[2], reverse=True)
best_model, best_hyperparams, best_accuracy = population[0]
print(f"Best Model - Accuracy on Validation Set: {best_accuracy:.4f}, Hyperparameters: {best_hyperparams}")

# Save the best model together with the TF-IDF vectorizer that produced its
# training features; the SVC alone cannot score raw text in a new session.
model_save_path = '/content/drive/MyDrive/D3/D3_svm/geneticsvm.pkl'
vectorizer_save_path = '/content/drive/MyDrive/D3/D3_svm/geneticsvm_vectorizer.pkl'
joblib.dump(best_model, model_save_path)
joblib.dump(vectorizer, vectorizer_save_path)
print(f"Saved the best SVM model to {model_save_path}")

# Evaluate once on the held-out test set
test_predictions = best_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Accuracy on Test Set: {test_accuracy:.4f}")

Best Model - Accuracy on Validation Set: 0.9475, Hyperparameters: {'kernel': 'rbf', 'C': 2.121405951856806, 'gamma': 'scale'}
Accuracy on Test Set: 0.9559
Best Model - Accuracy on Validation Set: 0.9475, Hyperparameters: {'kernel': 'rbf', 'C': 2.121405951856806, 'gamma': 'scale'}
Saved the best SVM model to /content/drive/MyDrive/D3/D3_svm/geneticsvm.pkl
Accuracy on Test Set: 0.9559


# Hyperband (note: the search below is actually run with skopt's BayesSearchCV)

In [None]:
pip install scikit-optimize



In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import joblib
from skopt import BayesSearchCV
from skopt.callbacks import VerboseCallback
from skopt.space import Real, Categorical, Integer

train_data_path = '/content/drive/MyDrive/D3/dataset/train.csv'
val_data_path = '/content/drive/MyDrive/D3/dataset/val.csv'
test_data_path = '/content/drive/MyDrive/D3/dataset/test.csv'

train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)


def _encode_labels(labels):
    """Return the label column as integer class ids.

    Numeric labels (e.g. 0/1, as the other sections of this notebook treat
    this column) are kept as they are and cast to int. Boolean or
    'true'/'false' text labels are mapped with the original convention of
    this cell: 'false' -> 1, 'true' -> 0.

    The previous implementation compared ``str(label).lower()`` with
    ``'false'`` unconditionally, which turns every numeric label into 0 and
    leaves the data with a single class.

    Raises:
        ValueError: if a value is neither numeric nor 'true'/'false'
            (this includes missing values).
    """
    # Booleans must take the text path: a numeric cast would map False -> 0,
    # the opposite of the 'false' -> 1 convention.
    if not pd.api.types.is_bool_dtype(labels):
        numeric = pd.to_numeric(labels, errors='coerce')
        if numeric.notna().all():
            return numeric.astype(int)

    lowered = labels.astype(str).str.strip().str.lower()
    unknown = sorted(set(lowered) - {'true', 'false'})
    if unknown:
        raise ValueError(f"Unrecognised label values: {unknown}")
    return (lowered == 'false').astype(int)


train_data['label'] = _encode_labels(train_data['label'])
val_data['label'] = _encode_labels(val_data['label'])
test_data['label'] = _encode_labels(test_data['label'])

# --- Check label distribution AFTER conversion ---
# Both classes should appear here; a single class means the conversion (or
# the data) is broken and the SVM cannot be trained.
print("Train data labels after conversion:", train_data['label'].value_counts())
print("Validation data labels after conversion:", val_data['label'].value_counts())

# Training split drives the hyperparameter search; the validation split is
# kept separate for evaluating the tuned model.
X_train = train_data['text']
y_train = train_data['label']
X_val = val_data['text']
y_val = val_data['label']


# Vectorize the text data (vocabulary capped at 1000 terms, learned from the
# training texts only)
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Search space handed to BayesSearchCV in the search cell below
param_space = {
    'C': Real(1e-6, 1e+6, prior='log-uniform'),
    'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
    'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
    'degree': Integer(1, 8),  # Only used for 'poly' kernel
}

Train data labels after conversion: label
0    9416
Name: count, dtype: int64
Validation data labels after conversion: label
0    2018
Name: count, dtype: int64


In [None]:
# Sanity check on the training labels: first the distinct values, then how
# many rows carry each value.
for describe_labels in (y_train.unique, y_train.value_counts):
    print(describe_labels())

[0]
label
0    9416
Name: count, dtype: int64


In [None]:
# An SVM classifier needs at least two distinct classes to train on; report
# whether the training labels satisfy that.
distinct_label_count = len(train_data['label'].unique())
if distinct_label_count > 1:
    print("You have multiple classes in your training data, you should be good to go!")
else:
    print("WARNING: Your training data only has one class. This will cause errors with SVM.")



In [None]:
# Preview the first rows of the training and validation frames, then show the
# class balance of the training labels.
for frame in (train_data, val_data):
    print(frame.head())
print(train_data['label'].value_counts())

      id                                              title           author  \
0  10724  Trumpâs FIRST Order: Anyone Burning An Ameri...     Martin Walsh   
1   7200  Global Migration Meets Magic in Mohsin Hamidâ...  Alexandra Alter   
2  19242  WATCH: Gingrich Accuses Megyn Kelly Of Being â...            Davis   
3  11695  Badass Patriot Has MASSIVE Surprise For Thieve...      Amanda Shea   
4   2205  James Wesley Rawles: âDouble Up On Your Prep...        Mac Slavo   

                                                text  label  
0  \nPosted by Martin Walsh | Nov 11, 2016 | Libe...      0  
1  In an unnamed,   city in the Muslim world, two...      0  
2  Hillary Howls in Laughter About Radical Muslim...      0  
3  Badass Patriot Has MASSIVE Surprise For Thieve...      0  
4  \nEnjoy your turkey, family events and holiday...      0  
      id                                              title  \
0  11387        How WiFi & Other EMFs Cause Biological Harm   
1   8187             

In [None]:
# Imported here so this section does not rely on `np` leaking in from an
# earlier section of the notebook.
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Initialize the SVM model
svc = SVC()

# Stratified K-Fold keeps the class ratio the same in every split
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Search over `param_space` with BayesSearchCV.
# `verbose` belongs on the constructor: extra keyword arguments given to
# `fit()` are forwarded to `SVC.fit`, which does not accept `verbose`.
hyperband = BayesSearchCV(
    svc,
    param_space,
    n_iter=5,
    cv=skf,
    n_jobs=-1,
    random_state=42,
    scoring='accuracy',
    verbose=2,
)

# Fail fast if any training fold contains a single class: SVC cannot be
# fitted on it. This raises instead of printing so that later cells cannot
# carry on and evaluate a stale `best_model` left over from another section.
for train_index, _ in skf.split(X_train_tfidf, y_train):
    if len(np.unique(y_train.iloc[train_index])) <= 1:
        raise ValueError(
            "A fold has only one class. This will cause errors with SVM. "
            "Check the label conversion before running the search."
        )

# Errors during fitting are allowed to propagate for the same reason: a
# swallowed failure here would leave no tuned model for the evaluation cell.
hyperband.fit(X_train_tfidf, y_train)
print(f"Best parameters found: {hyperband.best_params_}")

# Save the best model together with the TF-IDF vectorizer that produced its
# training features; the SVC alone cannot score raw text in a new session.
best_model = hyperband.best_estimator_
model_path = '/content/drive/MyDrive/D3/D3_svm/hyperbandsvm.pkl'
vectorizer_path = '/content/drive/MyDrive/D3/D3_svm/hyperbandsvm_vectorizer.pkl'
joblib.dump(best_model, model_path)
joblib.dump(vectorizer, vectorizer_path)



In [None]:
# Take the estimator straight from this section's search object. Reading the
# global `best_model` is unsafe: if the search cell did not fit, that name
# still points at the model from an earlier section and the report below
# would describe the wrong model. `best_estimator_` raises instead when the
# search has not been fitted.
best_model = hyperband.best_estimator_

# Evaluate the model on the validation set
y_val_pred = best_model.predict(X_val_tfidf)
print("Validation Set Performance:")
print(classification_report(y_val, y_val_pred))

# Preprocess the test data with the vectorizer fitted on the training texts
X_test = test_data['text']
y_test = test_data['label']
X_test_tfidf = vectorizer.transform(X_test)

# Evaluate the model on the test set
y_test_pred = best_model.predict(X_test_tfidf)
print("Test Set Performance:")
print(classification_report(y_test, y_test_pred))

Validation Set Performance:
              precision    recall  f1-score   support

           0       1.00      0.42      0.59      2018
           1       0.00      0.00      0.00         0

    accuracy                           0.42      2018
   macro avg       0.50      0.21      0.29      2018
weighted avg       1.00      0.42      0.59      2018



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
              precision    recall  f1-score   support

           0       1.00      0.40      0.58      2018
           1       0.00      0.00      0.00         0

    accuracy                           0.40      2018
   macro avg       0.50      0.20      0.29      2018
weighted avg       1.00      0.40      0.58      2018



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
