In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset and model paths under /content/drive/MyDrive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Population-based Training

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib  # Correct import for joblib

In [None]:
# Step 1: Load the datasets
# CSV locations of the three splits on the mounted Drive
train_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/train.csv'
val_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/val.csv'
test_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/test.csv'

# Read the splits in train / val / test order
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, val_data_path, test_data_path)
)

# Learn the label -> integer mapping from the training split only, then apply
# that same mapping to the 'label' column of every split.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['label'])
for split_frame in (train_data, val_data, test_data):
    split_frame['label'] = label_encoder.transform(split_frame['label'])


In [None]:
# Step 2: Feature extraction (TF-IDF)
# Targets: the 'label' column of each split
y_train, y_val, y_test = (
    split_frame['label'] for split_frame in (train_data, val_data, test_data)
)

# Fit the vocabulary (capped at 1000 terms) on the training titles only, then
# reuse the fitted vectorizer for the validation and test titles.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_data['title'])
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split_frame['title'])
    for split_frame in (val_data, test_data)
)

In [None]:
# Step 3: Train the SVM model
# A single linear-kernel SVC with fixed hyperparameters, fitted on the TF-IDF
# training matrix.
svm_hyperparams = {'kernel': 'linear', 'C': 1.0, 'random_state': 42}
svm_model = SVC(**svm_hyperparams)
svm_model.fit(X_train_tfidf, y_train)

In [None]:
# Step 4: Save the trained model
import os

model_save_path = '/content/drive/MyDrive/svmmodels/pbt_svm.pkl'
# Create the target folder first: joblib.dump does not create missing
# directories, so a Drive without `svmmodels/` would otherwise fail here.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
# NOTE(review): only the classifier is persisted in this cell; the fitted TF-IDF
# `vectorizer` is needed to build inputs for it — confirm it is saved elsewhere.
joblib.dump(svm_model, model_save_path)

['/content/drive/MyDrive/svmmodels/pbt_svm.pkl']

In [None]:
# Step 5: Evaluate the model's accuracy on validation data
# Predict on the validation TF-IDF matrix and compare against the true labels.
val_predictions = svm_model.predict(X_val_tfidf)
accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9411


# Genetic Algorithms

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
# Step 1: Load the datasets
# CSV locations of the three splits on the mounted Drive
train_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/train.csv'
val_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/val.csv'
test_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/test.csv'

# Read the splits in train / val / test order
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, val_data_path, test_data_path)
)

# Learn the label -> integer mapping from the training split only, then apply
# that same mapping to the 'label' column of every split.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['label'])
for split_frame in (train_data, val_data, test_data):
    split_frame['label'] = label_encoder.transform(split_frame['label'])

In [None]:
# Step 2: Feature extraction (TF-IDF)
# Targets: the 'label' column of each split
y_train, y_val, y_test = (
    split_frame['label'] for split_frame in (train_data, val_data, test_data)
)

# Fit the vocabulary (capped at 1000 terms) on the training titles only, then
# reuse the fitted vectorizer for the validation and test titles.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_data['title'])
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split_frame['title'])
    for split_frame in (val_data, test_data)
)

In [None]:
# Genetic Algorithm Settings
# In order: individuals per generation, number of generations to evolve, and
# the probability with which each hyperparameter of a child is mutated.
population_size, num_generations, mutation_rate = 10, 5, 0.1

In [None]:
# Genetic Algorithm Framework
# Build and score the initial population. Each individual is a tuple of
# (fitted SVC, hyperparameter dict, validation accuracy).

# Seed NumPy's global RNG so the sampled hyperparameters (and every later
# np.random draw made by the search) are reproducible on Restart & Run All.
np.random.seed(42)

population = []
for _ in range(population_size):
    # Randomly initialize SVM hyperparameters. NumPy scalars are cast to builtin
    # types so the stored dict prints as plain values regardless of NumPy version.
    C = float(np.random.uniform(0.1, 10.0))
    kernel = str(np.random.choice(['linear', 'rbf']))
    # The linear kernel ignores gamma, so 'auto' is only a placeholder there
    gamma = 'scale' if kernel == 'rbf' else 'auto'

    # Train SVM with current hyperparameters
    svm_model = SVC(kernel=kernel, C=C, gamma=gamma, random_state=42)
    svm_model.fit(X_train_tfidf, y_train)

    # Evaluate fitness on validation set
    val_predictions = svm_model.predict(X_val_tfidf)
    accuracy = accuracy_score(y_val, val_predictions)

    # Store hyperparameters and accuracy in population
    population.append((svm_model, {'kernel': kernel, 'C': C, 'gamma': gamma}, accuracy))

In [None]:
# Evolution loop
# Each generation: rank individuals by validation accuracy, report the leader,
# then breed a new population from the top half. The leader is carried over
# unchanged (elitism) so the best accuracy found cannot regress between
# generations; the remaining slots are filled with freshly trained children.
for generation in range(num_generations):
    # Sort population by fitness (accuracy), best first
    population.sort(key=lambda individual: individual[2], reverse=True)

    # Print the best model in the current generation
    best_model, best_hyperparams, best_accuracy = population[0]
    print(f"Generation {generation + 1}: Best Accuracy = {best_accuracy:.4f}, Hyperparameters = {best_hyperparams}")

    # Select top performers to produce offspring; keep at least one parent so
    # a population_size of 1 does not leave the parent pool empty.
    selected_parents = population[:max(1, population_size // 2)]

    # Elitism: the current best individual survives as-is (no retraining)
    offspring_population = [population[0]]

    while len(offspring_population) < population_size:
        _, params1, _ = selected_parents[np.random.randint(len(selected_parents))]
        _, params2, _ = selected_parents[np.random.randint(len(selected_parents))]

        # Uniform crossover over the searched hyperparameters. gamma is not
        # inherited: it is derived from the kernel below, so a child can never
        # end up with an rbf kernel paired with a linear parent's 'auto'.
        child_params = {}
        for param_key in ('kernel', 'C'):
            donor_params = params1 if np.random.rand() < 0.5 else params2
            child_params[param_key] = donor_params[param_key]

        # Mutation: each hyperparameter is independently resampled with
        # probability mutation_rate, from the same ranges used at initialization.
        if np.random.rand() < mutation_rate:
            child_params['kernel'] = str(np.random.choice(['linear', 'rbf']))
        if np.random.rand() < mutation_rate:
            child_params['C'] = float(np.random.uniform(0.1, 10.0))

        # Same kernel -> gamma rule as the initial population
        child_params['gamma'] = 'scale' if child_params['kernel'] == 'rbf' else 'auto'

        # Train SVM with the child's hyperparameters
        child_model = SVC(kernel=child_params['kernel'], C=child_params['C'], gamma=child_params['gamma'], random_state=42)
        child_model.fit(X_train_tfidf, y_train)

        # Evaluate fitness on validation set
        val_predictions = child_model.predict(X_val_tfidf)
        accuracy = accuracy_score(y_val, val_predictions)

        # Add child to offspring population
        offspring_population.append((child_model, child_params, accuracy))

    # Replace the old population with the elite plus its offspring
    population = offspring_population

Generation 1: Best Accuracy = 0.9541, Hyperparameters = {'kernel': 'rbf', 'C': 1.268173565515574, 'gamma': 'scale'}
Generation 2: Best Accuracy = 0.9541, Hyperparameters = {'kernel': 'rbf', 'C': 1.268173565515574, 'gamma': 'scale'}
Generation 3: Best Accuracy = 0.9541, Hyperparameters = {'kernel': 'rbf', 'C': 1.8570765612040026, 'gamma': 'scale'}
Generation 4: Best Accuracy = 0.9541, Hyperparameters = {'kernel': 'rbf', 'C': 1.268173565515574, 'gamma': 'scale'}
Generation 5: Best Accuracy = 0.9541, Hyperparameters = {'kernel': 'rbf', 'C': 1.268173565515574, 'gamma': 'scale'}


In [None]:
# Final best model evaluation on test set
# Rank the final population by validation accuracy (third tuple field) and
# take the leader.
population.sort(key=lambda individual: individual[2], reverse=True)
best_model, best_hyperparams, best_accuracy = population[0]
print(f"Best Model - Accuracy on Validation Set: {best_accuracy:.4f}, Hyperparameters: {best_hyperparams}")

# Score the selected model on the test split
test_predictions = best_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Accuracy on Test Set: {test_accuracy:.4f}")

Best Model - Accuracy on Validation Set: 0.9541, Hyperparameters: {'kernel': 'rbf', 'C': 1.8570765612040026, 'gamma': 'scale'}
Accuracy on Test Set: 0.9551


In [None]:
import joblib
import os

# Final best model evaluation on test set
# Rank the final population by validation accuracy and take the leader.
population.sort(key=lambda x: x[2], reverse=True)
best_model, best_hyperparams, best_accuracy = population[0]
print(f"Best Model - Accuracy on Validation Set: {best_accuracy:.4f}, Hyperparameters: {best_hyperparams}")

# Save the best model
model_save_path = '/content/drive/MyDrive/svmmodels/geneticsvm.pkl'
# Create the target folder first: joblib.dump does not create missing
# directories, so a Drive without `svmmodels/` would otherwise fail here.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
# NOTE(review): only the classifier is persisted in this cell; the fitted TF-IDF
# `vectorizer` is needed to build inputs for it — confirm it is saved elsewhere.
joblib.dump(best_model, model_save_path)
print(f"Saved the best SVM model to {model_save_path}")

# Evaluate on test set
test_predictions = best_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Accuracy on Test Set: {test_accuracy:.4f}")

Best Model - Accuracy on Validation Set: 0.9541, Hyperparameters: {'kernel': 'rbf', 'C': 1.8570765612040026, 'gamma': 'scale'}
Saved the best SVM model to /content/drive/MyDrive/svmmodels/geneticsvm.pkl
Accuracy on Test Set: 0.9551


# Hyperband (implemented here as Bayesian optimization with scikit-optimize's `BayesSearchCV`)

In [None]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.2


In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import joblib
from skopt import BayesSearchCV
from skopt.callbacks import VerboseCallback
from skopt.space import Real, Categorical, Integer

In [None]:
# CSV locations of the three splits on the mounted Drive
train_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/train.csv'
val_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/val.csv'
test_data_path = '/content/drive/MyDrive/Project11_FakeNewsDetection/test.csv'

# Read the splits in train / val / test order
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, val_data_path, test_data_path)
)


def _is_false_label(label_text):
    """Return 1 when the already lower-cased label text is 'false', else 0."""
    return 1 if label_text == 'false' else 0


# Binarize labels: cast to string first so the .str accessor works on any
# column dtype, lower-case, then map 'false' -> 1 and every other value -> 0.
# NOTE(review): the earlier sections encode labels with LabelEncoder instead;
# confirm whether the 0/1 polarity here is meant to match theirs.
for split_frame in (train_data, val_data, test_data):
    split_frame['label'] = split_frame['label'].astype(str).str.lower().map(_is_false_label)

# --- Check label distribution AFTER conversion ---
print("Train data labels after conversion:", train_data['label'].value_counts())
print("Validation data labels after conversion:", val_data['label'].value_counts())

# Titles are the model input and the binarized labels the target; the train and
# validation splits are kept as separate variables.
X_train, y_train = train_data['title'], train_data['label']
X_val, y_val = val_data['title'], val_data['label']

Train data labels after conversion: label
0    16436
1    14992
Name: count, dtype: int64
Validation data labels after conversion: label
0    3522
1    3213
Name: count, dtype: int64


In [None]:
# Vectorize the text data: the vocabulary (capped at 1000 terms) is fitted on
# the training titles and reused for the validation titles.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Hyperparameter search space for the SVC; it is handed to BayesSearchCV in
# the search cell further down.
param_space = dict(
    C=Real(1e-6, 1e+6, prior='log-uniform'),
    gamma=Real(1e-6, 1e+1, prior='log-uniform'),
    kernel=Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
    degree=Integer(1, 8),  # Only used for 'poly' kernel
)

In [None]:
# Sanity-check the training target: first the distinct label values, then how
# many rows carry each one.
for label_summary in (y_train.unique(), y_train.value_counts()):
    print(label_summary)

[0 1]
label
0    16436
1    14992
Name: count, dtype: int64


In [None]:
# Initialize the SVM model
svc = SVC()

# Stratified K-Fold for better class distribution in splits
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Number of hyperparameter settings the search evaluates. Shared by the search
# and its progress callback so the two cannot drift apart.
n_search_iter = 5

# NOTE: despite the `hyperband` variable name (kept because later cells read
# it), this is Bayesian optimization via scikit-optimize's BayesSearchCV with
# stratified 3-fold cross-validation, not the Hyperband algorithm.
hyperband = BayesSearchCV(svc, param_space, n_iter=n_search_iter, cv=skf, n_jobs=-1, random_state=42, scoring='accuracy')
# n_total tells VerboseCallback how many iterations to expect; it must match
# n_iter, otherwise the progress log announces iterations that never run.
hyperband.fit(X_train_tfidf, y_train, callback=VerboseCallback(n_total=n_search_iter))

# Print the best parameters found
print(f"Best parameters found: {hyperband.best_params_}")

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 438.4660
Function value obtained: -0.8546
Current minimum: -0.8546
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 53.1981
Function value obtained: -0.9369
Current minimum: -0.9369
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 189.6086
Function value obtained: -0.5230
Current minimum: -0.9369
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 267.8876
Function value obtained: -0.9356
Current minimum: -0.9369
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 114.2453
Function value

In [None]:
# Save the best model
import os

# Estimator refitted by the search with the best parameters it found
best_model = hyperband.best_estimator_
model_path = '/content/drive/MyDrive/svmmodels/hyperbandsvm.pkl'
# Create the target folder first: joblib.dump does not create missing
# directories, so a Drive without `svmmodels/` would otherwise fail here.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE(review): only the classifier is persisted in this cell; the fitted TF-IDF
# `vectorizer` is needed to build inputs for it — confirm it is saved elsewhere.
joblib.dump(best_model, model_path)

['/content/drive/MyDrive/svmmodels/hyperbandsvm.pkl']

In [None]:
# Preprocess the test data with the vectorizer fitted on the training titles
X_test, y_test = test_data['title'], test_data['label']
X_test_tfidf = vectorizer.transform(X_test)

# Predict once per split with the selected model
y_val_pred = best_model.predict(X_val_tfidf)
y_test_pred = best_model.predict(X_test_tfidf)

# Report per-class precision / recall / F1: validation first, then test
for report_heading, y_true, y_pred in (
    ("Validation Set Performance:", y_val, y_val_pred),
    ("Test Set Performance:", y_test, y_test_pred),
):
    print(report_heading)
    print(classification_report(y_true, y_pred))

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      3522
           1       0.94      0.94      0.94      3213

    accuracy                           0.94      6735
   macro avg       0.94      0.94      0.94      6735
weighted avg       0.94      0.94      0.94      6735

Test Set Performance:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      3402
           1       0.93      0.95      0.94      3194

    accuracy                           0.94      6596
   macro avg       0.94      0.94      0.94      6596
weighted avg       0.94      0.94      0.94      6596

