In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ---------------------------------------------------------------------------
# Baseline: TF-IDF features + logistic regression on the news-articles CSV.
# Point the path below at your own copy of the dataset if it lives elsewhere.
# ---------------------------------------------------------------------------

# Read the CSV, keep only the two columns this task needs, and discard any row
# where either of them is missing.
data = pd.read_csv('news_articles.csv').loc[:, ['text', 'label']].dropna()

# Features are the raw article text; the target is the label column.
X, y = data['text'], data['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The vectorizer is fitted on the training split only and then re-used,
# unchanged, to transform the held-out split.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# `fit` returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Score the held-out split.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

# Persist both fitted objects: the classifier can only score text that has been
# transformed by this exact vectorizer.
for _artifact, _path in (
    (model, 'fake_news_detection_model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(_artifact, _path)

print("Model and vectorizer saved as 'fake_news_detection_model.pkl' and 'tfidf_vectorizer.pkl'.")


Accuracy: 0.6926829268292682
Classification Report:
              precision    recall  f1-score   support

        Fake       0.67      0.96      0.79       249
        Real       0.83      0.27      0.41       161

    accuracy                           0.69       410
   macro avg       0.75      0.62      0.60       410
weighted avg       0.73      0.69      0.64       410

Model and vectorizer saved as 'fake_news_detection_model.pkl' and 'tfidf_vectorizer.pkl'.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import optuna
from optuna.samplers import TPESampler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Read the articles, restrict to the 'text' and 'label' columns, and drop rows
# where either value is missing.
data = pd.read_csv('news_articles.csv').loc[:, ['text', 'label']].dropna()

# Raw text is the model input; the label column is the target.
X, y = data['text'], data['label']

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define the model optimization process using Optuna
def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of a TF-IDF + logistic-regression pipeline.

    Two hyperparameters are sampled from ``trial``:

    * ``"C"``      - passed to ``LogisticRegression(C=...)``; log-uniform in [1e-5, 10.0].
    * ``"max_df"`` - passed to ``TfidfVectorizer(max_df=...)``; uniform in [0.5, 1.0].

    The pipeline is scored with ``cross_val_score`` on the module-level
    ``X_train`` / ``y_train``, so the held-out test split is never touched
    during the search.

    Returns:
        The mean of the per-fold accuracy scores (the study maximizes this).
    """
    # NOTE(review): the logged best run landed at C ~= 9.3 and max_df ~= 0.51,
    # i.e. at the edges of both ranges - consider widening them. Left as-is here.
    # Keep this sampling order (C first, then max_df): the seeded sampler's
    # suggestions depend on the order in which parameters are requested.
    inverse_reg = trial.suggest_float("C", 1e-5, 10.0, log=True)
    doc_freq_cap = trial.suggest_float("max_df", 0.5, 1.0)

    # Vectorizer and classifier are bundled in one Pipeline so that, inside
    # cross-validation, TF-IDF is re-fitted on each training fold only.
    vectorizer = TfidfVectorizer(stop_words='english', max_df=doc_freq_cap)
    classifier = LogisticRegression(C=inverse_reg, max_iter=1000, random_state=42)
    candidate = Pipeline(steps=[('tfidf', vectorizer), ('log_reg', classifier)])

    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="accuracy",
        cv=3,
        n_jobs=-1,
    )
    return fold_scores.mean()

# Run a seeded TPE search (30 trials) that maximizes the cross-validated
# accuracy returned by `objective`.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=30)

# Winning hyperparameter set, keyed by the names used in `objective`.
best_params = study.best_params
print("Best hyperparameters:", best_params)

# Refit on the full training split using the tuned values. The vectorizer is
# fitted on training text only and re-used as-is for the test text.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=best_params['max_df'],
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# `fit` returns the estimator, so the tuned classifier is built and trained in
# a single expression.
model = LogisticRegression(
    C=best_params['C'],
    max_iter=1000,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Evaluate on the held-out split, which played no part in the search.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

# Persist the tuned classifier together with the vectorizer it was trained on.
for _artifact, _path in (
    (model, 'optimized_fake_news_detection_model.pkl'),
    (tfidf_vectorizer, 'optimized_tfidf_vectorizer.pkl'),
):
    joblib.dump(_artifact, _path)

# Testing function to predict new examples
def test_model(examples):
    examples_tfidf = tfidf_vectorizer.transform(examples)
    predictions = model.predict(examples_tfidf)
    return predictions

# Spot-check: run the first five held-out articles through `test_model`.
# `.iloc` makes it explicit that this is a positional slice; the index labels
# are the shuffled original row numbers.
test_examples = X_test.iloc[:5]
predicted_labels = test_model(test_examples)

# Line the predictions up against the true labels. The frame keeps the index of
# `test_examples`; the other two columns are plain arrays aligned by position.
actual_labels = y_test.iloc[:5].to_numpy()
test_results = pd.DataFrame(
    {
        'Text': test_examples,
        'Predicted Label': predicted_labels,
        'Actual Label': actual_labels,
    }
)

print(test_results)

# Write the comparison table to disk, without the index column.
test_results.to_csv('test_results.csv', index=False)

print("Model and vectorizer saved as 'optimized_fake_news_detection_model.pkl' and 'optimized_tfidf_vectorizer.pkl'.")


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-10-13 11:00:54,117] A new study created in memory with name: no-name-96dc690d-6a60-4d78-9d4e-b2453758b43a
[I 2024-10-13 11:00:55,783] Trial 0 finished with value: 0.6359753389003846 and parameters: {'C': 0.0017670169402947942, 'max_df': 0.9753571532049581}. Best is trial 0 with value: 0.6359753389003846.
[I 2024-10-13 11:00:56,357] Trial 1 finished with value: 0.644512302647586 and parameters: {'C': 0.24658329458549083, 'max_df': 0.7993292420985183}. Best is trial 1 with value: 0.644512302647586.
[I 2024-10-13 11:00:56,926] Trial 2 finished with value: 0.6359753389003846 and parameters: {'C': 8.632008168602535e-05, 'max_df': 0.5779972601681014}. Best is trial 1 with value: 0.644512302647586.
[I 2024-10-13 11:00:57,491] Trial 3 finished with value: 0.6359753389003846 and parameters: {'C': 2.231010801867923e-05, 'max_df': 0.9330880728874675}. Best is trial 1 with value: 0.644512302647586.
[I 2024-10-13 11:00:58,066] Trial 4 finish

Best hyperparameters: {'C': 9.297431007889415, 'max_df': 0.5097754659842255}
Accuracy: 0.7634146341463415
Classification Report:
              precision    recall  f1-score   support

        Fake       0.76      0.89      0.82       249
        Real       0.77      0.57      0.65       161

    accuracy                           0.76       410
   macro avg       0.77      0.73      0.74       410
weighted avg       0.76      0.76      0.75       410

                                                   Text Predicted Label  \
1808  home  be the change  government corruption  pr...            Fake   
694   podcast play in new window  download  embed \n...            Fake   
906   new leaked clinton emails came from the device...            Fake   
544   email \n\nhillary supporter robert dougherty f...            Fake   
1847  id love to see clinton spend all her money and...            Fake   

     Actual Label  
1808         Real  
694          Fake  
906          Fake  
544          