In [28]:
import mlflow
import pandas as pd
import os
import sys
import json
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
df = pd.read_csv("data.csv")

In [30]:
# data preprocessing

# Define text preprocessing functions
def lemmatization(text):
    """Lemmatize the text."""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

def remove_stop_words(text):
    """Remove stop words from the text."""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    """Remove numbers from the text."""
    text = ''.join([char for char in text if not char.isdigit()])
    return text

def lower_case(text):
    """Convert text to lower case."""
    text = text.split()
    text = [word.lower() for word in text]
    return " ".join(text)

def removing_punctuations(text):
    """Remove punctuations from the text."""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('؛', "")
    text = re.sub('\s+', ' ', text).strip()
    return text

def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def normalize_text(df):
    """Normalize the text data."""
    try:
        df['text'] = df['text'].apply(lower_case)
        df['text'] = df['text'].apply(remove_stop_words)
        df['text'] = df['text'].apply(removing_numbers)
        df['text'] = df['text'].apply(removing_punctuations)
        df['text'] = df['text'].apply(removing_urls)
        df['text'] = df['text'].apply(lemmatization)
        
        df['title'] = df['title'].apply(lower_case)
        df['title'] = df['title'].apply(remove_stop_words)
        df['title'] = df['title'].apply(removing_numbers)
        df['title'] = df['title'].apply(removing_punctuations)
        df['title'] = df['title'].apply(removing_urls)
        df['title'] = df['title'].apply(lemmatization)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [31]:
df.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [32]:
df.drop(columns=['date'], inplace=True)
df.dropna(inplace=True)

In [33]:
df.head()

Unnamed: 0,title,text,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,CNN,Austin Walker,Technology,fake


In [34]:
#Lower case
for col in df.columns:
  df[col] = df[col].str.lower()

In [35]:
df = normalize_text(df)
df.head()

Unnamed: 0,title,text,source,author,category,label
0,foreign democrat final,tax development store agreement lawyer hear ou...,ny times,paula george,politics,real
1,offer resource great point,probably guess western behind likely next inve...,fox news,joseph hill,politics,fake
2,church carry,identify forward present success risk several ...,cnn,julia robinson,business,fake
3,unit should,phone item yard republican safe police identif...,reuters,mr. david foster dds,science,fake
4,billion believe employee summer how,wonder fact difficult course forget exactly pa...,cnn,austin walker,technology,fake


In [36]:
df['label'] = df['label'].map({'fake': 0, 'real': 1})
df.head()

Unnamed: 0,title,text,source,author,category,label
0,foreign democrat final,tax development store agreement lawyer hear ou...,ny times,paula george,politics,1
1,offer resource great point,probably guess western behind likely next inve...,fox news,joseph hill,politics,0
2,church carry,identify forward present success risk several ...,cnn,julia robinson,business,0
3,unit should,phone item yard republican safe police identif...,reuters,mr. david foster dds,science,0
4,billion believe employee summer how,wonder fact difficult course forget exactly pa...,cnn,austin walker,technology,0


In [44]:
df.isnull().sum()
df.to_csv('data_cleaned.csv', index=False)
df.head()

Unnamed: 0,title,text,source,author,category,label
0,foreign democrat final,tax development store agreement lawyer hear ou...,ny times,paula george,politics,1
1,offer resource great point,probably guess western behind likely next inve...,fox news,joseph hill,politics,0
2,church carry,identify forward present success risk several ...,cnn,julia robinson,business,0
3,unit should,phone item yard republican safe police identif...,reuters,mr. david foster dds,science,0
4,billion believe employee summer how,wonder fact difficult course forget exactly pa...,cnn,austin walker,technology,0


In [39]:
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df[['title','text','source','author','category']].apply(lambda x: ' '.join(x), axis=1)).toarray()
y = df['label']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [41]:
import dagshub

mlflow.set_tracking_uri('https://dagshub.com/udaygupta8899/Fake-News-Detection.mlflow/')
dagshub.init(repo_owner='udaygupta8899', repo_name='Fake-News-Detection', mlflow=True)
mlflow.set_experiment("Random Forest Classifier Baseline")

2025/05/16 20:35:21 INFO mlflow.tracking.fluent: Experiment with name 'Random Forest Classifier Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/59d9ec40a9664e2693db6a8d80867f32', creation_time=1747407921181, experiment_id='0', last_update_time=1747407921181, lifecycle_stage='active', name='Random Forest Classifier Baseline', tags={}>

In [42]:
import time
import optuna
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

logging.info("Starting MLflow run...")

def objective(trial):
    """Defines the objective function for Optuna to optimize."""
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
        random_state=42,
        n_jobs=-1  # Use all available cores
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15) # You can adjust the number of trials

print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Train a model with the best hyperparameters found by Optuna
best_params = study.best_trial.params
best_model = RandomForestClassifier(random_state=42, n_jobs=-1, **best_params)
best_model.fit(X_train, y_train)

# Evaluate the best model
y_pred_best = best_model.predict(X_test)
print("\nEvaluation of the best model:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_best))

with mlflow.start_run():
    start_time = time.time()
    
    try:
        # Log the model
        mlflow.sklearn.log_model(best_model, "model")
        
        # Log parameters and metrics
        mlflow.log_params(best_params)
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_best))
        
        # Log the vectorizer
        mlflow.log_artifact("vectorizer.pkl")
        
        # Log the run duration
        end_time = time.time()
        mlflow.log_metric("run_duration", end_time - start_time)
        mlflow.log_param("n_trials", 15)    # Number of trials for Optuna
        mlflow.log_param("n_estimators", best_params['n_estimators'])
        mlflow.log_param("max_depth", best_params['max_depth'])
        mlflow.log_param("min_samples_split", best_params['min_samples_split'])
        mlflow.log_param("min_samples_leaf", best_params['min_samples_leaf'])
        mlflow.log_param("criterion", best_params['criterion'])
        mlflow.log_param("model_type", "RandomForestClassifier")
        mlflow.log_param("dataset", "Fake News Detection")
        mlflow.log_param("vectorizer_type", "CountVectorizer")
        mlflow.log_param("vectorizer_max_features", 5000)
        mlflow.log_param("train_size", len(X_train))
        mlflow.log_param("test_size", len(X_test))
        mlflow.log_param("random_state", 42)
        mlflow.log_param("n_jobs", -1)  # Use all available cores
    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)



  from .autonotebook import tqdm as notebook_tqdm
2025-05-16 20:47:25,232 - INFO - Starting MLflow run...
[I 2025-05-16 20:47:25,239] A new study created in memory with name: no-name-aa2de83d-f236-4a48-bbfc-40e77bc07776
[I 2025-05-16 20:47:27,507] Trial 0 finished with value: 0.5093085106382979 and parameters: {'n_estimators': 92, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 5, 'criterion': 'entropy'}. Best is trial 0 with value: 0.5093085106382979.
[I 2025-05-16 20:47:38,637] Trial 1 finished with value: 0.4973404255319149 and parameters: {'n_estimators': 158, 'max_depth': 24, 'min_samples_split': 8, 'min_samples_leaf': 11, 'criterion': 'entropy'}. Best is trial 0 with value: 0.5093085106382979.
[I 2025-05-16 20:47:41,362] Trial 2 finished with value: 0.5086436170212766 and parameters: {'n_estimators': 199, 'max_depth': 3, 'min_samples_split': 10, 'min_samples_leaf': 13, 'criterion': 'entropy'}. Best is trial 0 with value: 0.5093085106382979.
[I 2025-05-16 20:47:45,155]

Best trial:
  Value: 0.5172872340425532
  Params: 
    n_estimators: 54
    max_depth: 10
    min_samples_split: 15
    min_samples_leaf: 20
    criterion: gini

Evaluation of the best model:
Accuracy: 0.5172872340425532
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.58      0.55      2294
           1       0.51      0.46      0.48      2218

    accuracy                           0.52      4512
   macro avg       0.52      0.52      0.51      4512
weighted avg       0.52      0.52      0.52      4512



2025-05-16 20:51:10,699 - ERROR - An error occurred: [Errno 2] No such file or directory: 'vectorizer.pkl'
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Local\Temp\ipykernel_2856\3777927607.py", line 70, in <module>
    mlflow.log_artifact("vectorizer.pkl")
  File "d:\Anaconda\envs\news\lib\site-packages\mlflow\tracking\fluent.py", line 1179, in log_artifact
    MlflowClient().log_artifact(run_id, local_path, artifact_path)
  File "d:\Anaconda\envs\news\lib\site-packages\mlflow\tracking\client.py", line 2379, in log_artifact
    self._tracking_client.log_artifact(run_id, local_path, artifact_path)
  File "d:\Anaconda\envs\news\lib\site-packages\mlflow\tracking\_tracking_service\client.py", line 931, in log_artifact
    artifact_repo.log_artifact(local_path, artifact_path)
  File "d:\Anaconda\envs\news\lib\site-packages\mlflow\store\artifact\http_artifact_repo.py", line 62, in log_artifact
    with open(local_file, "rb") as f:
FileNotFoundError: [Errno 2] No such file

🏃 View run luminous-calf-211 at: https://dagshub.com/udaygupta8899/Fake-News-Detection.mlflow/#/experiments/0/runs/ef79df73b4144998b457945d522c0580
🧪 View experiment at: https://dagshub.com/udaygupta8899/Fake-News-Detection.mlflow/#/experiments/0
