In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r"C:\Users\khetr\OneDrive\Desktop\mlops\New folder\fake-news\raw\FakeNewsNet.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23196 entries, 0 to 23195
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          23196 non-null  object
 1   news_url       22866 non-null  object
 2   source_domain  22866 non-null  object
 3   tweet_num      23196 non-null  int64 
 4   real           23196 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 906.2+ KB


In [4]:
df.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [5]:
df=df.drop("news_url",axis=1)

In [6]:
df=df.dropna()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22866 entries, 0 to 23195
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          22866 non-null  object
 1   source_domain  22866 non-null  object
 2   tweet_num      22866 non-null  int64 
 3   real           22866 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 893.2+ KB


In [8]:
import re
import spacy
nlp = spacy.load("en_core_web_sm")

In [9]:
def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def preprocessing (text):
    """text cleaning"""
    text = text.lower()
    text = nlp(text)
    tokens = [token.lemma_ for token in text if not token.is_punct]
    text = " ".join(tokens)
    text = text.strip()
    return text

In [10]:
preprocessing("This is an example of spaCy preprocessing.")

'this be an example of spacy preprocessing'

In [11]:
df["title"] = df["title"].apply(removing_urls)

In [12]:
df["title"] = df["title"].apply(preprocessing)

In [13]:
df.head()

Unnamed: 0,title,source_domain,tweet_num,real
0,kandi burruss explode over rape accusation on ...,toofab.com,42,1
1,people 's choice award 2018 the good red carpe...,www.today.com,0,1
2,sophia bush send sweet birthday message to one...,www.etonline.com,63,1
3,colombian singer maluma spark rumour of inappr...,www.dailymail.co.uk,20,1
4,gossip girl 10 year later how upper east sider...,www.zerchoo.com,38,1


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,FunctionTransformer

In [15]:
text_cols = ["title","source_domain"]
num_cols = ["tweet_num"]

In [16]:
text_pipeline = Pipeline([
    ("combine", FunctionTransformer(
        lambda x: x.astype(str).agg(" ".join, axis=1),
        validate=False
    )),
    ("tfidf", TfidfVectorizer(max_features=10000))
])

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipeline, text_cols),
        ("num", StandardScaler(), num_cols)
    ]
)


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score


In [19]:
from sklearn.model_selection import train_test_split
x,xt = train_test_split(df,test_size=0.2,random_state=42)

In [20]:
x_train = x.drop("real",axis=1)
y_train = x["real"]

In [21]:
x_test = xt.drop("real",axis=1)
y_test = xt["real"]

In [29]:
# mlflow.set_tracking_uri(
#     "https://dagshub.com/suraj-5556/fake-news.mlflow"
# )
import dagshub
dagshub.init(repo_owner='suraj-5556', repo_name='fake-news', mlflow=True)

In [24]:
import mlflow
import mlflow.sklearn

In [30]:
mlflow.sklearn.autolog(
    log_models=True,
    silent=True
)

In [31]:
mlflow.set_experiment("Fake_News_Classification")

2025/12/24 17:50:10 INFO mlflow.tracking.fluent: Experiment with name 'Fake_News_Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/4d9053f8bbe847a99f0669e282d89990', creation_time=1766578814135, experiment_id='0', last_update_time=1766578814135, lifecycle_stage='active', name='Fake_News_Classification', tags={}>

In [38]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", LogisticRegression(
            max_iter=500,
            penalty="l1",
            class_weight="balanced",
            random_state=42, solver="saga"
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 18:05:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 5
Created version '5' of model 'LogisticRegressionPipeline'.


üèÉ View run gaudy-newt-375 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0/runs/dcfcb9e3ee7144d4aef39f729d8dfd9c
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0


In [39]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", LogisticRegression(
            max_iter=1000,
            penalty="l1",
            class_weight="balanced",
            random_state=42, solver="saga"
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 18:07:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 6
Created version '6' of model 'LogisticRegressionPipeline'.


üèÉ View run angry-skunk-382 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0/runs/d5560c222e954c5bb0a002efa74f5607
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0


In [40]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", LogisticRegression(
            max_iter=5000,
            penalty="l1",
            class_weight="balanced",
            random_state=42, solver="saga"
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 18:11:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 7
Created version '7' of model 'LogisticRegressionPipeline'.


üèÉ View run respected-penguin-46 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0/runs/f3da05c609ba4dbb8f174cec8a4e65c0
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0


In [41]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", LogisticRegression(
            max_iter=10000,
            penalty="l1",
            class_weight="balanced",
            random_state=42, solver="saga"
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 18:15:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 8
Created version '8' of model 'LogisticRegressionPipeline'.


üèÉ View run exultant-hawk-605 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0/runs/2da7124681c64f138eedbd0e511a83ee
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/0


In [42]:
from sklearn.svm import SVC

In [43]:
mlflow.set_experiment("svc")

2025/12/24 18:15:37 INFO mlflow.tracking.fluent: Experiment with name 'svc' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/e64a7409f17e41d0b47d8f259ca2862c', creation_time=1766580338761, experiment_id='1', last_update_time=1766580338761, lifecycle_stage='active', name='svc', tags={}>

In [49]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", SVC(
            C=0.1,
            max_iter=1000,
            kernel="linear",
            class_weight="balanced",
            random_state=42
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 18:23:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 13
Created version '13' of model 'LogisticRegressionPipeline'.


üèÉ View run big-vole-912 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/1/runs/82c21e0a7ff34f31a0fa081829943d15
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/1


In [50]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", SVC(
            C=0.1,
            max_iter=1000,
            kernel="rbf",
            class_weight="balanced",
            random_state=42
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 18:24:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 14
Created version '14' of model 'LogisticRegressionPipeline'.


üèÉ View run rogue-fawn-91 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/1/runs/1e6b3cbc6327437684b70b28ca59c53f
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/1


In [51]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
mlflow.set_experiment("RandomForestClassifier")

2025/12/24 18:24:38 INFO mlflow.tracking.fluent: Experiment with name 'RandomForestClassifier' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/18110b135a91492daffad6ba5b10b7c2', creation_time=1766580880080, experiment_id='2', last_update_time=1766580880080, lifecycle_stage='active', name='RandomForestClassifier', tags={}>

In [61]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=100,
            criterion="gini",
            max_depth=20,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            class_weight="balanced",
            random_state=42
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 19:39:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 23
Created version '23' of model 'LogisticRegressionPipeline'.


üèÉ View run unruly-flea-185 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2/runs/d4cbb270c0504d24b5ec764446025c93
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2


In [62]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=100,
            criterion="entropy",
            max_depth=20,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            class_weight="balanced",
            random_state=42
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 19:42:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 24
Created version '24' of model 'LogisticRegressionPipeline'.


üèÉ View run indecisive-panda-824 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2/runs/52bb4011e7524e5db15ff203c5f6c84d
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2


In [63]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=200,
            criterion="gini",
            max_depth=20,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            class_weight="balanced",
            random_state=42
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 19:46:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 25
Created version '25' of model 'LogisticRegressionPipeline'.


üèÉ View run ambitious-stag-585 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2/runs/c6e86e52d0634d10bb29101201aef615
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2


In [64]:
with mlflow.start_run():

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=200,
            criterion="entropy",
            max_depth=20,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            class_weight="balanced",
            random_state=42
        ))
    ])

    # Train
    pipeline.fit(x_train, y_train)

    # Predict
    y_pred = pipeline.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Log metrics (explicit for clarity)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log full pipeline manually (optional but recommended)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="LogisticRegressionPipeline"
    )


Registered model 'LogisticRegressionPipeline' already exists. Creating a new version of this model...
2025/12/24 19:52:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionPipeline, version 26
Created version '26' of model 'LogisticRegressionPipeline'.


üèÉ View run puzzled-bird-497 at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2/runs/78d6c59fc3c143e29283280411bb77b8
üß™ View experiment at: https://dagshub.com/suraj-5556/fake-news.mlflow/#/experiments/2
