In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from scipy.sparse import hstack, csr_matrix
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import RandomOverSampler

# ===============================
# 1. Load dataset
# ===============================

df = pd.read_csv("fake.csv")

# ===============================
# 2. Preprocessing 
# ===============================

print(df.columns)

drop_idx = df[df["type"] == "bs"].sample(n=11000, random_state=42).index

# Drop them from the dataframe
df = df.drop(drop_idx)

for i in ['uuid', 'ord_in_thread', 'crawled', 'site_url','spam_score', 'main_img_url']:
    del df[i]
    
df['author'].fillna('unknown', inplace=True)

df["domain_rank"].fillna(1_000_000, inplace=True)

df.dropna(subset=['text','title','domain_rank'], inplace=True)


languages = [
    'english', 'ignore', 'german', 'french', 'spanish', 'russian', 'greek',
    'finnish', 'portuguese', 'arabic', 'dutch', 'italian', 'turkish',
    'norwegian', 'chinese', 'polish'
]

countries = [
    'US', 'N/A', 'DE', 'FR', 'ES', 'RU', 'GR',
    'FI', 'PT', 'DZ', 'NL', 'IT',
    'TR', 'NO', 'CN', 'PL'
]

language_to_country = dict(zip(languages, countries))

df["country"] = df["language"].map(language_to_country)



Index(['uuid', 'ord_in_thread', 'author', 'published', 'title', 'text',
       'language', 'crawled', 'site_url', 'country', 'domain_rank',
       'thread_title', 'spam_score', 'main_img_url', 'replies_count',
       'participants_count', 'likes', 'comments', 'shares', 'type'],
      dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['author'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["domain_rank"].fillna(1_000_000, inplace=True)


In [2]:
#turning ['bias' 'conspiracy' 'fake' 'bs' 'satire' 'hate' 'junksci' 'state'] into [0,1,2,3,4,5,6,7]
type_mapping = {t: i for i, t in enumerate(df["type"].unique())}
df["type"] = df["type"].map(type_mapping)

print(df["type"].unique())




[0 1 2 3 4 5 6 7]


In [None]:
#preprocsessing published
#converting the object into strings
print(df['published'].dtype)
df['published'] = df['published'].astype('string')
print(df['published'].dtype)

#using 
import datetime

for i in range(len(df["published"])):
    idk  =df["published"].iloc[i]
    time  = datetime.datetime.fromisoformat(idk)
    timestamp = time.timestamp()
    df["published"].iloc[i] = str(timestamp)


object
string


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["published"].iloc[i] = str(timestamp)


In [4]:
#converting the string of time stamps into floats 
print(df['published'].dtype)
df['published'] = df['published'].astype('float')
print(df['published'].dtype)  

string
float64


In [5]:
#tf idf to work on the string columns 

df["combined"] = (
    df["author"].astype(str) + " " +
    df["published"].astype(str) + " " +
    df["title"].astype(str) + " " +
    df["text"].astype(str) + " " +
    df["language"].astype(str) + " " +
    df["country"].astype(str) + " " +
    df["thread_title"].astype(str)
)


tfidf = TfidfVectorizer(
    max_features=10000,
    stop_words="english",       # remove common words like "the", "in", "and"
    ngram_range=(1,2),          # optional: use unigrams + bigrams for richer features
    lowercase=True              # ensures everything is lowercase
) 
tfidf_matrix = tfidf.fit_transform(df["combined"])

print(tfidf_matrix.shape)



(1818, 10000)


In [6]:
#to see the type we are dealing with

#converting every "object" column into a string


for i in ["author","title","text","language","country","thread_title","combined"]:
    df[i] = df[i].astype('string') 

for i in df.columns: 
   print(i,":",df[i].dtype)


author : string
published : float64
title : string
text : string
language : string
country : string
domain_rank : float64
thread_title : string
replies_count : int64
participants_count : int64
likes : int64
comments : int64
shares : int64
type : int64
combined : string


In [86]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd


# Prepare numeric features
numeric_features = df[[
    "domain_rank", "likes", "comments", "shares",
    "participants_count", "replies_count", "published"
]].values

# Ensure numeric_features is float for hstack
numeric_features = numeric_features.astype(float)

# Combine text + numeric
X = hstack([tfidf_matrix, numeric_features])
y = df["type"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", dict(pd.Series(y_train).value_counts()))
print("After SMOTE:", dict(pd.Series(y_train_resampled).value_counts()))

# Define XGBoost model (regularized to reduce overfitting)
model = XGBClassifier(
    n_estimators=500,
    max_depth=4,
    min_child_weight=5,
    gamma=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="mlogloss"
)

# Train on resampled data (no sample_weight here, SMOTE handled imbalance)
model.fit(X_train_resampled, y_train_resampled)

# Predict
y_pred = model.predict(X_test)

# Evaluation
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Before SMOTE: {3: np.int64(993), 1: np.int64(288), 0: np.int64(248), 5: np.int64(172), 7: np.int64(85), 6: np.int64(72), 4: np.int64(70), 2: np.int64(13)}
After SMOTE: {1: np.int64(993), 3: np.int64(993), 0: np.int64(993), 6: np.int64(993), 7: np.int64(993), 4: np.int64(993), 5: np.int64(993), 2: np.int64(993)}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


KeyboardInterrupt: 

In [10]:
from sklearn.utils import resample

dfs = []
for label, group in df.groupby("type"):
    if len(group) < 400:
        group_upsampled = resample(
            group,
            replace=True,
            n_samples=400,
            random_state=42
        )
        dfs.append(group_upsampled)
    else:
        dfs.append(group)
df = pd.concat(dfs, ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle


In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.sparse import coo_matrix
import numpy as np

##################################RANDOM FOREST################################################

# Prepare numeric features                    
numeric_features = df[[
    "domain_rank", "likes", "comments", "shares",
    "participants_count", "replies_count", "published"
]].values.astype(float)

# Combine text + numeric
X = hstack([tfidf_matrix, numeric_features])
y = df["type"]

# Ensure CSR format
if isinstance(X, coo_matrix):
    X = X.tocsr()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_accuracies = []
valid_accuracies = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Random Forest model renamed to model1
    model1 = RandomForestClassifier(
        n_estimators=1000,
        max_depth=10,
        min_samples_split=15,
        min_samples_leaf=2,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )
    model1.fit(X_train, y_train)

    # Training accuracy
    y_train_pred = model1.predict(X_train)
    train_acc = accuracy_score(y_train, y_train_pred)

    # Validation accuracy
    y_pred = model1.predict(X_test)
    valid_acc = accuracy_score(y_test, y_pred)

    print(f"Fold {fold}: train acc = {train_acc:.4f}, valid acc = {valid_acc:.4f}")

    train_accuracies.append(train_acc)
    valid_accuracies.append(valid_acc)

# Print means
print("\nMean train accuracy:", np.mean(train_accuracies))
print("Mean valid accuracy:", np.mean(valid_accuracies))

import joblib

# Save vectorizer + model
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(model1, "xgb_model.pkl")

import joblib
import datetime
import numpy as np
from scipy.sparse import hstack

# Load saved vectorizer and model
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
xgb_model = joblib.load("xgb_model.pkl")

#example1 : # Predicted type: "conspiracy"
# New record
new_record = {
    "author": "Jane Smith",
    "published": "2023-08-15T12:30:00",
    "title": "Hidden Truth Behind Moon Landing",
    "text": "Many believe the Apollo missions were staged in Hollywood studios. Evidence suggests photos contain lighting anomalies that could not exist on the Moon.",
    "language": "en",
    "country": "US",
    "domain_rank": 5321,
    "thread_title": "Moon landing conspiracy resurfaces",
    "replies_count": 56,
    "participants_count": 24,
    "likes": 180,
    "comments": 90,
    "shares": 45
}

# Convert published into timestamp
time_new = datetime.datetime.fromisoformat(new_record["published"])
timestamp_new = time_new.timestamp()

# Build combined text (same way as training)
combined_text = (
    str(new_record["author"]) + " " +
    str(timestamp_new) + " " +
    str(new_record["title"]) + " " +
    str(new_record["text"]) + " " +
    str(new_record["language"]) + " " +
    str(new_record["country"]) + " " +
    str(new_record["thread_title"])
)

# --- Transform with the trained vectorizer ---
new_tfidf = tfidf_vectorizer.transform([combined_text])

# --- Numeric features (same order as training) ---
new_numeric = np.array([[
    new_record["domain_rank"],
    new_record["likes"],
    new_record["comments"],
    new_record["shares"],
    new_record["participants_count"],
    new_record["replies_count"],
    timestamp_new
]], dtype=float)

# --- Combine TF-IDF + numeric ---
new_X = hstack([new_tfidf, new_numeric])

# --- Predict ---

list = ['bias', 'conspiracy', 'fake' ,'bs' ,'satire' ,'hate' ,'junksci' ,'state']

prediction = xgb_model.predict(new_X)
print("Predicted type:", list[prediction[0]])

#example2 : hate

new1_record = {
    "author": "Unknown",
    "published": "2021-07-22T20:00:00+02:00",
    "title": "Group X is destroying our country",
    "text": "People from Group X are ruining our traditions and must be stopped.",
    "language": "english",
    "country": "US",
    "domain_rank": 600000,
    "thread_title": "Group X Hate",
    "replies_count": 500,
    "participants_count": 200,
    "likes": 1000,
    "comments": 800,
    "shares": 1200
}

# Convert published into timestamp
time_new1 = datetime.datetime.fromisoformat(new1_record["published"])
timestamp_new1 = time_new1.timestamp()

# Build combined text (same way as training)
combined_text1 = (
    str(new1_record["author"]) + " " +
    str(timestamp_new1) + " " +
    str(new1_record["title"]) + " " +
    str(new1_record["text"]) + " " +
    str(new1_record["language"]) + " " +
    str(new1_record["country"]) + " " +
    str(new1_record["thread_title"])
)

# --- Transform with the trained vectorizer ---
new1_tfidf = tfidf_vectorizer.transform([combined_text1])

# --- Numeric features (same order as training) ---
new1_numeric = np.array([[
    new1_record["domain_rank"],
    new1_record["likes"],
    new1_record["comments"],
    new1_record["shares"],
    new1_record["participants_count"],
    new1_record["replies_count"],
    timestamp_new1
]], dtype=float)

# --- Combine TF-IDF + numeric ---
new1_X = hstack([new1_tfidf, new1_numeric])

# --- Predict ---
prediction1 = xgb_model.predict(new1_X)

list = ['bias', 'conspiracy', 'fake' ,'bs' ,'satire' ,'hate' ,'junksci' ,'state']

print("Predicted type:", list[prediction1[0]])


#example3 : satire

new2_record = {
    "author": "Satire News",
    "published": "2023-01-01T08:00:00+02:00",
    "title": "Aliens demand free Wi-Fi from world leaders",
    "text": "In a shocking event, aliens have refused to leave Earth until they get unlimited internet access.",
    "language": "english",
    "country": "CA",
    "domain_rank": 20000,
    "thread_title": "Aliens protest",
    "replies_count": 80,
    "participants_count": 40,
    "likes": 5000,
    "comments": 1200,
    "shares": 2000
}

# Convert published into timestamp
time_new2 = datetime.datetime.fromisoformat(new2_record["published"])
timestamp_new2 = time_new2.timestamp()

# Build combined text (same way as training)
combined_text1 = (
    str(new2_record["author"]) + " " +
    str(timestamp_new2) + " " +
    str(new2_record["title"]) + " " +
    str(new2_record["text"]) + " " +
    str(new2_record["language"]) + " " +
    str(new2_record["country"]) + " " +
    str(new2_record["thread_title"])
)

# --- Transform with the trained vectorizer ---
new2_tfidf = tfidf_vectorizer.transform([combined_text1])

# --- Numeric features (same order as training) ---
new2_numeric = np.array([[
    new2_record["domain_rank"],
    new2_record["likes"],
    new2_record["comments"],
    new2_record["shares"],
    new2_record["participants_count"],
    new2_record["replies_count"],
    timestamp_new2
]], dtype=float)

# --- Combine TF-IDF + numeric ---
new2_X = hstack([new2_tfidf, new2_numeric])

# --- Predict ---
prediction1 = xgb_model.predict(new2_X)

list = ['bias', 'conspiracy', 'fake' ,'bs' ,'satire' ,'hate' ,'junksci' ,'state']

print("Predicted type:", list[prediction1[0]])

#example4 : bias

new3_record = {
    "author": "Political Blogger",
    "published": "2022-05-10T09:00:00+02:00",
    "title": "Only Party Z can save the country",
    "text": "Party Z is the only choice for true patriots, the opposition wants to destroy everything we stand for.",
    "language": "english",
    "country": "US",
    "domain_rank": 300000,
    "thread_title": "Elections 2022",
    "replies_count": 400,
    "participants_count": 150,
    "likes": 2000,
    "comments": 900,
    "shares": 500
}

# Convert published into timestamp
time_new3 = datetime.datetime.fromisoformat(new_record["published"])
timestamp_new3 = time_new3.timestamp()

# Build combined text (same way as training)
combined_text1 = (
    str(new3_record["author"]) + " " +
    str(timestamp_new3) + " " +
    str(new3_record["title"]) + " " +
    str(new3_record["text"]) + " " +
    str(new3_record["language"]) + " " +
    str(new3_record["country"]) + " " +
    str(new3_record["thread_title"])
)

# --- Transform with the trained vectorizer ---
new3_tfidf = tfidf_vectorizer.transform([combined_text1])

# --- Numeric features (same order as training) ---
new3_numeric = np.array([[
    new3_record["domain_rank"],
    new3_record["likes"],
    new3_record["comments"],
    new3_record["shares"],
    new3_record["participants_count"],
    new3_record["replies_count"],
    timestamp_new3
]], dtype=float)

# --- Combine TF-IDF + numeric ---
new3_X = hstack([new3_tfidf, new3_numeric])

# --- Predict ---
prediction1 = xgb_model.predict(new3_X)

list = ['bias', 'conspiracy', 'fake' ,'bs' ,'satire' ,'hate' ,'junksci' ,'state']

print("Predicted type:", list[prediction1[0]])


#example5 : junksci

new4_record = {
    "author": "Health Guru",
    "published": "2021-09-20T14:00:00+02:00",
    "title": "Drink lemon water to cure all diseases",
    "text": "Doctors don’t want you to know this secret: lemon water cures cancer, diabetes, and heart disease.",
    "language": "english",
    "country": "AU",
    "domain_rank": 600000,
    "thread_title": "Natural cures",
    "replies_count": 220,
    "participants_count": 90,
    "likes": 800,
    "comments": 400,
    "shares": 350
}

# Convert published into timestamp
time_new4 = datetime.datetime.fromisoformat(new_record["published"])
timestamp_new4 = time_new4.timestamp()

# Build combined text (same way as training)
combined_text1 = (
    str(new4_record["author"]) + " " +
    str(timestamp_new4) + " " +
    str(new4_record["title"]) + " " +
    str(new4_record["text"]) + " " +
    str(new4_record["language"]) + " " +
    str(new4_record["country"]) + " " +
    str(new4_record["thread_title"])
)

# --- Transform with the trained vectorizer ---
new4_tfidf = tfidf_vectorizer.transform([combined_text1])

# --- Numeric features (same order as training) ---
new4_numeric = np.array([[
    new4_record["domain_rank"],
    new4_record["likes"],
    new4_record["comments"],
    new4_record["shares"],
    new4_record["participants_count"],
    new4_record["replies_count"],
    timestamp_new4
]], dtype=float)

# --- Combine TF-IDF + numeric ---
new4_X = hstack([new4_tfidf, new4_numeric])

# --- Predict ---
prediction1 = xgb_model.predict(new4_X)

list = ['bias', 'conspiracy', 'fake' ,'bs' ,'satire' ,'hate' ,'junksci' ,'state']

print("Predicted type:", list[prediction1[0]])


#bs
new_record_bs = {
    "author": "Anonymous",
    "published": "2021-09-15T14:30:00+02:00",
    "title": "10 Shocking Secrets About Water You Didn’t Know",
    "text": "Scientists are hiding the truth! Drinking tap water turns your brain into mush. Big corporations don’t want you to know the secret of pure water crystals.",
    "language": "english",
    "country": "US",
    "domain_rank": 500000,
    "thread_title": "Water Conspiracy?",
    "replies_count": 350,
    "participants_count": 120,
    "likes": 800,
    "comments": 600,
    "shares": 1000
}

# Convert published into timestamp
time_newbs = datetime.datetime.fromisoformat(new_record["published"])
timestamp_newbs = time_newbs.timestamp()

# Build combined text (same way as training)
combined_textbs = (
    str(new_record_bs["author"]) + " " +
    str(timestamp_newbs) + " " +
    str(new_record_bs["title"]) + " " +
    str(new_record_bs["text"]) + " " +
    str(new_record_bs["language"]) + " " +
    str(new_record_bs["country"]) + " " +
    str(new_record_bs["thread_title"])
)

# --- Transform with the trained vectorizer ---
newbs_tfidf = tfidf_vectorizer.transform([combined_textbs])

# --- Numeric features (same order as training) ---
newbs_numeric = np.array([[
    new_record_bs["domain_rank"],
    new_record_bs["likes"],
    new_record_bs["comments"],
    new_record_bs["shares"],
    new_record_bs["participants_count"],
    new_record_bs["replies_count"],
    timestamp_new4
]], dtype=float)

# --- Combine TF-IDF + numeric ---
newbs_X = hstack([newbs_tfidf, newbs_numeric])

# --- Predict ---
prediction1 = xgb_model.predict(newbs_X)

list = ['bias', 'conspiracy', 'fake' ,'bs' ,'satire' ,'hate' ,'junksci' ,'state']

print("Predicted type:", list[prediction1[0]])


ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 3276, expected 1818.

In [8]:
print(df["type"].value_counts())



type
3    464
1    412
0    354
5    246
7    121
6    102
4    100
2     19
Name: count, dtype: int64


In [None]:

print(df["type"].value_counts())

type
3    464
1    412
6    400
7    400
0    400
4    400
2    400
5    400
Name: count, dtype: int64
