In [1]:
!pip install xgboost





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [8]:
import logging
import os
import re
import warnings

import numpy as np
import pandas as pd
import psutil
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline

# Notebook-wide runtime setup: logging at INFO (force=True replaces any
# handlers that were already attached to the root logger) and a blanket
# filter that silences every Python warning for the rest of the session.
logging.basicConfig(force=True, level=logging.INFO)
warnings.filterwarnings("ignore")

# Report how much RAM is currently free, converted from bytes to GiB.
available_gb = psutil.virtual_memory().available / (1024**3)
print(f"🛑 Available Memory: {available_gb:.2f} GB", flush=True)

# Sanity message showing that the cell ran to completion.
print("✅ Jupyter is Running Properly!", flush=True)


🛑 Available Memory: 1.10 GB
✅ Jupyter is Running Properly!


In [9]:
# Dataset locations and sample size, declared once so the existence check
# and the actual load can never drift apart.
FAKE_CSV_PATH = "../data/raw/Fake.csv"
TRUE_CSV_PATH = "../data/raw/True.csv"
N_ROWS_PER_CLASS = 50  # rows read from each file to limit memory; None reads everything

# Verify that both dataset files exist and report the result.
print("🔍 Checking Dataset Paths...")
fake_exists = os.path.exists(FAKE_CSV_PATH)
true_exists = os.path.exists(TRUE_CSV_PATH)
print("Fake.csv Exists:", fake_exists)
print("True.csv Exists:", true_exists)

# Fail fast with a message naming every missing file instead of letting
# read_csv stop at the first one.
missing_paths = [
    path
    for path, exists in ((FAKE_CSV_PATH, fake_exists), (TRUE_CSV_PATH, true_exists))
    if not exists
]
if missing_paths:
    raise FileNotFoundError(f"Missing dataset file(s): {', '.join(missing_paths)}")

# Load the (row-limited) datasets.
df_fake = pd.read_csv(FAKE_CSV_PATH, nrows=N_ROWS_PER_CLASS)
df_true = pd.read_csv(TRUE_CSV_PATH, nrows=N_ROWS_PER_CLASS)

print("✅ Datasets Loaded Successfully!")


🔍 Checking Dataset Paths...
Fake.csv Exists: True
True.csv Exists: True
✅ Datasets Loaded Successfully!


In [10]:
# Tag each source frame with its class id: 1 marks fake news, 0 marks true news.
for frame, class_id in ((df_fake, 1), (df_true, 0)):
    frame["label"] = class_id

# Stack both frames, shuffle the rows with a fixed seed, and renumber the index.
stacked = pd.concat([df_fake, df_true])
df = stacked.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview the first rows of the combined frame.
print(df.head())
print("✅ Dataset Prepared!")


                                               title  \
0  Callista Gingrich becomes Trump's envoy to pop...   
1  FBI Russia probe helped by Australian diplomat...   
2  Federal judge partially lifts Trump's latest r...   
3   White House Panics Knowing Flynn Is Going To ...   
4   Watch This Awesome Mashup of Michael Flynn Le...   

                                                text       subject  \
0  VATICAN CITY (Reuters) - Callista Gingrich, wi...  politicsNews   
1  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
2  WASHINGTON (Reuters) - A federal judge in Seat...  politicsNews   
3  While Donald Trump has been taking vacations, ...          News   
4  Donald Trump s disgraced National Security Adv...          News   

                 date  label  
0  December 22, 2017       0  
1  December 30, 2017       0  
2  December 24, 2017       0  
3    December 1, 2017      1  
4    December 1, 2017      1  
✅ Dataset Prepared!


In [11]:
def clean_text(text):
    """Normalize a raw article body for bag-of-words vectorization.

    The text is lower-cased, every run of non-word characters (anything
    other than letters, digits and underscore, whitespace included) is
    collapsed into a single space, and leading/trailing spaces are removed.

    Args:
        text: The value to clean. Normally a ``str``; ``None`` and float NaN
            (how pandas represents an empty CSV cell) are treated as missing,
            and any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # One pass is enough: whitespace is itself matched by \W, so replacing
    # each run of \W with a single space both strips special characters and
    # collapses repeated spaces.
    return re.sub(r'\W+', ' ', text.lower()).strip()

# Run every article body through clean_text and write the result back into
# the same 'text' column (the raw text is overwritten).
cleaned_text = df['text'].map(clean_text)
df['text'] = cleaned_text

print("✅ Text Preprocessing Completed!")


✅ Text Preprocessing Completed!


In [12]:
# Build the TF-IDF vectorizer, capped at the 500 highest-frequency terms.
vectorizer = TfidfVectorizer(max_features=500)

# Learn the vocabulary/IDF weights from the cleaned text and vectorize it,
# then densify the sparse result into a plain NumPy array.
# NOTE(review): the vectorizer is fit on every row before any train/test
# split happens, so held-out documents contribute to the vocabulary and IDF
# weights — confirm whether that leakage is acceptable for this experiment.
tfidf_sparse = vectorizer.fit_transform(df["text"])
X = tfidf_sparse.toarray()

# Target vector: the 0/1 label column.
y = df["label"]

print(f"✅ TF-IDF Transformation Completed! Shape: {X.shape}")


✅ TF-IDF Transformation Completed! Shape: (100, 500)


In [13]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("✅ Data Split Completed!")


✅ Data Split Completed!


In [14]:
# Initialize the XGBoost classifier with log-loss as its evaluation metric.
# The former `use_label_encoder=False` argument is dropped: current XGBoost
# releases no longer use it and only emit an "unused parameter" warning.
# An explicit seed keeps the model reproducible across runs.
xgb_model = xgb.XGBClassifier(eval_metric="logloss", random_state=42)

# Fit on the training partition only.
xgb_model.fit(X_train, y_train)

print("✅ XGBoost Model Trained!")


✅ XGBoost Model Trained!


In [15]:
# Predict class labels for the held-out test partition.
y_pred = xgb_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"📊 Model Accuracy: {accuracy:.4f}")


📊 Model Accuracy: 1.0000


In [16]:
# Initialize Stratified K-Fold with 5 shuffled, seeded splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the whole text -> TF-IDF -> XGBoost pipeline on the cleaned
# text rather than on the pre-computed matrix X. X was vectorized using every
# row, so scoring on it would let each fold's held-out documents influence
# the vocabulary and IDF weights. Inside a pipeline the vectorizer is refit
# on the training part of every fold.
# cross_val_score clones the estimator for each fold, so the already-fitted
# `vectorizer` and `xgb_model` objects are left untouched.
# Note: the classifier receives the vectorizer's sparse output here, not a
# densified array.
cv_pipeline = make_pipeline(vectorizer, xgb_model)
scores = cross_val_score(cv_pipeline, df["text"], y, cv=skf, scoring='accuracy')

# Display per-fold scores with their mean and spread.
print("\n🔹 Stratified K-Fold Accuracy Scores:", scores)
print("✅ Mean Accuracy:", np.mean(scores))
print("📉 Standard Deviation:", np.std(scores))



🔹 Stratified K-Fold Accuracy Scores: [0.95 1.   1.   1.   1.  ]
✅ Mean Accuracy: 0.99
📉 Standard Deviation: 0.020000000000000018
