In [1]:
import pandas as pd
import numpy as np
import warnings
import psutil
import logging
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Configure Logging
logging.basicConfig(level=logging.INFO)

# Check Available Memory
print(f"🛑 Available Memory: {psutil.virtual_memory().available / (1024**3):.2f} GB", flush=True)

# Ignore Warnings for Clean Output
warnings.filterwarnings("ignore")

# Test if Jupyter is running
print("✅ Jupyter is running!")


🛑 Available Memory: 2.11 GB
✅ Jupyter is running!


In [3]:
# Load only 5000 rows from each dataset to reduce memory usage
df_fake = pd.read_csv("../data/raw/Fake.csv", nrows=5000)
df_true = pd.read_csv("../data/raw/True.csv", nrows=5000)

# Assign labels (1 = Fake, 0 = True)
df_fake['label'] = 1
df_true['label'] = 0

# Combine both datasets
df = pd.concat([df_fake, df_true]).reset_index(drop=True)

# Shuffle dataset for better distribution
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check dataset size
print(f"📊 Dataset Loaded! Total samples: {len(df)}")
df.head()


📊 Dataset Loaded! Total samples: 10000


Unnamed: 0,title,text,subject,date,label
0,Russia says Trump's 'aggressive' stance on Ira...,MOSCOW (Reuters) - Russia said on Friday after...,politicsNews,"October 13, 2017",0
1,"Michelle Obama Posed for A Photo, So Conserva...","Recently, Michelle Obama posed for InStyle mag...",News,"September 11, 2016",1
2,"Chaffetz Says Screw Congress, Might Leave Bef...",House Oversight Committee Chairman Jason Chaff...,News,"April 20, 2017",1
3,Disabled Americans Fight Back After Being Str...,"Throughout America, tens of thousands of peopl...",News,"September 6, 2016",1
4,Trump Campaign Chair Says Racism Didn’t Exist...,President Obama is right it appears the peop...,News,"September 22, 2016",1


In [5]:
import re  # Ensure re is imported

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply cleaning to the text column
df['text'] = df['text'].apply(clean_text)

print("✅ Text Preprocessing Completed!")


✅ Text Preprocessing Completed!


In [None]:
# Reduce features to 5000 to prevent memory issues
vectorizer = TfidfVectorizer(max_features=5000)

# Convert text into numerical format
X = vectorizer.fit_transform(df['text'])
y = df['label']

print(f"✅ TF-IDF Transformation Completed! Shape: {X.shape}")


✅ TF-IDF Transformation Completed! Shape: (10000, 5000)


In [7]:
# Reduce features to 5000 to prevent memory issues
vectorizer = TfidfVectorizer(max_features=5000)

# Convert text into numerical format
X = vectorizer.fit_transform(df['text'])
y = df['label']

print(f"✅ TF-IDF Transformation Completed! Shape: {X.shape}")


✅ TF-IDF Transformation Completed! Shape: (10000, 5000)


In [8]:
# Initialize Stratified K-Fold with 5 splits
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define SVM Model
svm_model = SVC(kernel='linear')

# Perform Stratified K-Fold Cross Validation
scores = cross_val_score(svm_model, X, y, cv=skf, scoring='accuracy')

# Display results
print("\n🔹 Stratified K-Fold Accuracy Scores:", scores)
print("✅ Mean Accuracy:", np.mean(scores))
print("📉 Standard Deviation:", np.std(scores))



🔹 Stratified K-Fold Accuracy Scores: [0.9975 0.9965 0.9985 0.999  0.9955]
✅ Mean Accuracy: 0.9974000000000001
📉 Standard Deviation: 0.001280624847486557
