In [1]:
!pip install gdown



In [2]:
# Download (once) and load the cleaned dataset.
# Source file on Google Drive:
# https://drive.google.com/file/d/1Sdg9CJKbpmccciLWTvnFv3FCKCrywyls/view?usp=sharing
import os

import gdown
import pandas as pd

# Direct-download form of the Google Drive link above
gdrive_link = "https://drive.google.com/uc?export=download&id=1Sdg9CJKbpmccciLWTvnFv3FCKCrywyls"

# Local path where the dataset is stored
local_path = "../data/processed/cleaned_data.csv"

# Make sure the target directory exists before anything is written to it
os.makedirs(os.path.dirname(local_path), exist_ok=True)

# Only fetch the file when it is not cached locally, so re-running the notebook
# does not download it again. Delete the file to force a fresh download.
if not os.path.exists(local_path):
    print("Downloading the dataset for EDA...")
    downloaded_path = gdown.download(gdrive_link, local_path, quiet=False)
    # Fail here with a clear message rather than later inside read_csv
    if downloaded_path is None or not os.path.exists(local_path):
        raise RuntimeError(f"Could not download the dataset from {gdrive_link}")

# Load the dataset
df = pd.read_csv(local_path)
print("Dataset loaded successfully.")

df.head()


Downloading the dataset for EDA...


Downloading...
From: https://drive.google.com/uc?export=download&id=1Sdg9CJKbpmccciLWTvnFv3FCKCrywyls
To: e:\fake-news-detection\data\processed\cleaned_data.csv
100%|██████████| 84.1M/84.1M [00:01<00:00, 53.1MB/s]


Dataset loaded successfully.


Unnamed: 0,title,text,label,title_length,text_length,sentiment,readability,label_encoded
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,FAKE,79,2893,0.082132,71.14,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,FAKE,69,1898,-0.005004,62.78,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",FAKE,90,3597,-0.012345,63.59,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",FAKE,78,2774,-0.023118,62.07,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,FAKE,70,2346,-0.011722,67.08,0


# Basic Cleaning

In [3]:
# Lowercase both text columns so the same word always has the same spelling
for column in ('title', 'text'):
    df[column] = df[column].str.lower()

In [4]:
# Remove punctuation and special characters: every character that is neither a
# word character nor whitespace is deleted from both text columns.
import re

PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')

# The vectorised .str accessor leaves missing values untouched, whereas calling
# re.sub on each row raises a TypeError as soon as a value is not a string.
for column in ('title', 'text'):
    df[column] = df[column].str.replace(PUNCTUATION_PATTERN, '', regex=True)


# Tokenization

In [4]:
import nltk

# NLTK resources fetched by this cell, in download order
NLTK_PACKAGES = (
    'punkt',      # core tokenizer
    'punkt_tab',
    'stopwords',  # for stopword removal
    'wordnet',    # for lemmatization
    'omw-1.4',    # for the WordNet lemmatizer
)

# Download each resource and remember what nltk.download returned for it
download_status = {package: nltk.download(package) for package in NLTK_PACKAGES}

# Cell output: the return value of the last download
download_status['omw-1.4']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
!pip install nltk



In [6]:
from nltk.tokenize import word_tokenize

def safe_word_tokenize(text):
    """Tokenize ``text`` with ``word_tokenize``, tolerating non-string input.

    Returns the token list for a string, and an empty list for any value that
    is not a ``str`` instance.
    """
    return word_tokenize(text) if isinstance(text, str) else []

# Tokenize each text column into a matching "<column>_tokens" column
for source_column in ('title', 'text'):
    df[f'{source_column}_tokens'] = df[source_column].apply(safe_word_tokenize)

# Preview the first rows of the tokenized columns
token_columns = ['title_tokens', 'text_tokens']
print(df[token_columns].head())


                                        title_tokens  \
0  [donald, trump, sends, out, embarrassing, new,...   
1  [drunk, bragging, trump, staffer, started, rus...   
2  [sheriff, david, clarke, becomes, an, internet...   
3  [trump, is, so, obsessed, he, even, has, obama...   
4  [pope, francis, just, called, out, donald, tru...   

                                         text_tokens  
0  [donald, trump, just, couldn, t, wish, all, am...  
1  [house, intelligence, committee, chairman, dev...  
2  [on, friday, it, was, revealed, that, former, ...  
3  [on, christmas, day, donald, trump, announced,...  
4  [pope, francis, used, his, annual, christmas, ...  


# Remove Stopwords

In [7]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# English stop-word list as a set, for fast membership tests
stop_words = set(stopwords.words('english'))


def _drop_stop_words(sentence):
    """Return ``sentence`` without the whitespace-separated words found in ``stop_words``."""
    kept_words = (word for word in sentence.split() if word not in stop_words)
    return ' '.join(kept_words)


# Filter stop words out of both text columns
for column in ('title', 'text'):
    df[column] = df[column].apply(_drop_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Lemmatization

In [8]:
from functools import lru_cache

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# The same words recur across the articles, so memoise the per-word result
# instead of asking the lemmatizer again for every occurrence.
cached_lemmatize = lru_cache(maxsize=None)(lemmatizer.lemmatize)


def lemmatize_text(text):
    """Return ``text`` with each whitespace-separated word replaced by its lemma.

    Words are re-joined with single spaces.
    """
    return ' '.join(cached_lemmatize(word) for word in text.split())


df['title'] = df['title'].apply(lemmatize_text)
df['text'] = df['text'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\joshv\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
# # Save the dataset with preprocessed text
# df.to_csv('text_preprocessed.csv', index=False)
# print("Preprocessed text saved successfully!")



In [10]:
# df.info() prints its summary itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31794 entries, 0 to 31793
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          31794 non-null  object 
 1   text           31794 non-null  object 
 2   label          31794 non-null  object 
 3   title_length   31794 non-null  int64  
 4   text_length    31794 non-null  int64  
 5   sentiment      31794 non-null  float64
 6   readability    31794 non-null  float64
 7   label_encoded  31794 non-null  int64  
 8   title_tokens   31794 non-null  object 
 9   text_tokens    31794 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 2.4+ MB
None


# Feature Engineering

In [11]:
# Scale numeric features to ensure they have similar ranges
from sklearn.preprocessing import StandardScaler

# Columns holding the numeric features that get rescaled
numeric_features = ['title_length', 'text_length', 'sentiment', 'readability']

# Fit the scaler on those columns and write the scaled values back into the frame
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_features])
df[numeric_features] = scaled_values

# Preview the scaled columns
print(df[numeric_features].head())


   title_length  text_length  sentiment  readability
0      0.516916     0.226032   0.301809     1.022791
1     -0.130742    -0.369363  -0.660008     0.497542
2      1.229340     0.647296  -0.741033     0.548434
3      0.452151     0.154824  -0.859950     0.452934
4     -0.065976    -0.101286  -0.734159     0.767706


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Both vectorizers are built with the same settings
tfidf_settings = {'max_features': 5000, 'stop_words': 'english'}

# One independent vectorizer per text column
title_vectorizer = TfidfVectorizer(**tfidf_settings)
text_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit each vectorizer on its own column and keep the resulting matrices
title_vectors = title_vectorizer.fit_transform(df['title'])
text_vectors = text_vectorizer.fit_transform(df['text'])

print("Title TF-IDF Shape:", title_vectors.shape)
print("Text TF-IDF Shape:", text_vectors.shape)


Title TF-IDF Shape: (31794, 5000)
Text TF-IDF Shape: (31794, 5000)


In [13]:
import numpy as np

# Feature names reported by each fitted TF-IDF vectorizer
title_feature_names = title_vectorizer.get_feature_names_out().tolist()
text_feature_names = text_vectorizer.get_feature_names_out().tolist()

# Title terms first, then text terms
feature_names = title_feature_names + text_feature_names

# Show only a short preview; printing every name floods the notebook output
print(feature_names[:20])

# NOTE(review): this list holds the TF-IDF terms only, while the combined
# feature matrix built in a later cell puts the numeric columns first.
# Confirm that consumers of this file account for that offset.
np.save('features_names_variable.npy', feature_names)
print(len(feature_names))



10000


In [14]:
from scipy.sparse import hstack, csr_matrix  # Import sparse utilities

# Target variable, kept as a dense array
labels = df['label_encoded'].to_numpy()

# Numeric columns as a dense array, then wrapped as a sparse matrix so they can
# be stacked next to the TF-IDF matrices
numeric_data = df[numeric_features].to_numpy()
numeric_data_sparse = csr_matrix(numeric_data)

# Column-wise concatenation: numeric features, title TF-IDF, text TF-IDF
feature_blocks = [numeric_data_sparse, title_vectors, text_vectors]
final_features = hstack(feature_blocks)

print("Final Feature Shape:", final_features.shape)


Final Feature Shape: (31794, 10004)


In [15]:
# df.info() prints its summary itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31794 entries, 0 to 31793
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          31794 non-null  object 
 1   text           31794 non-null  object 
 2   label          31794 non-null  object 
 3   title_length   31794 non-null  float64
 4   text_length    31794 non-null  float64
 5   sentiment      31794 non-null  float64
 6   readability    31794 non-null  float64
 7   label_encoded  31794 non-null  int64  
 8   title_tokens   31794 non-null  object 
 9   text_tokens    31794 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 2.4+ MB
None


In [16]:
# The encoded label is the prediction target, so it must never appear among
# the input features. Enforce that instead of relying on a visual check.
assert 'label_encoded' not in numeric_features, "label_encoded must not be used as a feature"
print(numeric_features)


['title_length', 'text_length', 'sentiment', 'readability']


In [17]:
import joblib

# Save the feature matrix and the labels
joblib.dump(final_features, 'final_features.pkl')
joblib.dump(labels, 'labels.pkl')

# Save every fitted transformer. The numeric columns were rescaled with
# `scaler`, so it is stored too; without it, new data cannot be scaled with the
# same fitted parameters.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(title_vectorizer, 'title_vectorizer.pkl')
joblib.dump(text_vectorizer, 'text_vectorizer.pkl')


['text_vectorizer.pkl']