# Import necessary libraries


In [4]:
import pandas as pd
import re
from google.colab import drive
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

!gdown https://drive.google.com/uc?id=1p3Y2ANrNqFktLfwdgfACvNmPL1xmvPgk -O ./fake_or_real_news.csv

Downloading...
From: https://drive.google.com/uc?id=1p3Y2ANrNqFktLfwdgfACvNmPL1xmvPgk
To: /content/fake_or_real_news.csv
100% 31.4M/31.4M [00:00<00:00, 110MB/s] 


# Download the necessary NLTK data and stopwords

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the dataset

In [8]:
print("Loading the dataset...")

df = pd.read_csv('./fake_or_real_news.csv', encoding='utf-8')

Loading the dataset...


# Data Cleaning

In [9]:
print("\nChecking for missing values in the dataset...")
print(df.isnull().sum())


Checking for missing values in the dataset...
Unnamed: 0    0
title         0
text          0
label         0
dtype: int64


# Drop rows with missing values

In [10]:
df = df.dropna()
print("\nDropped rows with missing values (if any).")


Dropped rows with missing values (if any).


# Text Preprocessing

In [11]:
print("\nPreprocessing the text data...")
stemmer = PorterStemmer()


Preprocessing the text data...


In [12]:
def preprocess(text):
    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Remove stopwords and stem the words
    text = [stemmer.stem(word) for word in text.split() if word not in set(stopwords.words('english'))]

    return ' '.join(text)

df['cleaned_text'] = df['text'].apply(preprocess)
print("Text data preprocessing complete.")

Text data preprocessing complete.


# Feature Extraction using TF-IDF

In [13]:
print("\nExtracting features using TF-IDF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, max_features=5000)
X = tfidf_vectorizer.fit_transform(df['cleaned_text'])
y = df['label']
print("Feature extraction complete.")


Extracting features using TF-IDF...
Feature extraction complete.


# Model Training

In [14]:
print("\nSplitting the data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("Training the Logistic Regression model...")
model = LogisticRegression()
model.fit(X_train, y_train)
print("Model training complete.")


Splitting the data into training and test sets...
Training the Logistic Regression model...
Model training complete.


# Model Evaluation

In [15]:
print("\nEvaluating the model...")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Evaluating the model...
Accuracy: 91.16%

Classification Report:
               precision    recall  f1-score   support

        FAKE       0.91      0.91      0.91       615
        REAL       0.92      0.91      0.91       652

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267

