# Fake News Detection and Fact Verifier

**Problem statement:** Build a platform where users can type or paste a news article; the system analyzes the text and reports whether it is likely to be fake news.

In [17]:
import pandas as pd

In [19]:
# Read the two source CSV files: one with fake articles, one with real articles
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')


In [21]:
# Attach the target class to each source frame before they are merged:
# 0 marks fake articles, 1 marks real ones.
fake['label'] = 0
true['label'] = 1


In [23]:
# Stack the real and fake articles into a single frame (fresh 0..n-1 index),
# keep only the title, body and label columns, and write the result to disk.
df = pd.concat([true, fake], ignore_index=True)[['title', 'text', 'label']]
df.to_csv('news_dataset.csv', index=False)


In [24]:
# Build the reference corpus used for fact matching. It contains only real
# news articles. Note: this rebinds `true` to a fresh read of True.csv, so the
# `label` column added to the earlier in-memory frame is no longer present.
true = pd.read_csv('True.csv')

# One string per article: "<title> <text>". Missing titles/bodies are replaced
# with '' first; otherwise `str + NaN` propagates NaN into the corpus, and
# TfidfVectorizer refuses NaN documents when it is fitted later.
fact_base = true['title'].fillna('') + ' ' + true['text'].fillna('')


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer #It converts text data into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline #Pipeline is a utility class that lets you bundle multiple preprocessing steps and a model into one object.
import joblib


In [26]:
# Reload the merged dataset from disk and separate it into model input and target:
# the article body is the only feature, the 0/1 label is the target.
df = pd.read_csv('news_dataset.csv')
X, y = df['text'], df['label']


In [31]:
# Count missing values in each column of the merged dataset
df.isna().sum()

title    0
text     0
label    0
dtype: int64

In [33]:
# Print the index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   44898 non-null  object
 1   text    44898 non-null  object
 2   label   44898 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [41]:
# (rows, columns) of the merged dataset
df.shape

(44898, 3)

In [43]:
# Hold out 20% of the articles for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [45]:
# Text-classification pipeline: TF-IDF features (English stop words removed)
# feeding a logistic-regression classifier with default settings.
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression()),
    ]
)


In [47]:
# Fit the TF-IDF step and the classifier together on the training split
model.fit(X_train, y=y_train)


In [48]:
# Separate TF-IDF model for fact matching, fitted only on the real-article corpus.
# max_df=0.9 ignores terms that appear in more than 90% of those articles.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.9,
)
# Learn the vocabulary and vectorise every reference article in one pass
fact_vectors = vectorizer.fit_transform(fact_base)

In [49]:

# Persist the fitted pipeline (TF-IDF step + classifier) to disk
joblib.dump(model, filename='model.pkl')


['model.pkl']

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

def find_best_match(user_input, fact_vectors, vectorizer, fact_base):
    """Return the reference article most similar to ``user_input``.

    The input text is vectorised with the already-fitted ``vectorizer`` and
    compared against every row of ``fact_vectors`` using cosine similarity.

    Args:
        user_input: Text of the article entered by the user.
        fact_vectors: Vectorised reference articles, one row per article,
            produced by the same ``vectorizer``.
        vectorizer: Fitted vectoriser exposing ``transform``.
        fact_base: Reference articles, positionally aligned with the rows of
            ``fact_vectors`` and indexable through ``.iloc``.

    Returns:
        A ``(article, score)`` tuple: the entry of ``fact_base`` with the
        highest similarity and that similarity score.
    """
    # transform() expects an iterable of documents, hence the one-element list
    query_vec = vectorizer.transform([user_input])
    # Result has shape (1, n_articles); collapse it to a 1-D score array
    scores = cosine_similarity(query_vec, fact_vectors).ravel()
    top_idx = scores.argmax()
    return fact_base.iloc[top_idx], scores[top_idx]


# find_best_match compares the user's text with every verified real article using cosine similarity
# and returns the closest-matching real article together with its similarity score.