<a href="https://colab.research.google.com/github/yellowflickerbeat/FakeNews_Detection/blob/main/vader_kword_flesch_100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
#stopwords removes all the repetative words which don't substance to the text
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
#WordNetLemmatizer converts words into their roots considering its context
from sklearn.feature_extraction.text import TfidfVectorizer
#TfidfVectorizer measures importance of diff words
from sklearn.model_selection import train_test_split
#splits data into test and train
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
#mounting the mandatory datasets
news = pd.read_table('/content/train.tsv')
news.shape

(26471, 6)

In [6]:
news = news.rename(columns={'Unnamed: 0' : 'id'})
#find out missing values in dataset
news.isnull().sum()

Unnamed: 0,0
id,0
title,0
text,0
subject,0
date,0
label,1


In [7]:
Y = news['label']
#news = news.drop('label', axis=1)
print(Y)

0        1.0
1        0.0
2        1.0
3        0.0
4        0.0
        ... 
26466    1.0
26467    1.0
26468    1.0
26469    0.0
26470    NaN
Name: label, Length: 26471, dtype: float64


In [8]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m112.6/126.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
def get_sentiment_features(text):
    scores = analyzer.polarity_scores(text)
    return scores['neg'], scores['neu'], scores['pos'], scores['compound']

# Apply the sentiment analysis function to the dataset
news[['neg', 'neu', 'pos', 'compound']] = news['title'].apply(get_sentiment_features).apply(pd.Series)

# Prepare the feature set and target variable
X = news[['neg', 'neu', 'pos', 'compound']]  # Use VADER sentiment features
#y = news['label']  # 1 for real, 0 for fake

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [10]:
Y_test = Y_test.dropna().reset_index(drop=True)
X_test, Y_test = X_test.align(Y_test, join='inner', axis=0)
print(Y_train.isnull().sum(), Y_test.isnull().sum())

1 0


In [11]:
y_train_cleaned = Y_train.dropna()

# Align X_train by keeping only the rows that correspond to the non-NaN values in y_train_cleaned
X_train_cleaned = X_train.loc[y_train_cleaned.index]

# Reset the indices of both X_train_cleaned and y_train_cleaned
X_train_cleaned = X_train_cleaned.reset_index(drop=True)
y_train_cleaned = y_train_cleaned.reset_index(drop=True)

# Now check for null values in the cleaned data
print("y_train_cleaned NaN values:", y_train_cleaned.isnull().sum())
print("X_train_cleaned NaN values:", X_train_cleaned.isnull().sum())
print("Y_test NaN values:", Y_test.isnull().sum())

y_train_cleaned NaN values: 0
X_train_cleaned NaN values: neg         0
neu         0
pos         0
compound    0
dtype: int64
Y_test NaN values: 0


In [12]:
model = LogisticRegression()
model.fit(X_train_cleaned, y_train_cleaned)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Model Accuracy: {accuracy:.3f}")

Model Accuracy: 0.520


In [17]:
from sklearn.ensemble import RandomForestClassifier
import textstat

fake_news_words = [
    "Exclusive", "Shocking", "Revealed", "Exposed", "Unbelievable",
    "Scandal", "Unprecedented", "Breaking", "Conspiracy", "Hoax",
    "Bombshell", "Alert", "Hacked", "Disturbing", "Cover-up",
    "Unfounded", "Widespread", "Propaganda", "Debunked", "Controversial"
]

def contains_fake_news_keywords(text, keywords):
    tokens = text.split()  # Assuming text is already cleaned and tokenized
    return any(keyword in tokens for keyword in keywords)

# Function to calculate average sentence length
def average_sentence_length(text):
    sentences = text.split('.')  # Split text into sentences (basic splitting)
    num_sentences = len([s for s in sentences if s.strip()])
    num_words = len(text.split())
    return num_words / num_sentences if num_sentences > 0 else 0

def flesch_kincaid_score(text):
    return textstat.flesch_kincaid_grade(text)

# Apply feature engineering before splitting the data
news['keyword_feature'] = news['title'].apply(lambda x: contains_fake_news_keywords(x, fake_news_words)) | \
                          news['text'].apply(lambda x: contains_fake_news_keywords(x, fake_news_words))

news['avg_sentence_length'] = news['text'].apply(average_sentence_length)
news['flesch_kincaid'] = news['text'].apply(flesch_kincaid_score)

# Combine VADER sentiment features and keyword feature
X = news[['neg', 'neu', 'pos', 'compound', 'keyword_feature', 'avg_sentence_length', 'flesch_kincaid']]

# Now split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Handle NaN values in y_train
y_train_cleaned = Y_train.dropna()
X_train_cleaned = X_train.loc[y_train_cleaned.index].reset_index(drop=True)
y_train_cleaned = y_train_cleaned.reset_index(drop=True)

# Train a simple model (Random Forest Classifier)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_cleaned, y_train_cleaned)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8138


In [16]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.0 textstat-0.7.4


In [19]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the subject names
encoder = OneHotEncoder()
subject_encoded = encoder.fit_transform(news[['subject']]).toarray()

# Add subject encoding as new features
subject_columns = [f"subject_{cat}" for cat in encoder.categories_[0]]
subject_df = pd.DataFrame(subject_encoded, columns=subject_columns)

# Combine with existing features
X = pd.concat([news[['neg', 'neu', 'pos', 'compound', 'keyword_feature', 'avg_sentence_length', 'flesch_kincaid']],
               subject_df], axis=1)
Y = news['label']  # Target variable

# Handle NaN values
X.fillna(0, inplace=True)
Y.fillna(0, inplace=True)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, Y_train)

# Make predictions
y_pred = clf.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)

print(f"Model Accuracy (With Subject Features): {accuracy:.4f}")


Model Accuracy (With Subject Features): 1.0000
