<a href="https://colab.research.google.com/github/yellowflickerbeat/FakeNews_Detection/blob/main/textblob(no_lemma).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
#stopwords removes all the repetative words which don't substance to the text
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
#WordNetLemmatizer converts words into their roots considering its context
from sklearn.feature_extraction.text import TfidfVectorizer
#TfidfVectorizer measures importance of diff words
from sklearn.model_selection import train_test_split
#splits data into test and train
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
print(stopwords.words('english'))
#examples of all stopwords

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Pre-Processing

In [3]:
#mounting the mandatory datasets
news = pd.read_table('/content/train.tsv')

In [4]:
news.shape
#getting to know the contents' size

(24889, 6)

In [5]:
#printing the first 10 rows and columns
news = news.rename(columns={'Unnamed:0' : 'id'})
news.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1.0
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0.0
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1.0
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0.0
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0.0
5,18522,EU's Tusk appealed to Rajoy to avoid escalatio...,European Council President Donald Tusk appeale...,worldnews,"October 2, 2017",1.0
6,270,Country Guitarist Who Survived Vegas Shooting ...,"Caleb Keeter, a lifelong proponent of the Seco...",News,"October 2, 2017",0.0
7,7628,Clinton says 'there is no case here' in FBI em...,Democratic presidential candidate Hillary Clin...,politicsNews,"October 31, 2016",1.0
8,9599,ABC NEWS REPORTS: Las Vegas Massacre Suspect’s...,The investigation into the Las Vegas massacre ...,politics,"Oct 25, 2017",0.0
9,11234,BONKERS BERNIE SANDERS: Prioritizing Jobs Over...,https://www.youtube.com/watch?v=GPqQIlWksbgVer...,politics,"Apr 1, 2017",0.0


In [6]:
#find out missing values in dataset
news.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
subject,1
date,1
label,1


In [None]:
#renaming a column in dataset to uniquely identify eash item
news = news.rename(columns={'Unnamed: 0' : 'id'})

In [7]:
#removing the previous values in 'label' column
Y = news['label']
news = news.drop('label', axis=1)

**Lemmatization**

Lemmatization helps to reduce dimensionality esp as we are using logistic regression

Unlike stemming, it reduces the words too its base form only after considering the words' contexts

In [None]:
lemma = WordNetLemmatizer()
print(Y)

0       1.0
1       0.0
2       1.0
3       0.0
4       0.0
       ... 
4082    1.0
4083    1.0
4084    0.0
4085    0.0
4086    NaN
Name: label, Length: 4087, dtype: float64


In [None]:
def lemmatization(content):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-alphabetic characters & convert to lowercase
    lemmatized_content = re.sub('[^a-zA-Z]', ' ', content)
    lemmatized_content = lemmatized_content.lower()
    # Split the content into words
    lemmatized_content = lemmatized_content.split()
    # Remove stopwords and apply lemmatization
    lemmatized_content = [lemmatizer.lemmatize(word) for word in lemmatized_content if word not in stopwords.words('english')]
    # Join the words back into a single string
    lemmatized_content = ' '.join(lemmatized_content)
    return lemmatized_content


In [None]:
nltk.download('wordnet')
news['title'] = news['title'].apply(lemmatization)
news['text'] = news['text'].apply(lemmatization)

[nltk_data] Downloading package wordnet to /root/nltk_data...


Sentiment Analysis of News' Titles

In [8]:
from textblob import TextBlob
def get_sentiment_features(content):
    analysis = TextBlob(content)
    return analysis.sentiment.polarity, analysis.sentiment.subjectivity

# Apply the sentiment analysis function to the dataset
news[['polarity', 'subjectivity']] = news['title'].apply(get_sentiment_features).apply(pd.Series)

# Prepare the feature set and target variable
X = news[['polarity', 'subjectivity']]  # Use sentiment features

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [9]:
Y = Y.dropna()
Y = Y.reset_index(drop=True)
print(Y)

0        1.0
1        0.0
2        1.0
3        0.0
4        0.0
        ... 
24883    0.0
24884    0.0
24885    1.0
24886    1.0
24887    0.0
Name: label, Length: 24888, dtype: float64


In [13]:
Y_test = Y_test.dropna().reset_index(drop=True)
X_test, Y_test = X_test.align(Y_test, join='inner', axis=0)
print(Y_train.isnull().sum(), Y_test.isnull().sum())

1 0


In [20]:
y_train = Y_train.dropna()
X_train = X_train.loc[y_train.index]
X_train_cleaned = X_train.reset_index(drop=True)
y_train_cleaned = y_train.reset_index(drop=True)
print(Y_train.isnull().sum(), Y_test.isnull().sum())

1 0


In [21]:
y_train_cleaned = Y_train.dropna()

# Align X_train by keeping only the rows that correspond to the non-NaN values in y_train_cleaned
X_train_cleaned = X_train.loc[y_train_cleaned.index]

# Reset the indices of both X_train_cleaned and y_train_cleaned
X_train_cleaned = X_train_cleaned.reset_index(drop=True)
y_train_cleaned = y_train_cleaned.reset_index(drop=True)

# Now check for null values in the cleaned data
print("y_train_cleaned NaN values:", y_train_cleaned.isnull().sum())
print("X_train_cleaned NaN values:", X_train_cleaned.isnull().sum())
print("Y_test NaN values:", Y_test.isnull().sum())

y_train_cleaned NaN values: 0
X_train_cleaned NaN values: polarity        0
subjectivity    0
dtype: int64
Y_test NaN values: 0


In [23]:
model = LogisticRegression()
model.fit(X_train_cleaned, y_train_cleaned)

Y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model Accuracy: {accuracy:.3f}")

Model Accuracy: 0.500


	TextBlob or VADER??

B. Bhutani, N. Rastogi, P. Sehgal and A. Purwar, "Fake News Detection Using Sentiment Analysis," 2019 Twelfth International Conference on Contemporary Computing (IC3), Noida, India, 2019, pp. 1-5, doi: 10.1109/IC3.2019.8844880. keywords: {Social networking (online);Vocabulary;Sentiment analysis;Information technology;Media;Buildings;Logistics;Fake News;Naive Bayes;Random Forest;Cosine similarity tf-idf;sentiment};

