<a href="https://colab.research.google.com/github/zainulabedin2022skipq/Fake-News-Detection-using-ML-algorithms/blob/main/Fake_News_Detection_using_N_Gram_and_HPO_on_Logisitc_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/train.csv')


In [4]:
# Replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [5]:
# Merging the author name and news title
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']


In [6]:
# Separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']


In [7]:
port_stem = PorterStemmer()

In [8]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


In [9]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [10]:
# Separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [11]:
# Converting the textual data to numerical data using N-gram representation
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(X)

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


In [22]:
# Define the parameter grid for hyperparameter optimization
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

In [19]:
# Train a logistic regression model using maximum likelihood estimation
model = LogisticRegression()

In [23]:
# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', error_score='raise')
grid_search.fit(X_train, Y_train)

In [24]:
# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [25]:
# Fit the best model on the training data
best_model.fit(X_train, Y_train)


In [26]:
# Accuracy score on the training data
X_train_prediction = best_model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data:', training_data_accuracy)

Accuracy score of the training data: 0.9999399038461538


In [27]:
# Predict on the test data
Y_pred = best_model.predict(X_test)

In [28]:
# Calculate precision
precision = precision_score(Y_test, Y_pred)
print('Precision:', precision)

Precision: 0.9961649089165868


In [29]:
# Calculate recall
recall = recall_score(Y_test, Y_pred)
print('Recall:', recall)

Recall: 0.9975996159385502


In [30]:
# Calculate F1-measure
f1 = f1_score(Y_test, Y_pred)
print('F1-measure:', f1)


F1-measure: 0.9968817462221157


In [31]:
# Calculate AUC
auc = roc_auc_score(Y_test, Y_pred)
print('AUC:', auc)

AUC: 0.9968739533712973
