<a href="https://colab.research.google.com/github/zainulabedin2022skipq/Fake-News-Detection-using-ML-algorithms/blob/main/Fake_News_Detection_using_HyperParameter_Optimization_technique_on_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/train.csv')

In [5]:
# Replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [6]:
# Merging the author name and news title
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']

In [7]:

# Separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [8]:
port_stem = PorterStemmer()

In [9]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [10]:
news_dataset['content'] = news_dataset['content'].apply(stemming)


In [11]:
# Separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [12]:
# Converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

In [13]:
X = vectorizer.transform(X)


In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [15]:
# Define the parameter grid for hyperparameter optimization
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 200, 500]
}

In [16]:
# Create the logistic regression model
model = LogisticRegression()

In [17]:
# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)


In [18]:
# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [19]:
# Fit the best model on the training data
best_model.fit(X_train, Y_train)

In [20]:
# Accuracy score on the training data
X_train_prediction = best_model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data:', training_data_accuracy)

Accuracy score of the training data: 0.9990384615384615


In [22]:
# Make predictions on the test data
Y_test_prediction = best_model.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [25]:
# Calculate precision
precision = precision_score(Y_test, Y_test_prediction)
print('Precision:', precision)


Precision: 0.9871121718377088


In [26]:
# Calculate recall
recall = recall_score(Y_test, Y_test_prediction)
print('Recall:', recall)


Recall: 0.9927988478156505


In [27]:
# Calculate F1-measure
f1 = f1_score(Y_test, Y_test_prediction)
print('F1-measure:', f1)


F1-measure: 0.9899473432264241


In [28]:
# Calculate AUC
auc = roc_auc_score(Y_test, Y_test_prediction)
print('AUC:', auc)

AUC: 0.9898996646396501
