<a href="https://colab.research.google.com/github/zainulabedin2022skipq/Fake-News-Detection-using-ML-algorithms/blob/main/Fake_News_Detection_using_HyperParameter_Optimization_technique_on_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [43]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
# Loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/train.csv')

In [46]:
# Replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [47]:
# Merging the author name and news title
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']

In [53]:

# Separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [49]:
port_stem = PorterStemmer()

In [81]:
def stemming(content):
    content = re.sub('[^a-zA-Z]', ' ', content)  # Remove non-alphabetic characters
    content = content.lower()
    content = content.split()
    content = [port_stem.stem(word) for word in content if not word in stopwords.words('english')]
    content = ' '.join(content)
    return content

In [82]:
news_dataset['content'] = news_dataset['content'].apply(stemming)


In [83]:
# Separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [85]:
# Converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

In [86]:
X = vectorizer.transform(X)


In [87]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [92]:
# Reduce the size of the parameter grid
param_grid = {
    'C': [1],
    'kernel': ['linear'],
    'gamma': ['scale']
}

In [93]:
# Create the SVM model
model = SVC()

In [94]:
# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', error_score='raise')
grid_search.fit(X_train, Y_train)

In [95]:
# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [96]:
# Fit the best model on the training data
best_model.fit(X_train, Y_train)

In [97]:
# Accuracy score on the training data
X_train_prediction = best_model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data:', training_data_accuracy)

Accuracy score of the training data: 0.9974759615384615


In [100]:
# Calculate precision
precision = precision_score(Y_test, y_pred)
print('Precision:', precision)

Precision: 0.9604105571847508


In [101]:
# Calculate recall
recall = recall_score(Y_test, y_pred)
print('Recall:', recall)

Recall: 0.9433509361497839


In [103]:
# Calculate F1-measure
f1 = f1_score(Y_test, y_pred)
print('F1-measure:', f1)

F1-measure: 0.9518043109711793


In [104]:
# Calculate AUC
auc = roc_auc_score(Y_test, y_pred)
print('AUC:', auc)

AUC: 0.9521761902703663
