<a href="https://colab.research.google.com/github/zeeshanahmad10809/covid_fake_news_classification/blob/main/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sentiment Analysis (Machine Learning Techniques)

### Install & Import Dependencies

In [1]:
!pip install tweet-preprocessor

You should consider upgrading via the '/home/malang/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import os
import re
import string
import random
import preprocessor as p
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/malang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Set random seed values to attain deterministic behaviours
SEED_VALUE = 19
os.environ["PYTHONHASHSEED"] = str(SEED_VALUE)
random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)

### Text Preprocessing

In [4]:
def remove_url(tweet):
    return " ".join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", tweet).split()
    )


def remove_punctuation(tweet):
    for ch in string.punctuation:
        if ch in tweet:
            tweet = tweet.replace(ch, "")
    return tweet


def lower_case(tweet):
    return tweet.lower().strip()


def lemmatize(tweet):
    lemmatizer = WordNetLemmatizer()
    tweet = " ".join(lemmatizer.lemmatize(token) for token in tweet.split(" "))
    return tweet


def preprocess_tweet(tweet):
    # tweet = p.clean(tweet)
    tweet = remove_url(tweet)
    tweet = remove_punctuation(tweet)
    tweet = lower_case(tweet)
    tweet = lemmatize(tweet)
    return tweet

### Dataset

In [5]:
DATASET1 = "COVID FakeNews Data.csv"
DATASET2 = "dataset-Non-extremist-Extremist.csv"

In [6]:
class Dataset:
    def __init__(self, dataset_name):
        self.dataset_name = dataset_name
        data = None
        try:
            data = pd.read_csv(self.dataset_name)
        except FileNotFoundError:
            logger.warning("Dataset File is missing!")
            os._exit(0)
        if self.dataset_name == "dataset-Non-extremist-Extremist.csv":
            data["Tweet label"] = data["Tweet label"].replace("Non-extremist", 0)
            data["Tweet label"] = data["Tweet label"].replace("Extremist", 1)
            col_list = data.columns.to_list()
            col_list = [col_list[-1], col_list[0]]
            data = data[col_list]

        data.iloc[:, 0] = data.iloc[:, 0].apply(preprocess_tweet)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            data.iloc[:, 0], data.iloc[:, 1], stratify=data.iloc[:, 1], test_size=0.2
        )
        self.encoder = TfidfVectorizer()

    def fit(self):
        self.encoder.fit(self.X_train)

    def load(self):
        self.fit()
        return (
            self.encoder.transform(self.X_train),
            self.encoder.transform(self.X_test),
            self.y_train,
            self.y_test,
        )

### Model Creation & Training

In [7]:
class MLModel:
    def __init__(self, name):
        if name == "logistic_regression":
            self.model = LogisticRegression()
        elif name == "random_forest":
            self.model = RandomForestClassifier()
        elif name == "decision_tree":
            self.model = DecisionTreeClassifier()
        elif name == "svm":
            self.model = SVC()
        elif name == "knn":
            self.model = KNeighborsClassifier(n_neighbors=5)
        elif name == "adaboost":
            self.model = AdaBoostClassifier()
        elif name == "mlp":
            self.model = MLPClassifier()
        elif name == "naive_bayes":
            self.model = MultinomialNB()
        else:
            raise ValueError

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        return self.model.predict(X_test)

In [8]:
# Select either DATASET1 or DATASET2
dataset = Dataset(DATASET1)
X_train, X_test, y_train, y_test = dataset.load()

In [9]:
# print(np.count_nonzero(y_test == 0)/len(y_test))
print(len(y_train))
print(len(y_test))

8160
2041


In [10]:
# Fake News Label = 0
# Not-Fake News Label = 1
print(f"Train fake news:{len(y_train == 0)},\Test fake news: {len(y_test == 0)}")
print(f"Train non-fake news: {len(y_train == 1)}, Test non-fake news: {len(y_test == 1)}")

Train fake news:8160,\Test fake news: 2041
Train non-fake news: 8160, Test non-fake news: 2041


In [11]:
model = MLModel(name="knn")
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

### Evaluation

In [12]:
def show_performance(y_test, y_test_pred):
    pprint(confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred, digits=4))

In [13]:
# USE 'macro avg'
show_performance(y_test, y_test_pred)

array([[1946,    0],
       [  59,   36]])
              precision    recall  f1-score   support

           0     0.9706    1.0000    0.9851      1946
           1     1.0000    0.3789    0.5496        95

    accuracy                         0.9711      2041
   macro avg     0.9853    0.6895    0.7673      2041
weighted avg     0.9719    0.9711    0.9648      2041

