In [4]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import string

In [5]:
df = pd.read_csv("Constraint_Train.csv")

In [6]:
df.shape

(6424, 3)

In [7]:
df.loc[df['label'] == 'real','label'] = 1
df.loc[df['label'] == 'fake','label'] = 0
df.isnull().sum()

id       0
tweet    0
label    0
dtype: int64

In [9]:
def word_filter(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [10]:
df['tweet'] = df['tweet'].apply(word_filter)

In [11]:
x = df['tweet']
y = df['label']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 3)

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### 1. Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression()
LR_model.fit(xv_train, y_train)

LR_model.score(xv_test, y_test)

pred_LR_model = LR_model.predict(xv_test)

print(classification_report(y_test, pred_LR_model))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       753
           1       0.91      0.94      0.92       853

    accuracy                           0.92      1606
   macro avg       0.92      0.92      0.92      1606
weighted avg       0.92      0.92      0.92      1606



### 2. Decision Tree Classification

In [15]:
from sklearn.tree import DecisionTreeClassifier

DT_model = DecisionTreeClassifier()
DT_model.fit(xv_train, y_train)

DT_model.score(xv_test, y_test)

pred_DT_model = DT_model.predict(xv_test)

print(classification_report(y_test, pred_DT_model))

              precision    recall  f1-score   support

           0       0.87      0.84      0.86       753
           1       0.86      0.89      0.88       853

    accuracy                           0.87      1606
   macro avg       0.87      0.87      0.87      1606
weighted avg       0.87      0.87      0.87      1606



### 3. Gradient Boosting Classifier

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

GBC_model = GradientBoostingClassifier(random_state=0)
GBC_model.fit(xv_train, y_train)

GBC_model.score(xv_test, y_test)

pred_GBC_model = GBC_model.predict(xv_test)

print(classification_report(y_test, pred_GBC_model))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88       753
           1       0.88      0.91      0.89       853

    accuracy                           0.89      1606
   macro avg       0.89      0.88      0.88      1606
weighted avg       0.89      0.89      0.89      1606



### 4. Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

RFC_model = RandomForestClassifier(random_state=0)
RFC_model.fit(xv_train, y_train)

RFC_model.score(xv_test, y_test)

pred_RFC_model = RFC_model.predict(xv_test)

print(classification_report(y_test, pred_RFC_model))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       753
           1       0.91      0.93      0.92       853

    accuracy                           0.91      1606
   macro avg       0.91      0.91      0.91      1606
weighted avg       0.91      0.91      0.91      1606



In [18]:
def check_news(n):
    if n == 0:
        return "It's A Fake Covid News"
    elif n == 1:
        return "It's Not A Fake Covid News"
    
def predicting(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(word_filter) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR_model.predict(new_xv_test)
    pred_DT = DT_model.predict(new_xv_test)
    pred_GBC = GBC_model.predict(new_xv_test)
    pred_RFC = RFC_model.predict(new_xv_test)

    return print("Logistic Regression Prediction: {} \nDecision Tree Classification Prediction: {} \nGradient Boosting Classifier Prediction: {} \nRandom Forest Classifier Prediction: {}".format(
        check_news(pred_LR[0]), 
        check_news(pred_DT[0]), 
        check_news(pred_GBC[0]), 
        check_news(pred_RFC[0]))
    )

In [19]:
print("News №1")
news = "New variant classed of concern and named Omicron."
predicting(news)

print("==================================================")

print("News №2")
news2 = "The first volunteer to take the human trial vaccine for coronavirus in the UK has died."
predicting(news2)

News №1
Logistic Regression Prediction: It's Not A Fake Covid News 
Decision Tree Classification Prediction: It's Not A Fake Covid News 
Gradient Boosting Classifier Prediction: It's Not A Fake Covid News 
Random Forest Classifier Prediction: It's Not A Fake Covid News
News №2
Logistic Regression Prediction: It's A Fake Covid News 
Decision Tree Classification Prediction: It's A Fake Covid News 
Gradient Boosting Classifier Prediction: It's A Fake Covid News 
Random Forest Classifier Prediction: It's A Fake Covid News
