- Database: Covid-fake
- Function: cleaning
- Desp: NA

# Necessary Imports...

In [28]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [29]:
# Start writing code here...
import pandas as pd
import xgboost
import re

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import f1_score


# Read test and training data

In [30]:
train = pd.read_csv('fake-covid-train.txt')
test = pd.read_csv('fake-covid-val.txt')

In [31]:
train.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real
5,6,"Covid Act Now found ""on average each person in...",real
6,7,If you tested positive for #COVID19 and have n...,real
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,fake
8,9,"???Clearly, the Obama administration did not l...",fake
9,10,Retraction—Hydroxychloroquine or chloroquine w...,fake


In [32]:
labels = ['fake','real']
def label_encode(val):
    return labels.index(val)

# Label Encoding

In [33]:
train.label = train.label.apply(label_encode)

In [34]:
train.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,1
1,2,States reported 1121 deaths a small rise from ...,1
2,3,Politically Correct Woman (Almost) Uses Pandem...,0
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,5,Populous states can generate large case counts...,1
5,6,"Covid Act Now found ""on average each person in...",1
6,7,If you tested positive for #COVID19 and have n...,1
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,0
8,9,"???Clearly, the Obama administration did not l...",0
9,10,Retraction—Hydroxychloroquine or chloroquine w...,0


# Cleaning training and test data

In [35]:
train = train.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = []

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
train.tweet = train.tweet.apply(clean_text)
train.tweet = train.tweet.str.replace('\d+', '')

  train.tweet = train.tweet.str.replace('\d+', '')


### Preparing Test Data

In [36]:
test.label = test.label.apply(label_encode)
test = test.reset_index(drop=True)
test.tweet = test.tweet.apply(clean_text)
test.tweet = test.tweet.str.replace('\d+', '')

  test.tweet = test.tweet.str.replace('\d+', '')


In [37]:
train.tweet.sample(10)

2939    #covid testing is done free of cost at govt la...
787     for example once a successful vaccine has been...
4184    new local restrictions have been announced for...
135     _photo of a vaccine developed by us scientists...
366     a banner with a swastika trump and pence is fr...
1177    #indiafightscorona india has substantively ram...
2665    the other  cases are in the community and they...
4201    as at  pm th april number of states with confi...
6297    daily new cases incidence has changed the risk...
558     our partner the cepivaccines is supporting  ca...
Name: tweet, dtype: object

# Base Line Model Used
## 1. Naive Bayes
## 2. Linear Classifier
## 3. Bagging
## 4. Boosting
## 5. SVM

### Building Model

In [38]:
def train_model(classifier, feature_vector_train, label,  feature_vector_valid, valid_y,test_data , test_label ,is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    #print("In Validation Data",metrics.accuracy_score(predictions, valid_y))
    #applying in test data
    predictions_test = classifier.predict(test_data)
    
    if is_neural_net:
        predictions_test = predictions_test.argmax(axis=-1)
    print("f1 score: ",f1_score(test_label,predictions_test))
    print("Accuracy: ",metrics.accuracy_score(test_label,predictions_test))
    

### 1.Splitting the Data into Train and validation

In [39]:
# split the dataset into training and validation datasets 
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(train['tweet'], train['label'], test_size=0.33)

### 2. Applying WordLevel tf-idf and bi-gram tf-idf

In [40]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['tweet'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
test_tfidf   =  tfidf_vect.transform(test['tweet'])

# ngram level tf-idf (bigram in this case)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train['tweet'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
test_tfidf_ngram   =  tfidf_vect.transform(test['tweet'])

#  Naive Bayes Model 

In [41]:
# Naive Bayes on Word Level TF IDF Vectors
print("Word level tf-idf")

accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y, test_tfidf, test['label'])
# print ("NB, WordLevel TF-IDF: ", accuracy)
print("Bigram tf-idf")

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y ,test_tfidf_ngram, test['label'])
# print ("NB, Bi-Gram Vectors: ", accuracy)

Word level tf-idf
f1 score:  0.9024390243902439
Accuracy:  0.8953271028037383
Bigram tf-idf
f1 score:  0.5833003561535417
Accuracy:  0.5079439252336448


# Linear Classifier

In [42]:
# Linear Classifier on Word Level TF IDF Vectors
print("Word level tf-idf")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y, test_tfidf, test['label'])
# print("LR, WordLevel TF-IDF: ", accuracy)
print("Bigram tf-idf")
# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(),  xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y ,test_tfidf_ngram, test['label'])
# print("LR, Bi-Gram Vectors: ", accuracy)

Word level tf-idf
f1 score:  0.9183222958057394
Accuracy:  0.9135514018691588
Bigram tf-idf
f1 score:  0.6333463490823897
Accuracy:  0.561214953271028


# Bagging Model

In [43]:
# RF on Word Level TF IDF Vectors
print("Word level tf-idf")

accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y, test_tfidf, test['label'])
# print ("RF, WordLevel TF-IDF: ", accuracy)
print("Bigram tf-idf")

# RF on ngram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y ,test_tfidf_ngram, test['label'])
# print ("RF, Bi-gram TF-IDF: ", accuracy)

Word level tf-idf
f1 score:  0.9105332745702954
Accuracy:  0.9051401869158878
Bigram tf-idf
f1 score:  0.23177842565597664
Accuracy:  0.5074766355140187


# Boosting Model

In [44]:

# Extereme Gradient Boosting on Word Level TF IDF Vectors
print("Word level tf-idf")
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc(), valid_y, test_tfidf.tocsc(), test['label'])
# print("Xgb, WordLevel TF-IDF: ", accuracy)

Word level tf-idf




f1 score:  0.9087719298245616
Accuracy:  0.902803738317757


# SVM Model

In [45]:


print("Word level tf-idf")
#SVM Model on Unigram TF-IDF
accuracy = train_model(svm.SVC(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc(), valid_y, test_tfidf.tocsc(), test['label'])
# print("SVM, WordLevel TF-IDF: ", accuracy)
print("Bigram tf-idf")

# SVM Model on Bigram TF-IDF
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc(), valid_y, test_tfidf_ngram.tocsc(), test['label'])
# print("SVM, Bi-gram TF-IDF: ", accuracy)

Word level tf-idf
f1 score:  0.9276520195295161
Accuracy:  0.9238317757009346
Bigram tf-idf
f1 score:  0.5413808870598995
Accuracy:  0.5313084112149533
