In [99]:
import pickle
import re

import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# Load the raw dataset (columns used below: title, text, source, label).
df = pd.read_csv('data/corona_fake.csv')

# Normalise inconsistent spellings of the same category.
df.loc[df['label'].isin(['Fake', 'fake']), 'label'] = 'FAKE'
df.loc[df['source'] == 'facebook', 'source'] = 'Facebook'

# Rows without body text fall back to their title. This runs BEFORE the title
# placeholder is filled in below, so the text never becomes the word 'missing'.
# Assigning the result back (instead of `inplace=True` on an attribute-accessed
# column) guarantees the frame itself is updated.
df['text'] = df['text'].fillna(df['title'])

# Manual label fixes for individual rows, keyed by the row label in the raw file.
# NOTE(review): presumably these rows had missing or wrong labels — confirm
# against the source CSV.
# A single `.loc[row, col]` assignment is required here: the chained form
# `df.loc[row]['label'] = ...` writes to a temporary row object and may never
# reach `df`.
label_corrections = {5: 'FAKE', 15: 'TRUE', 43: 'FAKE', 131: 'TRUE', 242: 'FAKE'}
for row_label, corrected_label in label_corrections.items():
    df.loc[row_label, 'label'] = corrected_label

# Shuffle the rows. The seed is fixed so that row positions (and therefore any
# later positional train/test split) are reproducible across kernel restarts.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

# Placeholder for rows that have no title / no source.
df['title'] = df['title'].fillna('missing')
df['source'] = df['source'].fillna('missing')

# Single text field used for feature extraction: "<title> <text>".
df['title_text'] = df['title'] + ' ' + df['text']

In [100]:
# Class balance check: number of rows per label value.
df['label'].value_counts()

TRUE    586
FAKE    578
Name: label, dtype: int64

In [101]:
# Preview the first rows of the cleaned frame, including the combined title_text column.
df.head()

Unnamed: 0,title,text,source,label,title_text
0,What precautions can I take when grocery shopp...,The coronavirus that causes COVID-19 is primar...,https://www.health.harvard.edu/,TRUE,What precautions can I take when grocery shopp...
1,BREAKING: New Evidence Based on Cell Phone Dat...,Bartiromo broke news this morning that cell ph...,https://www.thegatewaypundit.com/,FAKE,BREAKING: New Evidence Based on Cell Phone Dat...
2,COVID-19 and the CIA’s Biological Warfare on Cuba,"Maybe it was a plan that went horribly wrong, ...",https://www.globalresearch.ca,FAKE,COVID-19 and the CIA’s Biological Warfare on C...
3,missing,Donating blood requires that you be administer...,missing,FAKE,missing Donating blood requires that you be ad...
4,Is it safe to donate blood during the outbreak...,COVID-19 doesn’t pose any known risk to blood ...,https://www.globalhealthnow.org/,TRUE,Is it safe to donate blood during the outbreak...


In [152]:
# Inspect one combined title + text entry (index label 50).
df['title_text'][50]

'CORONAVIRUS: A WUHAN LABORATORY SPONSORED BY SOROS, VIRUS AFFECTS ONLY MONGOLOID RACE There is a biolaboratory in Wuhan – until recently, nothing was known about it. Its address is Gaoxin, three sixes – the number mentioned in the Bible, under which the name of the beast of the Apocalypse is hidden. But it’s even more symbolic that it exists thanks to the money of the famous banker George Soros, who shares the globalist ideas of Bill Gates. This could be part of a cunning plan.The coronavirus affects only the representatives of the Mongoloid race, which is very suspicious and raises questions.'

In [153]:
def preprocessor(text):
    """Return a cleaned, lower-cased copy of ``text``.

    Steps, in order:
      1. delete anything that looks like an HTML/XML tag (``<...>``);
      2. delete every character that is neither a word character nor
         whitespace (i.e. punctuation and symbols);
      3. lower-case the result.

    Removed characters are deleted, not replaced by a space, so words that
    were separated only by punctuation end up joined ("plan.The" -> "planthe")
    and existing whitespace is left untouched.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    without_tags = re.sub('<[^>]*>', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_tags)
    return without_punctuation.lower()

In [154]:
# Clean every combined title + text entry with preprocessor(); the column is overwritten with the cleaned text.
df['title_text'] = df['title_text'].apply(preprocessor)

In [156]:
# Re-inspect the entry at index label 50 to see the effect of preprocessing.
df['title_text'][50]

'coronavirus a wuhan laboratory sponsored by soros virus affects only mongoloid race there is a biolaboratory in wuhan  until recently nothing was known about it its address is gaoxin three sixes  the number mentioned in the bible under which the name of the beast of the apocalypse is hidden but its even more symbolic that it exists thanks to the money of the famous banker george soros who shares the globalist ideas of bill gates this could be part of a cunning planthe coronavirus affects only the representatives of the mongoloid race which is very suspicious and raises questions'

In [157]:
# Shared stemmer instance, reused by every tokenizer_porter() call.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token as a list."""
    return list(map(porter.stem, text.split()))

In [158]:
# TF-IDF settings. Lower-casing and the built-in preprocessor are switched off
# because the title_text column has already been cleaned and lower-cased by
# preprocessor(); tokenisation is delegated to the Porter-stemming tokenizer.
tfidf_options = dict(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    tokenizer=tokenizer_porter,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)
tfidf = TfidfVectorizer(**tfidf_options)

# Target vector: the label column as an array.
y = df['label'].values

# Feature matrix: learn the vocabulary / IDF weights from every row and
# transform the same rows in one step.
X = tfidf.fit_transform(df['title_text'])

In [159]:
# Positional 50/50 split: the first half of the rows is used for training and
# the second half for testing. With shuffle=False, random_state has no effect.
# NOTE(review): X comes from tfidf.fit_transform() on the whole corpus, so the
# vocabulary and IDF weights have already seen the test documents. Consider
# fitting the vectorizer on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5, shuffle=False)

# Logistic regression with the regularisation strength chosen by 5-fold
# cross-validation on the training half (scored by accuracy, all cores used).
clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, y_train)

# Persist the fitted classifier. The context manager closes (and flushes) the
# file even if pickling raises.
# NOTE(review): only `clf` is saved; the fitted `tfidf` vectorizer is not, so
# the pickle alone cannot score new raw text.
with open('fake_news_model.sav', 'wb') as fake_news_model:
    pickle.dump(clf, fake_news_model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.8s remaining:   22.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   18.1s finished


In [160]:
filename = 'fake_news_model.sav'

# Reload the classifier from disk to check that the saved artefact works.
# The context manager ensures the file handle is closed after loading.
# Security: pickle.load can execute arbitrary code — only load files that
# this notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

# Mean accuracy of the reloaded model on the held-out half.
saved_clf.score(X_test, y_test)

0.9347079037800687

In [161]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the in-memory classifier on the held-out rows.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

# Overall accuracy first, then per-class precision / recall / F1.
print("---Test Set Results---")
print("Accuracy with logreg: {}".format(test_accuracy))
print(per_class_report)

---Test Set Results---
Accuracy with logreg: 0.9347079037800687
              precision    recall  f1-score   support

        FAKE       0.93      0.94      0.93       281
        TRUE       0.94      0.93      0.94       301

    accuracy                           0.93       582
   macro avg       0.93      0.93      0.93       582
weighted avg       0.93      0.93      0.93       582

