In [215]:
import pandas as pd
import spacy

import re

from collections import defaultdict

In [216]:
df = pd.read_csv('corona_fake.csv')

In [217]:
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [218]:
df = df[['text']]

In [219]:
df

Unnamed: 0,text
0,"You just need to add water, and the drugs and ..."
1,Hydroxychloroquine has been shown to have a 10...
2,Fact: Hydroxychloroquine has been shown to hav...
3,The Corona virus is a man made virus created i...
4,Doesn’t @BillGates finance research at the Wuh...
...,...
1159,A study suggests that ultraviolet rays could s...
1160,"Last week, a medical journal reported that a b..."
1161,"A new report, sent to the White House science ..."
1162,A vaccine would be the ultimate weapon against...


In [220]:
df.dropna(inplace=True)

In [221]:
df

Unnamed: 0,text
0,"You just need to add water, and the drugs and ..."
1,Hydroxychloroquine has been shown to have a 10...
2,Fact: Hydroxychloroquine has been shown to hav...
3,The Corona virus is a man made virus created i...
4,Doesn’t @BillGates finance research at the Wuh...
...,...
1159,A study suggests that ultraviolet rays could s...
1160,"Last week, a medical journal reported that a b..."
1161,"A new report, sent to the White House science ..."
1162,A vaccine would be the ultimate weapon against...


In [222]:
nlp = spacy.load('en_core_web_sm' , disable=['ner' , 'parser'])

# cleaning

In [223]:
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    doc = re.sub("[^A-Za-z']+" , ' ' , doc).lower()
    doc = nlp(doc)
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)
    

In [224]:
# brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['text'])

print(cleaning('asa asas ass'))

asa asas ass


In [225]:
#using spacy pipe (spacy.pipe())

In [226]:
# t = time.time()

# txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning , batch_size=5000 , n_threads=-1)]

# print(f'Time to clean up everything :{round(time.time() - t)/60,2}mins')

In [227]:
# df_clean = pd.DataFrame({'clean': txt})
# df_clean = df_clean.dropna().drop_duplicates()
# df_clean.shape

In [231]:
cleaned_text = df['text'].apply(cleaning)

In [240]:
df_clean = pd.DataFrame({'clean':cleaned_text})
df_clean.shape

(1154, 1)

In [244]:
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(1079, 1)

In [246]:
df_clean.head()

Unnamed: 0,clean
0,need add water drug vaccine ready administer p...
1,hydroxychloroquine show effective rate treatin...
2,fact hydroxychloroquine show effective rate tr...
3,corona virus man virus create wuhan laboratory...
4,doesn t billgates finance research wuhan lab c...


# bigrams

In [247]:
from gensim.models.phrases import Phrases , Phraser

In [249]:
sent = [row.split() for row in df_clean['clean']]

In [250]:
sent[:5]

[['need',
  'add',
  'water',
  'drug',
  'vaccine',
  'ready',
  'administer',
  'part',
  'kit',
  'hold',
  'pellet',
  'contain',
  'chemical',
  'machinery',
  'synthesise',
  'end',
  'product',
  'hold',
  'pellet',
  'contain',
  'instruction',
  'telll',
  'drug',
  'compound',
  'create',
  'mix',
  'part',
  'choose',
  'combination',
  'add',
  'water',
  'treatment',
  'ready'],
 ['hydroxychloroquine',
  'show',
  'effective',
  'rate',
  'treating',
  'covid',
  'democrat',
  'gretchen',
  'whitmer',
  'threaten',
  'doctor',
  'prescribe',
  'trump',
  'democrat',
  'okay',
  'people',
  'die',
  'mean',
  'oppose',
  'trump'],
 ['fact',
  'hydroxychloroquine',
  'show',
  'effective',
  'rate',
  'treating',
  'covid',
  'democrat',
  'gretchen',
  'whitmer',
  'threaten',
  'doctor',
  'prescribe',
  'trump',
  'democrat',
  'okay',
  'people',
  'die',
  'mean',
  'oppose',
  'trump',
  'sick'],
 ['corona',
  'virus',
  'man',
  'virus',
  'create',
  'wuhan',
  'labo

In [252]:
#now create relevant phrases from the list of sentences

phrases = Phrases(sent , min_count=30 , progress_per=10000)

In [279]:
phrases

<gensim.models.phrases.Phrases at 0x1d5064479d0>

# Most frequent words

In [263]:
biagram_phraser = Phraser(phrases)

In [267]:
biagram_phraser

<gensim.models.phrases.Phraser at 0x1d50615eb80>

In [269]:
sentences = biagram_phraser[sent]

In [273]:
sentences.corpus[:10]

[['need',
  'add',
  'water',
  'drug',
  'vaccine',
  'ready',
  'administer',
  'part',
  'kit',
  'hold',
  'pellet',
  'contain',
  'chemical',
  'machinery',
  'synthesise',
  'end',
  'product',
  'hold',
  'pellet',
  'contain',
  'instruction',
  'telll',
  'drug',
  'compound',
  'create',
  'mix',
  'part',
  'choose',
  'combination',
  'add',
  'water',
  'treatment',
  'ready'],
 ['hydroxychloroquine',
  'show',
  'effective',
  'rate',
  'treating',
  'covid',
  'democrat',
  'gretchen',
  'whitmer',
  'threaten',
  'doctor',
  'prescribe',
  'trump',
  'democrat',
  'okay',
  'people',
  'die',
  'mean',
  'oppose',
  'trump'],
 ['fact',
  'hydroxychloroquine',
  'show',
  'effective',
  'rate',
  'treating',
  'covid',
  'democrat',
  'gretchen',
  'whitmer',
  'threaten',
  'doctor',
  'prescribe',
  'trump',
  'democrat',
  'okay',
  'people',
  'die',
  'mean',
  'oppose',
  'trump',
  'sick'],
 ['corona',
  'virus',
  'man',
  'virus',
  'create',
  'wuhan',
  'labo

In [274]:
word_freq = defaultdict(int)

for sent in sentences:
    for i in sent:
        word_freq[i] +=1
len(word_freq)

16509

In [278]:
word_freq

defaultdict(int,
            {'need': 891,
             'add': 271,
             'water': 195,
             'drug': 592,
             'vaccine': 2625,
             'ready': 95,
             'administer': 65,
             'part': 80,
             'kit': 33,
             'hold': 142,
             'pellet': 2,
             'contain': 267,
             'chemical': 92,
             'machinery': 11,
             'synthesise': 5,
             'end': 362,
             'product': 257,
             'instruction': 59,
             'telll': 1,
             'compound': 24,
             'create': 440,
             'mix': 33,
             'choose': 61,
             'combination': 65,
             'treatment': 660,
             'hydroxychloroquine': 144,
             'show': 462,
             'effective': 367,
             'rate': 214,
             'treating': 5,
             'covid': 2748,
             'democrat': 14,
             'gretchen': 3,
             'whitmer': 3,
             'threaten': 71,

In [147]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['virus',
 's',
 'coronavirus',
 'people',
 'covid',
 'vaccine',
 'say',
 'china',
 'new',
 'disease']

# Train the model

In [148]:
import multiprocessing

from gensim.models import Word2Vec

In [149]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores

4

In [150]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [151]:
t = time.time()

w2v_model.build_vocab(sentences , progress_per=5000)

print(f'Time to build the vocab:  {round(time.time()-t)/60,2}')

Time to build the vocab:  (0.03333333333333333, 2)


In [152]:
t = time.time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))

Time to train the model: 0.71 mins


In [153]:


w2v_model.init_sims(replace=True)



# exploring the model

In [154]:
w2v_model.wv.most_similar(positive=['virus'])

[('virulent', 0.8317253589630127),
 ('ferret', 0.8082175850868225),
 ('sar_mer', 0.7855465412139893),
 ('infect_human', 0.7835135459899902),
 ('evolve', 0.7768890857696533),
 ('novel', 0.768994927406311),
 ('circulate', 0.7687722444534302),
 ('infectious', 0.7656295299530029),
 ('meaning', 0.7654054164886475),
 ('easily', 0.761091947555542)]

In [157]:
#biagram

w2v_model.wv.most_similar(positive=['corona_virus'])

[('attribute', 0.8920176029205322),
 ('sic', 0.7593475580215454),
 ('see', 0.7547153234481812),
 ('ordinary', 0.7467460036277771),
 ('falsely', 0.7463915348052979),
 ('actual', 0.7400267124176025),
 ('acute', 0.7251443266868591),
 ('interesting', 0.7175319790840149),
 ('logical', 0.7137560248374939),
 ('bird_flu', 0.7050538063049316)]