In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
df = pd.read_csv(r'D:\Word Embeddings\fake.csv')
df.shape

(12999, 20)

In [4]:
df.head()
df.isnull().sum()
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

uuid                  0
ord_in_thread         0
author                0
published             0
title                 0
text                  0
language              0
crawled               0
site_url              0
country               0
domain_rank           0
thread_title          0
spam_score            0
main_img_url          0
replies_count         0
participants_count    0
likes                 0
comments              0
shares                0
type                  0
dtype: int64

In [5]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [7]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['text'])

In [8]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 22.2 mins


In [9]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(4597, 1)

In [10]:
from gensim.models.phrases import Phrases, Phraser

INFO - 16:27:54: 'pattern' package not found; tag filters are not available for English


In [11]:
sent = [row.split() for row in df_clean['clean']]
sent

l',
  'pillar',
  'long',
  'political',
  'career',
  'fact',
  'don',
  't',
  'know',
  'politician',
  'america',
  'associated',
  'abortion',
  'hillary',
  'clinton',
  'roe',
  'v',
  'wade',
  'decide',
  'million',
  'baby',
  'murder',
  'united',
  'state',
  'hillary',
  'clinton',
  's',
  'hand',
  'drench',
  'blood',
  'vote',
  'hillary',
  'clinton',
  'hand',
  'drench',
  'blood',
  'needless',
  'absolutely',
  'horrified',
  'prominent',
  'evangelical',
  'leader',
  'come',
  'support',
  'hillary',
  'clinton',
  'election',
  'season',
  'example',
  'group',
  'represent',
  'latino',
  'evangelical',
  'church',
  'announce',
  'endorse',
  'hillary',
  'clinton',
  'organization',
  'represent',
  'latino',
  'evangelical',
  'church',
  'u',
  's',
  'endorse',
  'hillary',
  'clinton',
  'donald',
  'trump',
  'statement',
  'thursday',
  'group',
  'open',
  'usa',
  'say',
  'clinton',
  'prove',
  'willingness',
  'engage',
  'difficult',
  'conversat

In [12]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 16:27:58: collecting all words and their counts
INFO - 16:27:58: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 16:28:02: collected 1040505 word types from a corpus of 1659052 words (unigram + bigrams) and 4597 sentences
INFO - 16:28:02: using 1040505 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [13]:
from nltk import bigrams

In [14]:
sentences = phrases[sent]

In [15]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

69694

In [16]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['s',
 'say',
 'people',
 'trump',
 'time',
 'know',
 'state',
 'like',
 'clinton',
 'go']

In [17]:
import multiprocessing

from gensim.models import Word2Vec

In [18]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [19]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [20]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:28:13: collecting all words and their counts
INFO - 16:28:13: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:28:21: collected 69694 word types from a corpus of 1570970 raw words and 4597 sentences
INFO - 16:28:21: Loading a fresh vocabulary
INFO - 16:28:21: effective_min_count=20 retains 8916 unique words (12% of original 69694, drops 60778)
INFO - 16:28:21: effective_min_count=20 leaves 1382608 word corpus (88% of original 1570970, drops 188362)
INFO - 16:28:21: deleting the raw counts dictionary of 69694 items
INFO - 16:28:21: sample=6e-05 downsamples 1369 most-common words
INFO - 16:28:21: downsampling leaves estimated 875651 word corpus (63.3% of prior 1382608)
INFO - 16:28:21: estimated required memory for 8916 words and 300 dimensions: 25856400 bytes
INFO - 16:28:21: resetting layer weights
Time to build vocab: 0.2 mins


In [21]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=15, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

hreads
INFO - 16:28:59: EPOCH - 2 : training on 1570970 raw words (875754 effective words) took 17.4s, 50345 effective words/s
INFO - 16:29:00: EPOCH 3 - PROGRESS: at 6.03% examples, 36822 words/s, in_qsize 0, out_qsize 0
INFO - 16:29:01: EPOCH 3 - PROGRESS: at 11.94% examples, 39951 words/s, in_qsize 0, out_qsize 0
INFO - 16:29:02: EPOCH 3 - PROGRESS: at 16.25% examples, 38844 words/s, in_qsize 0, out_qsize 0
INFO - 16:29:03: EPOCH 3 - PROGRESS: at 23.02% examples, 37180 words/s, in_qsize 0, out_qsize 0
INFO - 16:29:04: EPOCH 3 - PROGRESS: at 28.80% examples, 37056 words/s, in_qsize 1, out_qsize 0
INFO - 16:29:05: EPOCH 3 - PROGRESS: at 33.00% examples, 37488 words/s, in_qsize 0, out_qsize 0
INFO - 16:29:06: EPOCH 3 - PROGRESS: at 36.18% examples, 38081 words/s, in_qsize 0, out_qsize 0
INFO - 16:29:07: EPOCH 3 - PROGRESS: at 44.70% examples, 38524 words/s, in_qsize 0, out_qsize 0
INFO - 16:29:08: EPOCH 3 - PROGRESS: at 48.29% examples, 38089 words/s, in_qsize 0, out_qsize 0
INFO - 16:

In [23]:
w2v_model.wv.most_similar(positive=["fake"])

[('hoax', 0.6655097603797913),
 ('fake_news', 0.6599137783050537),
 ('circulate', 0.6320031881332397),
 ('biased', 0.6317537426948547),
 ('sicken', 0.6033468246459961),
 ('scam', 0.6008330583572388),
 ('misinformation', 0.6002168655395508),
 ('proof', 0.6000297665596008),
 ('false', 0.5967258214950562),
 ('troll', 0.5930442810058594)]

In [24]:
w2v_model.wv.most_similar(positive=["freedom"])

[('liberty', 0.7934807538986206),
 ('tyranny', 0.7433617115020752),
 ('deprive', 0.7103211879730225),
 ('bear_arm', 0.6700611710548401),
 ('freedom_speech', 0.6616452932357788),
 ('dignity', 0.6511629819869995),
 ('equality', 0.6453904509544373),
 ('preserve', 0.6435351371765137),
 ('uphold', 0.639883816242218),
 ('proclaim', 0.6337151527404785)]

In [25]:
w2v_model.wv.most_similar(positive=["attack"],topn=20)

[('false_flag', 0.7994900345802307),
 ('bloody', 0.7829836010932922),
 ('terrorist_attack', 0.7823898792266846),
 ('isis_fighter', 0.7490600943565369),
 ('warplane', 0.7447312474250793),
 ('relentless', 0.7430627346038818),
 ('taliban', 0.7392674684524536),
 ('houthi', 0.7388142943382263),
 ('barrage', 0.7347888946533203),
 ('behead', 0.7337174415588379),
 ('aftermath', 0.7299622893333435),
 ('helmet', 0.7236400842666626),
 ('violently', 0.7234197854995728),
 ('airstrike', 0.7209673523902893),
 ('meantime', 0.7197535037994385),
 ('lead_coalition', 0.7163382768630981),
 ('houthis', 0.7148869037628174),
 ('ypg', 0.7146459817886353),
 ('war_crime', 0.7138177752494812),
 ('bombing', 0.7130325436592102)]

In [26]:
w2v_model.wv.most_similar(positive=["bomb"],topn=20)

[('atomic', 0.7754235863685608),
 ('warplane', 0.7604361772537231),
 ('yeman', 0.7586443424224854),
 ('strike', 0.7105411887168884),
 ('shelter', 0.6895444393157959),
 ('air_strike', 0.681679904460907),
 ('sniper', 0.6816248893737793),
 ('isis_fighter', 0.6776495575904846),
 ('airstrike', 0.66100013256073),
 ('artillery', 0.6574423909187317),
 ('houthis', 0.6569006443023682),
 ('yemeni', 0.6505181789398193),
 ('drop', 0.6471607685089111),
 ('bullet', 0.6449828147888184),
 ('cluster', 0.6388813853263855),
 ('bomber', 0.6362646818161011),
 ('taliban', 0.6357870101928711),
 ('syria', 0.6310577988624573),
 ('rocket', 0.6288050413131714),
 ('north_korea', 0.6275111436843872)]

In [27]:
w2v_model.wv.most_similar(positive=["president"])

[('ronald_reagan', 0.797315239906311),
 ('predecessor', 0.7641506195068359),
 ('presidency', 0.7636823654174805),
 ('joe_biden', 0.758105993270874),
 ('president_elect', 0.7563950419425964),
 ('white_house', 0.7563837766647339),
 ('impeach', 0.7525460720062256),
 ('commander_chief', 0.7493503093719482),
 ('elect_president', 0.7470866441726685),
 ('transition_team', 0.7453500032424927)]

In [28]:
w2v_model.wv.most_similar(positive=["president"])

[('ronald_reagan', 0.797315239906311),
 ('predecessor', 0.7641506195068359),
 ('presidency', 0.7636823654174805),
 ('joe_biden', 0.758105993270874),
 ('president_elect', 0.7563950419425964),
 ('white_house', 0.7563837766647339),
 ('impeach', 0.7525460720062256),
 ('commander_chief', 0.7493503093719482),
 ('elect_president', 0.7470866441726685),
 ('transition_team', 0.7453500032424927)]