# Thematic Clustering of Fact-Checked Stories

This notebook clusters the headlines of fact-checking stories in the Tattle archive using an algorithm called GSDMM. The output file is used to generate a visualisation on the Tattle website.

### Process


1. Getting the data from MongoDB 
2. Text cleaning (removing noise, English / non-English headlines separation using regex)
3. Translating non-English headlines 
4. Pre-processing all the headlines (tokenizing, stop word removal, lemmatizing, creating bigrams)
5. Text transformation: creating a corpus of vectors
6. Building the GSDMM model
7. Adding cluster labels to headlines
8. Interactive model visualisation with pyLDAvis
9. Adding article links and total count to output file


## 1. Getting the data

In [None]:
# Importing libraries
import os
import requests
import time
from time import sleep
from random import uniform
import datetime
from datetime import date, timezone
import csv
from pymongo import MongoClient
from dotenv import load_dotenv
load_dotenv()
import os
from os import environ
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
import spacy
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import ldamodel
from gensim.models import CoherenceModel 
import re
from langdetect import detect
from gensim.models.phrases import Phrases, Phraser
from nltk import FreqDist
from nltk.corpus import RegexpTokenizer as regextoken
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import logging
#logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import googletrans
from googletrans import Translator
import pyLDAvis
import pyLDAvis.gensim
from gensim import similarities
import nbconvert
from gsdmm import MovieGroupProcess
import pickle
import json

In [2]:
# Get data from factchecking sites MongoDB
def initialize_mongo():
    mongo_url = "mongodb+srv://"+os.environ.get("FACTCHECK_DB_USERNAME")+":"+os.environ.get("FACTCHECK_DB_PASSWORD")+"@tattle-data-fkpmg.mongodb.net/test?retryWrites=true&w=majority&ssl=true&ssl_cert_reqs=CERT_NONE"   
    cli = MongoClient(mongo_url)
    db = cli[os.environ.get("FACTCHECK_DB_NAME")]
    coll = db[os.environ.get("FACTCHECK_DB_COLLECTION")]
    if coll.count_documents({}) > 0:
        return coll 
    else:
        print("Error accessing Mongo collection")
        sys.exit()
        


In [None]:
coll = initialize_mongo()

In [4]:
coll.count_documents({})

16117

In [7]:
def get_weekly_data(coll):
    pipeline = [
        {"$project":{"date_accessed":"$date_accessed", "date_updated":"$date_updated", 'postID': "$postID",'postURL': "$postURL",
                     "headline": "$headline", "docs": "$docs", "author": "$author", "domain": "$domain",
                     "date": {"$dateFromString": {"dateString": "$date_accessed"}}}},
        {"$match": {"date": {"$gte":datetime.datetime(2020, 8, 24, 0, 0), "$lt": datetime.datetime(2020, 8, 31, 0, 0)}}}
    ]
    
    docs = coll.aggregate(pipeline)
    return docs

In [8]:
c=0
result=[]
docs = get_weekly_data(coll)
for doc in docs:
    result.append(doc)
    c+=1
print(c)

111


In [9]:
df = pd.DataFrame(result)

In [10]:
df.head(3)

Unnamed: 0,_id,author,date,date_accessed,docs,domain,headline,postID,postURL
0,5f2a37185b354603ecd9d960,"{'name': 'Kinjal', 'link': 'https://www.altnew...",2020-08-05,"August 05, 2020",[{'doc_id': 'fef2e7b1481740c5aa88cff4352f161c'...,altnews.in,Video of youth forced to drink urine in Rajast...,f8efe97ebad1429ea7df8d1855243d42,https://www.altnews.in/a-video-from-rajasthan-...
1,5f2a371c5b354603ecd9d961,"{'name': 'Priyanka Jha', 'link': 'https://www....",2020-08-05,"August 05, 2020",[{'doc_id': 'd454064cf1cb4725959408e2553b1767'...,altnews.in,Media outlet Hindustan shares 9-year-old image...,a1268a8179804100a58d3ddccdcd5d7b,https://www.altnews.in/hindustan-newspaper-sha...
2,5f2a37215b354603ecd9d962,"{'name': 'Archit Mehta', 'link': 'https://www....",2020-08-05,"August 05, 2020",[{'doc_id': 'cafea6fd24174b648afe54e36aae0a2a'...,altnews.in,Photo of Hindu deity Ram on New York’s Times S...,ad25cb0038f54a27ac17cf9643fc5ddb,https://www.altnews.in/photo-of-hindu-deity-ra...


In [13]:

clean_df = df.drop_duplicates(subset=["postURL"])

In [14]:
len(clean_df)

111

In [15]:
# Snapshot of headlines
clean_df["headline"][0:3]

0    Morphed: TIME magazine cover featuring Donald ...
1    Derogatory painting of Hindu deity Krishna fro...
2    Sushant Singh Rajput is not dancing with his ‘...
Name: headline, dtype: object

## 2. Text cleaning

In [16]:
# Save headlines in a variable
raw_data = clean_df["headline"].values.tolist()

# Defining a function to remove accented characters in the headlines  
def data_dict(sentences):
    return dict((sentence, ", ".join(simple_preprocess(str(sentence), deacc=True, max_len=100))) for sentence in sentences)

result = data_dict(raw_data)

# Separating non-English headlines using regex
pat = re.compile("[^\x00-\x7F]") # matches non-English characters
non_eng = [k for k,v in result.items() if pat.search(v)]
eng = [k for k,v in result.items() if not pat.search(v)]

## 3. Translating non-English headlines

Googletrans is a free library that sends translation requests to the Google Translate API. 
Random time delays between requests are advised, else Google may (and probably will) block your ip address.


In [17]:
# Translating non-English headlines using googletrans library

translator = Translator()
translations = []
for doc in non_eng:
    translations.append(translator.translate(doc).text)
    time.sleep(uniform(3,5))

In [18]:
# Saving the original and translated headlines for future reference
translated_headlines = dict(zip(non_eng, translations))
translations_df = pd.DataFrame(translated_headlines.items(), columns = ["headline", "translation"])
translations_df["original_english"] = 0
translations_df = translations_df.append(pd.DataFrame(eng, columns=['headline']), ignore_index=True, sort=True)
translations_df["original_english"].fillna(value=1, inplace = True)
translations_df["original_english"] = translations_df["original_english"].astype(int)
translations_df.to_csv("headlines_with_translations.csv")

In [19]:
trans = pd.read_csv("headlines_with_translations.csv")
trans.head(3)

Unnamed: 0.1,Unnamed: 0,headline,original_english,translation
0,0,SDPI और मुस्लिम समुदाय पर शंकराचार्य की मूर्ति...,0,SDPI and wrong alleged to flag up the seer sta...
1,1,फ़ैक्ट-चेक: बुर्का पहनकर पाकिस्तानी झंडा लहराते...,0,Fact-Czech: the burqa wearing Pakistani flag w...
2,2,वीडियो में सुशांत के साथ उनकी कोरियोग्राफ़र डां...,0,The video was their choreographer dance with S...


In [20]:
translations_df = trans[trans["original_english"] == 0]

In [21]:
translations = list(translations_df["translation"])

## 4. Text preprocessing

In [22]:
# Combining the headlines
all_headlines = eng + translations
# Tokenizing the headlines
def sent_to_words(sentences):
    for sentence in sentences:
        yield (simple_preprocess(str(sentence), deacc = True))
        
all_tokens = list(sent_to_words(all_headlines))
# Creating stop words list
stop_words = stopwords.words("english")
# Adding domain words 
stop_words.extend(["fake", "fact", "check", "checked", "factcheck", "news", "false", 
                   "falsely", "true", "truth", "viral", "video", "image", "picture", 
                   "photo", "claim", "claiming", "share", "clip", "misleading","recent", "old",
                  "india", "post", "medium"])
# Stop word removal
data_stopped = [[word for word in doc if word not in stop_words] for doc in all_tokens]
# Creating bigrams
bigram = gensim.models.Phrases(data_stopped, min_count=2)
for idx in range(len(data_stopped)):
    for token in bigram[data_stopped[idx]]:
        if '_' in token:
            # If token is bigram, add it to document
            data_stopped[idx].append(token)
            
data_with_bigrams = data_stopped
# Lemmatizing i.e. reducing words to their root form
# Including only nouns as this improves both topic interpretability and coherence scores
def lemmatization(docs, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"]):
    nlp = spacy.load("en_core_web_sm")
    docs_out = []
    for sent in docs:
        doc = nlp(" ".join(sent))
        docs_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) 
    return docs_out

data_lemmatized = lemmatization(data_with_bigrams, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"])

# Removing any stopwords created because of lemmatization
data_cleaned = [[word for word in doc if word not in stop_words] for doc in data_lemmatized]

## 5. Text transformation: creating a corpus

Topic modelling with the Gensim library involves documents, corpus, vectors and bag of words. These are explained here - https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html

In [23]:
# Creating a dictionary
id2word = corpora.Dictionary(data_cleaned)
# Creating a document-term matrix
print('Number of unique tokens: %d' % len(id2word))
#id2word.filter_extremes(no_below = 20)
# Creating a document-term matrix
corpus = [id2word.doc2bow(doc) for doc in data_cleaned]

Number of unique tokens: 483


In [177]:
data_cleaned[0]

['morph',
 'time',
 'magazine',
 'cover',
 'feature',
 'donald',
 'trump',
 'time',
 'go',
 'headline']

## GSDMM clustering 

In [147]:
mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.1, n_iters=30)

In [148]:
vocab = set(x for doc in data_cleaned for x in doc)
y = mgp.fit(data_cleaned,len(vocab))

In stage 0: transferred 71 clusters with 5 clusters populated
In stage 1: transferred 21 clusters with 5 clusters populated
In stage 2: transferred 19 clusters with 5 clusters populated
In stage 3: transferred 18 clusters with 5 clusters populated
In stage 4: transferred 15 clusters with 5 clusters populated
In stage 5: transferred 23 clusters with 5 clusters populated
In stage 6: transferred 22 clusters with 5 clusters populated
In stage 7: transferred 17 clusters with 5 clusters populated
In stage 8: transferred 16 clusters with 5 clusters populated
In stage 9: transferred 19 clusters with 5 clusters populated
In stage 10: transferred 21 clusters with 5 clusters populated
In stage 11: transferred 10 clusters with 5 clusters populated
In stage 12: transferred 15 clusters with 5 clusters populated
In stage 13: transferred 13 clusters with 5 clusters populated
In stage 14: transferred 13 clusters with 5 clusters populated
In stage 15: transferred 17 clusters with 5 clusters populated
In

In [149]:
def cluster_importance(mgp):
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]        
    for z in range(K):
        for w in n_z_w[z]:
            phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
    return phi
phi = cluster_importance(mgp)

In [150]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-50:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("Cluster {} : {}".format(cluster,sort_dicts))

Number of documents per topic : [18 20 22 27 24]
Most important clusters (by number of docs inside): [3 4 2 1 0]


In [179]:
def top_words(cluster_word_distribution, top_cluster, values):
    freq_dict = {}
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        freq_dict[str(cluster)] = sort_dicts
#        print("Cluster {} : {}".format(cluster,sort_dicts))
    return freq_dict

In [180]:
top_words(mgp.cluster_word_distribution, top_index, 5)


{'3': [('new', 6), ('covid', 5), ('delhi', 5), ('road', 4), ('link', 4)],
 '4': [('milk', 4),
  ('hyderabad', 4),
  ('modi', 4),
  ('pm_modi', 4),
  ('garden', 4)],
 '2': [('khan', 6), ('rss', 5), ('man', 4), ('flag', 4), ('singh', 4)],
 '1': [('flag', 9),
  ('trailer', 5),
  ('national', 5),
  ('day', 4),
  ('mahesh', 3)],
 '0': [('morph', 4),
  ('krishna', 4),
  ('painting', 4),
  ('year', 3),
  ('trump', 3)]}

In [153]:
word_counts = top_words(mgp.cluster_word_distribution, top_index, 5)
type(word_counts)

dict

In [154]:
with open ("wordcounts_wk35.json", "w") as fp:
    json.dump(word_counts, fp)

## Cluster labelling

In [158]:
clean_df["tokens"]=data_cleaned

In [None]:
# Assign cluster label to each article headline
clean_df["cluster"] = y

In [184]:
clean_df.head(5)[["headline", "cluster", "postURL"]]

Unnamed: 0,headline,cluster,postURL
0,Morphed: TIME magazine cover featuring Donald ...,1,https://www.altnews.in/morphed-image-of-time-t...
1,Derogatory painting of Hindu deity Krishna fro...,1,https://www.altnews.in/derogatory-painting-of-...
2,Sushant Singh Rajput is not dancing with his ‘...,3,https://www.altnews.in/sushant-singh-rajput-da...
3,RSS man dressed in burqa caught by the police ...,3,https://www.altnews.in/did-rss-worker-wearing-...
4,Old pornographic visual from Pakistan shared a...,3,https://www.altnews.in/old-pornographic-visual...


In [186]:
clean_df["cluster"].value_counts()

4    27
5    24
3    22
2    20
1    18
Name: cluster, dtype: int64

In [162]:
# Save model
with open("wk35_mgp.model", "wb") as f:
    pickle.dump(mgp, f)
    f.close()

In [211]:
# Create dictionary of clusters and headlines
headlines_dict= {}
c=1
for i in clean_df.groupby(by="cluster"):
    #print(i[1]["postURL"])
    headlines_dict[c] = list(i[1]["postURL"])
    c+=1


In [213]:
headlines_dict[1]

['https://www.altnews.in/morphed-image-of-time-to-go-cover-featuring-donald-trump-shared-by-journalists/',
 'https://www.altnews.in/derogatory-painting-of-hindu-deity-krishna-from-5-years-ago-revived-on-social-media/',
 'https://www.altnews.in/hindi/sushant-singh-rajput-dancing-with-his-choreographer-manpreet-toor-media-misreports-he-is-dancing-with-his-niece-mallika-singh/',
 'https://www.altnews.in/hindi/two-decade-old-hostage-rescue-video-from-venezuela-viral-as-spanish-police-shoot-is-terrorist/',
 'https://www.boomlive.in/fake-news/fact-check-did-putins-daughter-die-after-taking-covid-19-vaccine-9437',
 'https://hindi.boomlive.in/fake-news/no-mughal-gardens-has-not-been-renamed-to-dr-rajendra-prasad-garden-9406',
 'https://bangla.boomlive.in/fake-news/old-video-clip-of-fire-breaking-out-at-delhis-tughlakabad-slum-shared-with-communal-spin-9381',
 'https://factly.in/this-man-was-not-beaten-for-hoisting-blue-colour-bsp-flag-instead-of-national-flag/',
 'https://factly.in/image-of-pm

## Visualise MGP model

In [None]:
vocabulary = list(vocab)
doc_topic_dists = [mgp.score(doc) for doc in data_cleaned]
doc_lengths = [len(doc) for doc in data_cleaned]
term_counts_map = {}
for doc in data_cleaned:
    for term in doc:
        term_counts_map[term] = term_counts_map.get(term, 0) + 1
term_counts = [term_counts_map[term] for term in vocabulary]

matrix = []
for cluster in mgp.cluster_word_distribution:
    total = sum([occurence for word, occurence in cluster.items()])
    row = [cluster.get(term, 0) / total for term in vocabulary]
    matrix.append(row)

vis_data = pyLDAvis.prepare(topic_term_dists=matrix, doc_topic_dists=doc_topic_dists, doc_lengths=doc_lengths, 
                            vocab=vocabulary, R=10, term_frequency=term_counts, sort_topics=False)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)

In [217]:
pyLDAvis.save_json(vis_data, "wk35.json")

## Add headline links and article count to json

In [226]:
with open("wk35.json", "r") as f:
    data = json.loads(f.read())

In [227]:
data["per_cluster_headlines"] = headlines_dict

In [228]:
data["number_of_articles"] = len(clean_df)

In [232]:
# Save file that will be used in the themes dashboard
with open("wk35.json", "w") as f:
    json.dump(data, f)