# Thematic Clustering of Fact-Checked Stories

This notebook clusters the headlines of fact-checking stories in the Tattle archive using an algorithm called GSDMM. The output file is used to generate a visualisation on the Tattle website.

### Process


1. Getting the data from MongoDB 
2. Text cleaning (removing noise, English / non-English headlines separation using regex)
3. Translating non-English headlines 
4. Pre-processing all the headlines (tokenizing, stop word removal, lemmatizing, creating bigrams)
5. Text transformation: creating a corpus of vectors
6. Building the GSDMM model
7. Adding cluster labels to headlines
8. Interactive model visualisation with pyLDAvis
9. Adding article links and total count to output file


## 1. Getting the data

In [20]:
# Importing libraries
import os
import requests
import time
from time import sleep
from random import uniform
import datetime
from datetime import date, timezone
import csv
from pymongo import MongoClient
from dotenv import load_dotenv
load_dotenv()
import os
from os import environ
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
import spacy
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import ldamodel
from gensim.models import CoherenceModel 
import re
from langdetect import detect
from gensim.models.phrases import Phrases, Phraser
from nltk import FreqDist
from nltk.corpus import RegexpTokenizer as regextoken
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import logging
#logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# import googletrans
# from googletrans import Translator
from google_trans_new import google_translator  
import pyLDAvis
import pyLDAvis.gensim
from pyLDAvis import PreparedData
from gensim import similarities
import nbconvert
from gsdmm import MovieGroupProcess
import pickle
import json

In [28]:
# Get data from factchecking sites MongoDB
def initialize_mongo():
    #mongo_url = "mongodb+srv://"+os.environ.get("FACTCHECK_DEV_DB_USERNAME")+":"+os.environ.get("FACTCHECK_DEV_DB_PASSWORD")+"@tattle-data-fkpmg.mongodb.net/test?retryWrites=true&w=majority&ssl=true&tlsallowinvalidcertificates=false"   
    cli = MongoClient(mongo_url)
    #db = cli[os.environ.get("FACTCHECK_DEV_DB_NAME")]
    #coll = db[os.environ.get("FACTCHECK_DEV_DB_COLLECTION")]
    db = cli["factcheck_sites_dev"]
    coll = db["stories"]
    if coll.count_documents({}) > 0:
        return coll 
    else:
        print("Error accessing Mongo collection")
        sys.exit()
        


In [29]:
coll = initialize_mongo()

  options = _handle_option_deprecations(options)


In [30]:
coll.count_documents({})

14999

In [33]:
def get_weekly_data(coll):
    pipeline = [
        {"$project":{"date_accessed":"$date_accessed", "date_updated":"$date_updated", 'postID': "$postID",'postURL': "$postURL",
                     "headline": "$headline", "docs": "$docs", "author": "$author", "domain": "$domain",
                     "date": {"$dateFromString": {"dateString": "$date_updated"}}}},
        {"$match": {"date": {"$gte":datetime.datetime(2020, 8, 30, 0, 0), "$lt": datetime.datetime(2020, 9, 30, 0, 0)}}}
        #{"$match": {"date": {"$gte":datetime.datetime(2019, 1, 1, 0, 0), "$lt": datetime.datetime(2020, 1, 1, 0, 0)}}}
    ]
    
    docs = coll.aggregate(pipeline)
    return docs

In [34]:
c=0
result=[]
docs = get_weekly_data(coll)
for doc in docs:
    result.append(doc)
    c+=1
print(c)

153


In [35]:
df = pd.DataFrame(result)

In [9]:
df.to_csv('FCDBJulyAugust2019.csv')

In [36]:
df.head(3)

Unnamed: 0,_id,date_accessed,date_updated,postID,postURL,headline,docs,author,domain,date
0,5f89ae9c1c9a4d25843f93c3,"October 16, 2020","September 21, 2020",3a9fe1e718b4470d9d9f7b23262b2aec,https://www.altnews.in/italy-defeated-covid-19...,Italy defeated COVID-19? Misinformation-riddle...,[{'doc_id': 'c542a9546530401f9c63e87859e1b442'...,"{'name': 'Dr Sharfaroz Satani', 'link': 'https...",altnews.in,2020-09-21
1,5f89ae9e1c9a4d25843f93c5,"October 16, 2020","September 27, 2020",9efdb44103144107b44d2e9bba1c8af8,https://www.altnews.in/ayush-kwath-or-kadha-ca...,AYUSH Kwath or Kadha cannot ‘boost’ immunity t...,[{'doc_id': 'cc9ec77a0c704cf783ed6c009e425119'...,"{'name': 'Dr. Sumaiya Shaikh', 'link': 'https:...",altnews.in,2020-09-27
2,602ac324e654233d4c456935,"February 15, 2021","September 18, 2020",a4872a5476c84b34983e26d2aff92f4d,https://bangla.boomlive.in/fact-check/fake-new...,"আজানের সময় মন্দিরে মাইক নয়, ভুয়ো পোস্ট জানা...",[{'doc_id': '53b66ce6b64141e7a63e4963a89ab8aa'...,"{'name': 'Suhash Bhattacharjee', 'link': '/aut...",bangla.boomlive.in,2020-09-18


In [37]:

clean_df = df.drop_duplicates(subset=["postURL"])

In [38]:
len(clean_df)

151

In [39]:
# Snapshot of headlines
clean_df["headline"][0:3]

0    Italy defeated COVID-19? Misinformation-riddle...
1    AYUSH Kwath or Kadha cannot ‘boost’ immunity t...
2    আজানের সময় মন্দিরে মাইক নয়, ভুয়ো পোস্ট জানা...
Name: headline, dtype: object

## 2. Text cleaning

In [40]:
# Defining a function to remove accented characters in the headlines  
def remove_accents(sentence):
    return ", ".join(simple_preprocess(str(sentence), deacc=True, max_len=100))
clean_df["unaccented"] = clean_df["headline"].map(remove_accents)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [41]:
# Detecting non-English headlines using regex
def detect_lang(text):
    pat = re.compile("[^\x00-\x7F]") # matches non-English characters
    if pat.search(text):
        return 0
    else:
        return 1

In [42]:
clean_df["is_english"] = clean_df["unaccented"].map(detect_lang)

## 3. Translating non-English headlines

Googletrans is a free library that sends translation requests to the Google Translate API. 
Random time delays between requests are advised, else Google may (and probably will) block your ip address.


In [43]:
%%time 
# Translating non-English headlines using googletrans library
translator = google_translator()
clean_df["english_headline"] = ""
for i, row in clean_df.iterrows():
    if row["is_english"] == 0:
        if coll.count_documents({"_id": row["_id"], "english_headline": {"$exists": True}}) > 0:
            clean_df.at[i, "english_headline"] = coll.find_one({"_id": row["_id"]})["english_headline"]
        else:
            clean_df.at[i, "english_headline"] = translator.translate(row["headline"], lang_tgt='en')
            time.sleep(uniform(3,5))
    else:
        clean_df.at[i, "english_headline"] = row["headline"]

CPU times: user 3.42 s, sys: 1.09 s, total: 4.51 s
Wall time: 6min 50s


In [18]:
# Store translated headlines in Mongo
for i, row in clean_df.iterrows():
    coll.update(
    {"_id": row["_id"]},
    {"$set": {"english_headline": row["english_headline"]}})

  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(


  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(


  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(
  coll.update(


In [44]:
clean_df[clean_df["is_english"]==0][["headline", "english_headline","is_english"]][:3]

Unnamed: 0,headline,english_headline,is_english
2,"আজানের সময় মন্দিরে মাইক নয়, ভুয়ো পোস্ট জানা...","During Azan, Mike is not Mike in the temple, M...",0
3,"'গালওয়ানের প্রমাণ' হিসেবে আজতক, টাইমস নাও দেখা...","Azatak, Times, Times, as a proof of 'Galwan' s...",0
6,সম্পর্কহীন ছবি সহ ম্যাডোনার হিন্দু ধর্ম সম্পর্...,Viral comments about Madonon Hindu religion wi...,0


In [382]:
clean_df[clean_df["is_english"]==1][["headline", "english_headline","is_english"]][:3]

Unnamed: 0,headline,english_headline,is_english
0,Doctored Video Falsely Shared To Claim Hindu M...,Doctored Video Falsely Shared To Claim Hindu M...,1
1,Did UNSC Remove Taliban From Terror List Under...,Did UNSC Remove Taliban From Terror List Under...,1
2,Old Video Of Afghans Crossing Pak Border Share...,Old Video Of Afghans Crossing Pak Border Share...,1


In [383]:
clean_df.to_csv("clean_df.csv", index=False)

## 4. Text preprocessing

In [384]:
len(clean_df["english_headline"])

141

In [385]:
# Tokenizing the headlines
def sent_to_words(sentences):
    for sentence in sentences:
        yield (simple_preprocess(str(sentence), deacc = True))
        
all_tokens = list(sent_to_words(clean_df["english_headline"]))
# Creating stop words list
stop_words = stopwords.words("english")
# Adding domain words 
stop_words.extend(["fake", "fact", "check", "checked", "factcheck", "news", "false", 
                   "falsely", "true", "truth", "viral", "video", "image", "picture", 
                   "photo", "claim", "claiming", "share", "clip", "misleading","recent", "old",
                  "india", "post", "medium", "go"])
# Stop word removal
data_stopped = [[word for word in doc if word not in stop_words] for doc in all_tokens]
# Creating bigrams
bigram = gensim.models.Phrases(data_stopped, min_count=2)
for idx in range(len(data_stopped)):
    for token in bigram[data_stopped[idx]]:
        if '_' in token:
            # If token is bigram, add it to document
            data_stopped[idx].append(token)
            
data_with_bigrams = data_stopped
# Lemmatizing i.e. reducing words to their root form
# Including only nouns as this improves both topic interpretability and coherence scores
def lemmatization(docs, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"]):
    nlp = spacy.load("en_core_web_sm")
    docs_out = []
    for sent in docs:
        doc = nlp(" ".join(sent))
        docs_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) 
    return docs_out

data_lemmatized = lemmatization(data_with_bigrams, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"])

# Removing any stopwords created because of lemmatization
data_cleaned = [[word for word in doc if word not in stop_words] for doc in data_lemmatized]

## 5. Text transformation: creating a corpus

Topic modelling with the Gensim library involves documents, corpus, vectors and bag of words. These are explained here - https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html

In [386]:
len(data_cleaned)

141

In [387]:
# Creating a dictionary
id2word = corpora.Dictionary(data_cleaned)
# Creating a document-term matrix
print('Number of unique tokens: %d' % len(id2word))
#id2word.filter_extremes(no_below = 20)
# Creating a document-term matrix
corpus = [id2word.doc2bow(doc) for doc in data_cleaned]

Number of unique tokens: 552


In [388]:
data_cleaned[0]

['doctored', 'hindu', 'man', 'lynch', 'delhi']

## GSDMM clustering 

In [422]:
np.random.seed(9)   #41,5
mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.1, n_iters=50)

In [423]:
vocab = set(x for doc in data_cleaned for x in doc)
y = mgp.fit(data_cleaned,len(vocab))

In stage 0: transferred 85 clusters with 5 clusters populated
In stage 1: transferred 56 clusters with 5 clusters populated
In stage 2: transferred 41 clusters with 5 clusters populated
In stage 3: transferred 39 clusters with 5 clusters populated
In stage 4: transferred 42 clusters with 5 clusters populated
In stage 5: transferred 43 clusters with 5 clusters populated
In stage 6: transferred 42 clusters with 5 clusters populated
In stage 7: transferred 43 clusters with 5 clusters populated
In stage 8: transferred 37 clusters with 5 clusters populated
In stage 9: transferred 34 clusters with 5 clusters populated
In stage 10: transferred 34 clusters with 5 clusters populated
In stage 11: transferred 34 clusters with 5 clusters populated
In stage 12: transferred 34 clusters with 5 clusters populated
In stage 13: transferred 38 clusters with 5 clusters populated
In stage 14: transferred 35 clusters with 5 clusters populated
In stage 15: transferred 29 clusters with 5 clusters populated
In

## Cluster labelling

In [424]:
clean_df["tokens"]=data_cleaned

In [425]:
# Assign cluster label to each article headline
clean_df["cluster"] = y

In [426]:
clean_df.head(5)[["english_headline", "cluster", "postURL"]]

Unnamed: 0,english_headline,cluster,postURL
0,Doctored Video Falsely Shared To Claim Hindu M...,4,https://www.boomlive.in/fact-check/fact-check/...
1,Did UNSC Remove Taliban From Terror List Under...,4,https://www.boomlive.in/fact-check/fact-check/...
2,Old Video Of Afghans Crossing Pak Border Share...,0,https://www.boomlive.in/fact-check/fact-check/...
3,"No, MP Government Did Not Raze A Slum Over 'Pa...",0,https://www.boomlive.in/fact-check/fact-check/...
4,Shocking Assault Video From Chhattisgarh False...,2,https://www.boomlive.in/fact-check/fast-check/...


In [427]:
clean_df["cluster"].value_counts()

4    37
2    37
3    26
0    23
1    18
Name: cluster, dtype: int64

In [428]:
# Create dictionary of clusters and headlines
headlines_dict= {}
c=1
for i in clean_df.groupby(by="cluster"):
    print(i[0])
    pairs_list = []
    df = i[1][["headline", "postURL", "tokens"]]
#     print(df["tokens"])
#     print("...........")
    for idx, row in df.iterrows():
        print(row["postURL"])
        pairs = {}
        pairs["url"] = row["postURL"]
        pairs["headline"] = row["headline"]
        pairs_list.append(pairs)
    headlines_dict[c] = pairs_list
    c+=1


0
https://www.boomlive.in/fact-check/fact-check/old-video-afghan-corona-pandemic-afghanistan-pakistan-border-viral-false-claim-14522
https://www.boomlive.in/fact-check/fact-check/no-mp-government-did-not-raze-a-slum-over-pakistan-zindabad-slogans-14503
https://www.boomlive.in/fact-check/fact-check/fake-news-viral-video-mock-drill-bank-robbery-ahmednagar-police-maharashtra-factcheck-14543
https://www.boomlive.in/fact-check/fact-check/fir-against-coca-cola-for-penis-shaped-bottles-satire-post-viral-14493
https://hindi.boomlive.in/fact-check/fact-check/top-five-fake-news-from-last-week-viral-video-14569
https://hindi.boomlive.in/fact-check/international/usa-president-joe-biden-sleeping-israeli-prime-minister-naftali-bennett-meeting-fake-news-14516
https://bangla.boomlive.in/fact-check/fact-check/old-video-afghan-corona-pandemic-afghanistan-pakistan-border-viral-false-claim-14572
https://bangla.boomlive.in/fact-check/fact-check/fact-check-justice-dalveer-bhandari-india-icj-chief-justice-uk

## Visualise MGP model

In [429]:
vocabulary = list(vocab)
doc_topic_dists = [mgp.score(doc) for doc in data_cleaned]
doc_lengths = [len(doc) for doc in data_cleaned]
term_counts_map = {}
for doc in data_cleaned:
    for term in doc:
        term_counts_map[term] = term_counts_map.get(term, 0) + 1
term_counts = [term_counts_map[term] for term in vocabulary]

matrix = []
for cluster in mgp.cluster_word_distribution:
    total = sum([occurence for word, occurence in cluster.items()])
    row = [cluster.get(term, 0) / total for term in vocabulary]
    matrix.append(row)

vis_data = pyLDAvis.prepare(topic_term_dists=matrix, doc_topic_dists=doc_topic_dists, doc_lengths=doc_lengths, 
                            vocab=vocabulary, R=10, term_frequency=term_counts, sort_topics=False)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)

  default_term_info  = pd.DataFrame({'saliency': saliency, 'Term': vocab, \


In [430]:
pyLDAvis.save_json(vis_data, "2021_wk35.json")

In [431]:
pyLDAvis.save_html(vis_data, "2021_wk35.html")

## Add headline links and article count to json

In [432]:
with open("2021_wk35.json", "r") as f:
    data = json.loads(f.read())

In [433]:
data["per_cluster_headlines"] = headlines_dict
data["numpy_seed"] = 0

In [434]:
data["number_of_articles"] = len(clean_df)

In [435]:
# Save file that will be used in the themes dashboard
with open("2021_wk35.json", "w") as f:
    json.dump(data, f)

## Recreate viz from json

In [787]:
def prepared_data_from_dict(vis_data):
    topic_coordinates = pd.DataFrame.from_dict(vis_data['mdsDat'])
    topic_info = pd.DataFrame.from_dict(vis_data['tinfo'])
    token_table = pd.DataFrame.from_dict(vis_data['token.table'])
    R = vis_data['R']
    lambda_step = vis_data['lambda.step']
    plot_opts = vis_data['plot.opts']
    client_topic_order = vis_data['topic.order']

    return PreparedData(topic_coordinates, topic_info,
                        token_table, R, lambda_step, plot_opts, client_topic_order)

In [802]:
with open('wk37.json', 'r') as json_file:
    dict_data = json.load(json_file)
    viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)

In [803]:
with open('wk36.json', 'r') as json_file:
    dict_data = json.load(json_file)
    viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)

In [804]:
with open('wk35.json', 'r') as json_file:
    dict_data = json.load(json_file)
    viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)

In [805]:
with open('wk34.json', 'r') as json_file:
    dict_data = json.load(json_file)
    viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)