# Thematic Clustering of Fact-Checked Stories

This notebook clusters the headlines of fact-checking stories in the Tattle archive using GSDMM (Gibbs Sampling for the Dirichlet Multinomial Mixture model), a clustering algorithm designed for short texts. The output file is used to generate a visualisation on the Tattle website.

### Process


1. Getting the data from MongoDB 
2. Text cleaning (removing noise, English / non-English headlines separation using regex)
3. Translating non-English headlines 
4. Pre-processing all the headlines (tokenizing, stop word removal, lemmatizing, creating bigrams)
5. Text transformation: creating a corpus of vectors
6. Building the GSDMM model
7. Adding cluster labels to headlines
8. Interactive model visualisation with pyLDAvis
9. Adding article links and total count to output file


## 1. Getting the data

In [47]:
# Importing libraries
import os
import requests
import time
from time import sleep
from random import uniform
import datetime
from datetime import date, timezone
import csv
from pymongo import MongoClient
from dotenv import load_dotenv
load_dotenv()
import os
from os import environ
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
import spacy
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import ldamodel
from gensim.models import CoherenceModel 
import re
from langdetect import detect
from gensim.models.phrases import Phrases, Phraser
from nltk import FreqDist
from nltk.corpus import RegexpTokenizer as regextoken
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import logging
#logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# import googletrans
# from googletrans import Translator
from google_trans_new import google_translator  
import pyLDAvis
import pyLDAvis.gensim
from pyLDAvis import PreparedData
from gensim import similarities
import nbconvert
from gsdmm import MovieGroupProcess
import pickle
import json

In [3]:
# Get data from factchecking sites MongoDB
def initialize_mongo():
    """Connect to the fact-checking MongoDB and return the stories collection.

    Connection details are read from the environment variables
    FACTCHECK_DB_USERNAME, FACTCHECK_DB_PASSWORD, FACTCHECK_DB_NAME and
    FACTCHECK_DB_COLLECTION.

    Returns:
        The collection object, if it contains at least one document.

    Raises:
        SystemExit: if the collection is empty (exit status 1).
    """
    # Local import: `sys` is not imported in the notebook's import cell, so the
    # original error branch failed with NameError instead of exiting.
    import sys

    # NOTE(review): ssl_cert_reqs=CERT_NONE turns off TLS certificate
    # verification - confirm this is intended.
    mongo_url = "mongodb+srv://"+os.environ.get("FACTCHECK_DB_USERNAME")+":"+os.environ.get("FACTCHECK_DB_PASSWORD")+"@tattle-data-fkpmg.mongodb.net/test?retryWrites=true&w=majority&ssl=true&ssl_cert_reqs=CERT_NONE"
    cli = MongoClient(mongo_url)
    db = cli[os.environ.get("FACTCHECK_DB_NAME")]
    coll = db[os.environ.get("FACTCHECK_DB_COLLECTION")]
    if coll.count_documents({}) > 0:
        return coll

    print("Error accessing Mongo collection")
    # Non-zero status so that callers/scripts can tell the run failed
    sys.exit(1)
        


In [7]:
# Connect once; later cells reuse `coll` for both reading and writing
coll = initialize_mongo()

  options = _handle_option_deprecations(options)


In [8]:
# Total number of documents in the collection (the empty filter matches everything)
coll.count_documents({})

18517

In [9]:
def get_weekly_data(coll, start=datetime.datetime(2020, 8, 17, 0, 0), end=datetime.datetime(2020, 8, 24, 0, 0)):
    """Return the stories whose `date_updated` falls inside [start, end).

    The `date_updated` field is parsed into a date by MongoDB
    (`$dateFromString`) and the parsed value is used for the range filter.

    Args:
        coll: collection object exposing `aggregate(pipeline)`.
        start: inclusive lower bound (datetime). Defaults to 17 Aug 2020,
            the week this notebook was originally run for.
        end: exclusive upper bound (datetime). Defaults to 24 Aug 2020.

    Returns:
        Whatever `coll.aggregate` returns for the pipeline.

    Raises:
        ValueError: if `start` is not earlier than `end`.
    """
    if start >= end:
        raise ValueError("start must be earlier than end")

    pipeline = [
        # Keep only the fields used downstream and add a parsed `date` field
        {"$project":{"date_accessed":"$date_accessed", "date_updated":"$date_updated", 'postID': "$postID",'postURL': "$postURL",
                     "headline": "$headline", "docs": "$docs", "author": "$author", "domain": "$domain",
                     "date": {"$dateFromString": {"dateString": "$date_updated"}}}},
        # Half-open interval: start <= date < end
        {"$match": {"date": {"$gte": start, "$lt": end}}}
    ]

    docs = coll.aggregate(pipeline)
    return docs

In [10]:
# Materialise the aggregation results so they can be counted and reused
docs = get_weekly_data(coll)
result = list(docs)
c = len(result)
print(c)

128


In [11]:
# One row per fetched document; columns come from the document fields
df = pd.DataFrame(result)

In [12]:
# Preview the first three rows
df.head(3)

Unnamed: 0,_id,date_accessed,date_updated,postID,postURL,headline,docs,author,domain,date
0,5f3a80bb04d27dc5e06b56f4,"August 17, 2020","August 17, 2020",bd40b6536cb34193a855e06850292a2e,https://www.altnews.in/hindi/morphed-image-of-...,टाइम के कवर पेज पर ‘Time to go’ के साथ बाहर जा...,[{'doc_id': '531358cafedc4670adfb0bf1e0797cb5'...,"{'name': 'Kinjal', 'link': 'https://www.altnew...",altnews.in/hindi,2020-08-17
1,5f3a80e304d27dc5e06b56fd,"August 17, 2020","August 17, 2020",2c54e3b65f5044eb839de900e2a36196,https://www.boomlive.in/fake-news/no-maharani-...,"No, Maharani Radhikaraje Gaekwad Of Baroda Has...",[{'doc_id': '1fd631ca80f14e9bb169e6fb8b0cd9ff'...,"{'name': 'Anmol Alphonso', 'link': None}",boomlive.in,2020-08-17
2,5f3a80c204d27dc5e06b56f6,"August 17, 2020","August 17, 2020",a1e38e54e12546f6b70c39f7484b9a83,https://www.altnews.in/hindi/congresss-video-o...,मोदी पर चीन से नज़दीकी के ‘आरोप’ लगाते हुए कांग...,[{'doc_id': '1be886b5c07347a783adeb06d62dd716'...,"{'name': 'Pooja Chaudhuri', 'link': 'https://w...",altnews.in/hindi,2020-08-17


In [13]:

# Keep one row per article URL. The explicit copy makes clean_df an independent
# frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning when duplicates were actually dropped.
clean_df = df.drop_duplicates(subset=["postURL"]).copy()

In [14]:
# Number of articles left after de-duplication
len(clean_df)

128

In [15]:
# First three headlines, as stored (mixed languages)
clean_df["headline"].head(3)

0    टाइम के कवर पेज पर ‘Time to go’ के साथ बाहर जा...
1    No, Maharani Radhikaraje Gaekwad Of Baroda Has...
2    मोदी पर चीन से नज़दीकी के ‘आरोप’ लगाते हुए कांग...
Name: headline, dtype: object

## 2. Text cleaning

In [16]:
# Tokenise each headline with accent stripping enabled; the result feeds the
# ASCII-based language check in the next step
def remove_accents(sentence):
    """Return the de-accented tokens of `sentence` joined by ", ".

    `sentence` is converted with `str()` first, then tokenised by gensim's
    `simple_preprocess` with `deacc=True` and `max_len=100`.
    """
    tokens = simple_preprocess(str(sentence), deacc=True, max_len=100)
    return ", ".join(tokens)


clean_df["unaccented"] = clean_df["headline"].map(remove_accents)

In [17]:
# Flagging headlines that contain non-ASCII characters as non-English
def detect_lang(text):
    """Return 1 when `text` contains only ASCII characters, otherwise 0.

    The check is purely character based: any character outside the range
    0x00-0x7F marks the text as non-English (0).
    """
    non_ascii = re.compile("[^\x00-\x7F]")  # matches any non-ASCII character
    return 1 if non_ascii.search(text) is None else 0

In [18]:
# 1 = headline tokens are pure ASCII (treated as English), 0 = non-ASCII characters present
clean_df["is_english"] = clean_df["unaccented"].map(detect_lang)

## 3. Translating non-English headlines

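`google_trans_new` is a free, unofficial library that sends translation requests to Google Translate. Headlines that already have a translation stored in MongoDB are read from there instead of being translated again.
Random time delays between requests are advised, otherwise Google may (and probably will) block your IP address.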


In [50]:
%%time 
# Translating non-English headlines using googletrans library
translator = google_translator()
clean_df["english_headline"] = ""
for i, row in clean_df.iterrows():
    if row["is_english"] == 0:
        if coll.count_documents({"_id": row["_id"], "english_headline": {"$exists": True}}) > 0:
            clean_df.at[i, "english_headline"] = coll.find_one({"_id": row["_id"]})["english_headline"]
        else:
            clean_df.at[i, "english_headline"] = translator.translate(row["headline"], lang_tgt='en')
            time.sleep(uniform(3,5))
    else:
        clean_df.at[i, "english_headline"] = row["headline"]

CPU times: user 31.7 ms, sys: 9.53 ms, total: 41.2 ms
Wall time: 5.06 s


In [None]:
# Store translated headlines in Mongo.
# update_one replaces the deprecated Collection.update (removed in PyMongo 4);
# with a $set document it modifies the single document matching the _id.
for _, row in clean_df.iterrows():
    coll.update_one(
        {"_id": row["_id"]},
        {"$set": {"english_headline": row["english_headline"]}},
    )

In [705]:
# First three translated (non-English) headlines next to their originals
clean_df.loc[clean_df["is_english"] == 0, ["headline", "english_headline", "is_english"]].head(3)

Unnamed: 0,headline,english_headline,is_english
6,मठ से NGO सेंटर लाई जा रही हथिनी का पुराना वीड...,Old video of Hathini being brought from Math t...,0
7,एयर इंडिया क्रैश: मनोरमा न्यूज़ ने ग्राफ़िक को ...,Air India Crash: Manorama News shows the graph...,0
8,फ़ैक्ट-चेक: युवक ने ‘जय श्री राम’ का नारा लगवान...,Fact-check: young man shot 2 'saffron-clad' sl...,0


In [706]:
# First three English headlines (copied through unchanged)
clean_df.loc[clean_df["is_english"] == 1, ["headline", "english_headline", "is_english"]].head(3)

Unnamed: 0,headline,english_headline,is_english
0,Hindustan Times publishes unverified claim abo...,Hindustan Times publishes unverified claim abo...,1
1,Old video of temple elephant taken for rehabil...,Old video of temple elephant taken for rehabil...,1
2,Two decade old hostage rescue video from Venez...,Two decade old hostage rescue video from Venez...,1


In [707]:
# Save the frame (including translations) to disk; index=False leaves out the row index
clean_df.to_csv("clean_df.csv", index=False)

## 4. Text preprocessing

In [710]:
# Number of headlines that go into pre-processing
len(clean_df["english_headline"])

87

In [711]:
# Tokenizing the headlines
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Each item is converted with `str()` and tokenised by gensim's
    `simple_preprocess` with accent removal (`deacc=True`).
    """
    yield from (simple_preprocess(str(text), deacc = True) for text in sentences)
        
# Tokenise every (translated) headline
all_tokens = list(sent_to_words(clean_df["english_headline"]))

# Stop words: NLTK's English list plus words that are generic for fact-check headlines
stop_words = stopwords.words("english")
stop_words += ["fake", "fact", "check", "checked", "factcheck", "news", "false",
               "falsely", "true", "truth", "viral", "video", "image", "picture",
               "photo", "claim", "claiming", "share", "clip", "misleading","recent", "old",
               "india", "post", "medium", "go"]

# Stop word removal
data_stopped = [
    [token for token in headline_tokens if token not in stop_words]
    for headline_tokens in all_tokens
]

# Creating bigrams: detected bigrams (tokens joined by "_") are appended to
# the headline's token list, the single words are kept as well
bigram = gensim.models.Phrases(data_stopped, min_count=2)
for headline_tokens in data_stopped:
    detected = [token for token in bigram[headline_tokens] if '_' in token]
    headline_tokens.extend(detected)

# Same list object as data_stopped, which now also contains the bigram tokens
data_with_bigrams = data_stopped
# Lemmatizing i.e. reducing words to their root form.
# Only tokens tagged as noun, proper noun, verb, adjective or adverb are kept.
def lemmatization(docs, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"]):
    """Lemmatise tokenised documents with spaCy's `en_core_web_sm` model.

    Args:
        docs: iterable of token lists; each list is joined with spaces and
            parsed as one text.
        allowed_postags: coarse part-of-speech tags to keep; tokens with any
            other tag are dropped.

    Returns:
        A list with one list of lemmas per input document.
    """
    nlp = spacy.load("en_core_web_sm")

    def lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_of(sent) for sent in docs]


data_lemmatized = lemmatization(data_with_bigrams, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"])

# Lemmatisation can reduce a word to a form that is on the stop word list, so filter once more
data_cleaned = [
    [lemma for lemma in doc_lemmas if lemma not in stop_words]
    for doc_lemmas in data_lemmatized
]

## 5. Text transformation: creating a corpus

Text modelling with the Gensim library is built on four core concepts: documents, corpora, vectors and the bag-of-words model. They are explained in the Gensim documentation: https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html

In [712]:
# Sanity check: one cleaned token list per headline
len(data_cleaned)

87

In [713]:
# Creating a dictionary that maps each unique token to an integer id
id2word = corpora.Dictionary(data_cleaned)
# Reporting the vocabulary size
print('Number of unique tokens: %d' % len(id2word))
# Disabled: filtering out rare tokens (no_below = 20) before building the corpus
#id2word.filter_extremes(no_below = 20)
# Creating a document-term matrix (bag-of-words vector per headline)
# NOTE(review): `corpus` is not used by the GSDMM cells visible in this notebook - confirm it is still needed
corpus = [id2word.doc2bow(doc) for doc in data_cleaned]

Number of unique tokens: 431


In [714]:
# Inspect the cleaned tokens of the first headline
data_cleaned[0]

['hindustan',
 'times',
 'publish',
 'unverified',
 'rioter',
 'bengaluru',
 'violence',
 'hindustan_time']

## GSDMM clustering 

In [764]:
# Fix NumPy's global seed so that repeated runs start from the same random state
# (the same value is written to the output JSON as "numpy_seed" further below)
np.random.seed(0)
# NOTE(review): K, alpha, beta and n_iters look hand-tuned for this week's data - confirm before reuse
mgp = MovieGroupProcess(K=4, alpha=0.1, beta=0.1, n_iters=50)

In [765]:
# Vocabulary = every distinct token across the cleaned headlines
vocab = {token for tokens in data_cleaned for token in tokens}
# fit() takes the documents and the vocabulary size; its result is used as
# the per-headline cluster label below
y = mgp.fit(data_cleaned, len(vocab))

In stage 0: transferred 50 clusters with 4 clusters populated
In stage 1: transferred 14 clusters with 4 clusters populated
In stage 2: transferred 18 clusters with 4 clusters populated
In stage 3: transferred 14 clusters with 4 clusters populated
In stage 4: transferred 16 clusters with 4 clusters populated
In stage 5: transferred 13 clusters with 4 clusters populated
In stage 6: transferred 18 clusters with 4 clusters populated
In stage 7: transferred 16 clusters with 4 clusters populated
In stage 8: transferred 15 clusters with 4 clusters populated
In stage 9: transferred 13 clusters with 4 clusters populated
In stage 10: transferred 12 clusters with 4 clusters populated
In stage 11: transferred 14 clusters with 4 clusters populated
In stage 12: transferred 16 clusters with 4 clusters populated
In stage 13: transferred 14 clusters with 4 clusters populated
In stage 14: transferred 9 clusters with 4 clusters populated
In stage 15: transferred 14 clusters with 4 clusters populated
In 

## Cluster labelling

In [772]:
# Keep the cleaned tokens next to each headline; data_cleaned was built from
# clean_df["english_headline"] in row order, so positions line up
clean_df["tokens"]=data_cleaned

In [773]:
# Assign cluster label to each article headline (y is the result of mgp.fit on data_cleaned)
clean_df["cluster"] = y

In [774]:
# First five headlines with their cluster label and source URL
clean_df[["english_headline", "cluster", "postURL"]].head(5)

Unnamed: 0,english_headline,cluster,postURL
0,Hindustan Times publishes unverified claim abo...,0,https://www.altnews.in/hindustan-times-publish...
1,Old video of temple elephant taken for rehabil...,3,https://www.altnews.in/old-video-of-an-elephan...
2,Two decade old hostage rescue video from Venez...,1,https://www.altnews.in/two-decade-old-hostage-...
3,"SDPI, Muslim community falsely blamed for plac...",2,https://www.altnews.in/sdpi-muslim-community-f...
4,Congress’s video on PM Modi’s relationship wit...,3,https://www.altnews.in/congresss-video-on-pm-m...


In [775]:
# Number of headlines assigned to each cluster
clean_df["cluster"].value_counts()

1    29
3    22
2    19
0    17
Name: cluster, dtype: int64

In [None]:
# Create dictionary of clusters and headlines.
# Keys are the cluster label + 1 (1, 2, ... for labels 0, 1, ...), the same
# numbering the original running counter produced when every cluster is
# populated. Deriving the key from the label keeps it tied to the cluster even
# if a cluster ends up with no headlines. int() turns the pandas group key
# into a plain Python int so the dictionary can be serialised with json.dump.
# The loop variable is named cluster_df so that the raw `df` frame from the
# data-loading section is no longer overwritten.
headlines_dict = {}
for cluster_label, cluster_df in clean_df.groupby(by="cluster"):
    headlines_dict[int(cluster_label) + 1] = [
        {"url": url, "headline": headline}
        for url, headline in zip(cluster_df["postURL"], cluster_df["headline"])
    ]


## Visualise MGP model

In [793]:
# Inputs for pyLDAvis, all ordered consistently with `vocabulary` / `data_cleaned`
vocabulary = list(vocab)
doc_topic_dists = [mgp.score(doc) for doc in data_cleaned]
doc_lengths = [len(doc) for doc in data_cleaned]

# Corpus-wide frequency of every term
term_counts_map = {}
for doc in data_cleaned:
    for term in doc:
        term_counts_map[term] = term_counts_map.get(term, 0) + 1
term_counts = [term_counts_map[term] for term in vocabulary]

# Cluster-term matrix: one row per cluster, holding each term's share of the
# words assigned to that cluster
matrix = []
for cluster in mgp.cluster_word_distribution:
    total = sum(cluster.values())
    if total > 0:
        row = [cluster.get(term, 0) / total for term in vocabulary]
    else:
        # A cluster without any words previously caused ZeroDivisionError;
        # use a uniform distribution so the row is still a valid distribution
        row = [1 / len(vocabulary) for _ in vocabulary]
    matrix.append(row)

# sort_topics=False keeps the clusters in the model's own order
vis_data = pyLDAvis.prepare(topic_term_dists=matrix, doc_topic_dists=doc_topic_dists, doc_lengths=doc_lengths,
                            vocab=vocabulary, R=10, term_frequency=term_counts, sort_topics=False)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)

In [794]:
# Export the prepared visualisation data as JSON; the file is re-opened below to add more fields
pyLDAvis.save_json(vis_data, "wk34.json")

In [795]:
# Export the same visualisation as an HTML file
pyLDAvis.save_html(vis_data, "wk34.html")

## Add headline links and article count to json

In [796]:
# Reload the exported visualisation data so that extra fields can be added to it
with open("wk34.json", "r") as f:
    data = json.load(f)

In [797]:
# Attach the headlines/URLs of each cluster
data["per_cluster_headlines"] = headlines_dict
# NOTE(review): hard-coded; must be kept in sync with the np.random.seed(...) call in the clustering section
data["numpy_seed"] = 0

In [798]:
# Total number of articles that were clustered
data["number_of_articles"] = len(clean_df)

In [799]:
# Save file that will be used in the themes dashboard
# (overwrites the pyLDAvis export with the extended dictionary)
with open("wk34.json", "w") as f:
    json.dump(data, f)

## Recreate viz from json

In [787]:
def prepared_data_from_dict(vis_data):
    """Rebuild a pyLDAvis `PreparedData` object from its dictionary form.

    Args:
        vis_data: dictionary holding the keys 'mdsDat', 'tinfo',
            'token.table', 'R', 'lambda.step', 'plot.opts' and 'topic.order'
            (e.g. a previously saved JSON file loaded with `json.load`).

    Returns:
        A `PreparedData` instance; the three table entries are converted to
        DataFrames, the remaining entries are passed through unchanged.
    """
    table_keys = ('mdsDat', 'tinfo', 'token.table')
    passthrough_keys = ('R', 'lambda.step', 'plot.opts', 'topic.order')

    tables = [pd.DataFrame.from_dict(vis_data[key]) for key in table_keys]
    settings = [vis_data[key] for key in passthrough_keys]

    # Positional order: topic coordinates, topic info, token table, R,
    # lambda step, plot options, topic order
    return PreparedData(*tables, *settings)

In [802]:
# Recreate and display the visualisation saved for week 37
# NOTE(review): wk37.json is not produced by this notebook - it must already exist on disk
with open('wk37.json', 'r') as json_file:
    dict_data = json.load(json_file)
viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)

In [803]:
# Recreate and display the visualisation saved for week 36
# NOTE(review): wk36.json is not produced by this notebook - it must already exist on disk
with open('wk36.json', 'r') as json_file:
    dict_data = json.load(json_file)
viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)

In [804]:
# Recreate and display the visualisation saved for week 35
# NOTE(review): wk35.json is not produced by this notebook - it must already exist on disk
with open('wk35.json', 'r') as json_file:
    dict_data = json.load(json_file)
viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)

In [805]:
# Recreate and display the visualisation for week 34 from the file written above
with open('wk34.json', 'r') as json_file:
    dict_data = json.load(json_file)
viz_data = prepared_data_from_dict(dict_data)
pyLDAvis.display(viz_data)