# Thematic Clustering of Fact-Checked Stories

This notebook clusters the headlines of fact-checking stories in the Tattle archive using an algorithm called GSDMM. The output file is used to generate a visualisation on the Tattle website.

### Process


1. Getting the data from MongoDB 
2. Text cleaning (removing noise, English / non-English headlines separation using regex)
3. Translating non-English headlines 
4. Pre-processing all the headlines (tokenizing, stop word removal, lemmatizing, creating bigrams)
5. Text transformation: creating a corpus of vectors
6. Building the GSDMM model
7. Adding cluster labels to headlines
8. Interactive model visualisation with pyLDAvis
9. Adding article links and total count to output file


## 1. Getting the data

In [12]:
# Importing libraries
import os
import requests
import time
from time import sleep
from random import uniform
import datetime
from datetime import date, timezone
import csv
from pymongo import MongoClient
from dotenv import load_dotenv
load_dotenv()
import os
from os import environ
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
import spacy
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import ldamodel
from gensim.models import CoherenceModel 
import re
from langdetect import detect
from gensim.models.phrases import Phrases, Phraser
from nltk import FreqDist
from nltk.corpus import RegexpTokenizer as regextoken
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import logging
#logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import googletrans
from googletrans import Translator
import pyLDAvis
import pyLDAvis.gensim
from gensim import similarities
import nbconvert
from gsdmm import MovieGroupProcess
import pickle
import json

In [13]:
# Get data from factchecking sites MongoDB
def initialize_mongo():
    """Connect to the fact-check MongoDB and return the configured collection.

    Connection details are read from the environment variables
    FACTCHECK_DB_USERNAME, FACTCHECK_DB_PASSWORD, FACTCHECK_DB_NAME and
    FACTCHECK_DB_COLLECTION.

    Returns:
        The collection object, if it contains at least one document.

    Raises:
        RuntimeError: if any of the required environment variables is unset
            or empty.
        SystemExit: if the collection is empty (after printing an error).
    """
    # Local imports: neither name is imported at the top of the notebook.
    import sys
    from urllib.parse import quote_plus

    required = (
        "FACTCHECK_DB_USERNAME",
        "FACTCHECK_DB_PASSWORD",
        "FACTCHECK_DB_NAME",
        "FACTCHECK_DB_COLLECTION",
    )
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        # Fail with a readable message instead of a TypeError from str + None.
        raise RuntimeError("Missing environment variables: " + ", ".join(missing))

    # Credentials are percent-escaped so characters such as '@', ':' or '/'
    # cannot corrupt the connection string.
    # NOTE(review): assumes the stored credentials are NOT already escaped.
    username = quote_plus(os.environ["FACTCHECK_DB_USERNAME"])
    password = quote_plus(os.environ["FACTCHECK_DB_PASSWORD"])
    # NOTE(review): ssl_cert_reqs=CERT_NONE turns off certificate verification;
    # confirm this is intentional.
    mongo_url = "mongodb+srv://"+username+":"+password+"@tattle-data-fkpmg.mongodb.net/test?retryWrites=true&w=majority&ssl=true&ssl_cert_reqs=CERT_NONE"
    cli = MongoClient(mongo_url)
    db = cli[os.environ["FACTCHECK_DB_NAME"]]
    coll = db[os.environ["FACTCHECK_DB_COLLECTION"]]
    if coll.count_documents({}) > 0:
        return coll
    print("Error accessing Mongo collection")
    sys.exit()
        


In [None]:
coll = initialize_mongo()

In [16]:
coll.count_documents({})

16361

In [149]:
def get_weekly_data(coll, start=datetime.datetime(2020, 8, 31, 0, 0), end=datetime.datetime(2020, 9, 7, 0, 0)):
    """Fetch the stories whose `date_accessed` falls inside [start, end).

    The aggregation first parses the `date_accessed` string of every document
    into a `date` field, then keeps the documents whose parsed date lies in
    the half-open window.

    Args:
        coll: object exposing `aggregate(pipeline)` (the MongoDB collection).
        start: inclusive lower bound of the window. Defaults to 31 Aug 2020.
        end: exclusive upper bound of the window. Defaults to 7 Sep 2020.

    Returns:
        Whatever `coll.aggregate` returns (an iterable of documents).

    Raises:
        ValueError: if `start` is not earlier than `end`.
    """
    if start >= end:
        raise ValueError("start must be earlier than end")

    pipeline = [
        {"$project":{"date_accessed":"$date_accessed", "date_updated":"$date_updated", 'postID': "$postID",'postURL': "$postURL",
                     "headline": "$headline", "docs": "$docs", "author": "$author", "domain": "$domain",
                     "date": {"$dateFromString": {"dateString": "$date_accessed"}}}},
        {"$match": {"date": {"$gte": start, "$lt": end}}}
    ]

    return coll.aggregate(pipeline)

In [151]:
# Pull the week's documents into a list; `c` is the number of stories fetched
docs = get_weekly_data(coll)
result = list(docs)
c = len(result)
print(c)

154


In [152]:
df = pd.DataFrame(result)

In [153]:
df.head(3)

Unnamed: 0,_id,author,date,date_accessed,date_updated,docs,domain,headline,postID,postURL
0,5f4c9e15efd3af927ab39ab7,"{'name': 'Archit Mehta', 'link': 'https://www....",2020-08-31,"August 31, 2020","August 25, 2020",[{'doc_id': '948bafd1d6804121ba58f314c61d03c2'...,altnews.in,Video of personal dispute in Hyderabad shared ...,6a7a0b18a49c4ebf9c12a6d89aed5629,https://www.altnews.in/video-of-personal-dispu...
1,5f4c9e1aefd3af927ab39ab8,"{'name': 'Kinjal', 'link': 'https://www.altnew...",2020-08-31,"August 31, 2020","August 25, 2020",[{'doc_id': '8b249cbbcdb440de915ba60a4939a0e1'...,altnews.in,"No, this is not Facebook’s Ankhi Das cutting t...",b0fc3d997ce643d69a7720ea23767931,https://www.altnews.in/fact-check-image-of-amb...
2,5f4c9e1eefd3af927ab39ab9,"{'name': 'Priyanka Jha', 'link': 'https://www....",2020-08-31,"August 31, 2020","August 26, 2020",[{'doc_id': '9dc9737543e6426aa1f42c07787535ef'...,altnews.in,Video from West Bengal passed off as communal ...,e556d8fe9192452d858965581230e7eb,https://www.altnews.in/video-of-angry-mob-from...


In [169]:

clean_df = df.drop_duplicates(subset=["postURL"])

In [170]:
len(clean_df)

154

In [156]:
# Snapshot of headlines
clean_df["headline"][0:3]

0    Video of personal dispute in Hyderabad shared ...
1    No, this is not Facebook’s Ankhi Das cutting t...
2    Video from West Bengal passed off as communal ...
Name: headline, dtype: object

## 2. Text cleaning

In [157]:
# Save headlines in a variable
raw_data = clean_df["headline"].values.tolist()

# Defining a function that maps each headline to its accent-stripped tokens
def data_dict(sentences):
    """Return {headline: "tok1, tok2, ..."} for every headline in `sentences`.

    Each headline is tokenised with gensim's `simple_preprocess`
    (accents removed, tokens of up to 100 characters kept) and the tokens
    are joined with ", ". Identical headlines share a single key.
    """
    cleaned = {}
    for sentence in sentences:
        tokens = simple_preprocess(str(sentence), deacc=True, max_len=100)
        cleaned[sentence] = ", ".join(tokens)
    return cleaned

result = data_dict(raw_data)

# Separating non-English headlines using regex
pat = re.compile("[^\x00-\x7F]") # matches any non-ASCII character
# A headline counts as non-English when its normalised token string still
# contains a non-ASCII character; one pass fills both lists in original order.
non_eng, eng = [], []
for headline, normalised in result.items():
    if pat.search(normalised):
        non_eng.append(headline)
    else:
        eng.append(headline)

In [172]:
len(eng)

100

In [173]:
len(non_eng)

54

## 3. Translating non-English headlines

Googletrans is a free library that sends translation requests to the Google Translate API. 
Random time delays between requests are advised; otherwise Google may (and probably will) block your IP address.


In [215]:
# Translating non-English headlines using googletrans library

translator = Translator()
translations = []
for headline in non_eng:
    translated = translator.translate(headline)
    translations.append(translated.text)
    # Random 3-5 second pause after every request to avoid being rate-limited
    pause_seconds = uniform(3,5)
    time.sleep(pause_seconds)

In [240]:
len(translations)

54

In [217]:
# Saving the original and translated headlines for future reference
translated_headlines = dict(zip(non_eng, translations))

# Rows for translated headlines: original text, translation, flag 0
translated_part = pd.DataFrame(list(translated_headlines.items()), columns=["headline", "translation"])
translated_part["original_english"] = 0

# Rows for headlines that were already English: no translation, flag 1
english_part = pd.DataFrame(eng, columns=["headline"])
english_part["original_english"] = 1

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent (sort=True keeps the alphabetical column order as before).
translations_df = pd.concat([translated_part, english_part], ignore_index=True, sort=True)
translations_df["original_english"] = translations_df["original_english"].astype(int)
translations_df.to_csv("headlines_with_translations.csv")

In [194]:
# translations_df = pd.read_csv("working-files/headlines_with_translations.csv")
# translations_df.head(3)

In [220]:
#translations = list(translations_df[translations_df["original_english"] == 0]["translation"])

## 4. Text preprocessing

In [241]:
len(eng+translations)

154

In [242]:
# Combining the headlines
all_headlines = eng + translations
# Tokenizing the headlines
def sent_to_words(sentences):
    """Lazily yield one list of lowercase, accent-stripped tokens per sentence."""
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)
        
all_tokens = list(sent_to_words(all_headlines))
# Creating stop words list
stop_words = stopwords.words("english")
# Adding domain words 
stop_words.extend(["fake", "fact", "check", "checked", "factcheck", "news", "false", 
                   "falsely", "true", "truth", "viral", "video", "image", "picture", 
                   "photo", "claim", "claiming", "share", "clip", "misleading","recent", "old",
                  "india", "post", "medium"])
# Stop word removal (a frozenset gives constant-time lookups; `stop_words`
# itself stays a list for the cells that use it later)
stop_word_set = frozenset(stop_words)
data_stopped = [[word for word in doc if word not in stop_word_set] for doc in all_tokens]
# Creating bigrams
bigram = gensim.models.Phrases(data_stopped, min_count=2)
# Each document keeps its unigrams and gains every detected bigram (tokens
# containing '_'). New lists are built so `data_stopped` is left untouched
# rather than being mutated through an alias.
data_with_bigrams = [
    doc + [token for token in bigram[doc] if '_' in token]
    for doc in data_stopped
]
# Lemmatizing i.e. reducing words to their root form
# Only tokens whose part-of-speech tag is in `allowed_postags` are kept
# (by default: nouns, proper nouns, verbs, adjectives and adverbs)
def lemmatization(docs, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"]):
    """Lemmatize tokenised documents, keeping only the allowed parts of speech.

    Args:
        docs: iterable of token lists; each list is joined with spaces and
            run through the spaCy "en_core_web_sm" pipeline.
        allowed_postags: container of spaCy coarse POS tags to keep.

    Returns:
        A list with one list of lemmas per input document.
    """
    nlp = spacy.load("en_core_web_sm")

    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(sent) for sent in docs]

data_lemmatized = lemmatization(
    data_with_bigrams,
    allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"],
)

# Lemmatization can reduce a word to a stop word, so filter once more
data_cleaned = []
for lemmas in data_lemmatized:
    data_cleaned.append([lemma for lemma in lemmas if lemma not in stop_words])

## 5. Text transformation: creating a corpus

Topic modelling with the Gensim library involves documents, a corpus, vectors and the bag-of-words representation. These concepts are explained here: https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html

In [243]:
len(data_cleaned)

154

In [244]:
# Creating a dictionary that maps every unique token to an integer id
id2word = corpora.Dictionary(data_cleaned)
print('Number of unique tokens: %d' % len(id2word))
#id2word.filter_extremes(no_below = 20)
# Creating the bag-of-words corpus: one doc2bow vector per headline
corpus = list(map(id2word.doc2bow, data_cleaned))

Number of unique tokens: 585


In [245]:
data_cleaned[0]

['personal', 'dispute', 'hyderabad', 'communal', 'angle', 'shared_communal']

## GSDMM clustering 

In [279]:
mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.05, n_iters=30)

In [280]:
# Fix the random seed so the clustering is reproducible across runs.
# NOTE(review): assumes gsdmm samples through numpy's global RNG — confirm
# against the installed gsdmm version.
np.random.seed(42)
# Vocabulary = every distinct token left after preprocessing
vocab = set(x for doc in data_cleaned for x in doc)
# `y` holds one cluster label per document, in the order of data_cleaned
y = mgp.fit(data_cleaned,len(vocab))

In stage 0: transferred 92 clusters with 5 clusters populated
In stage 1: transferred 36 clusters with 5 clusters populated
In stage 2: transferred 36 clusters with 5 clusters populated
In stage 3: transferred 29 clusters with 5 clusters populated
In stage 4: transferred 29 clusters with 5 clusters populated
In stage 5: transferred 25 clusters with 5 clusters populated
In stage 6: transferred 23 clusters with 5 clusters populated
In stage 7: transferred 19 clusters with 5 clusters populated
In stage 8: transferred 18 clusters with 5 clusters populated
In stage 9: transferred 20 clusters with 5 clusters populated
In stage 10: transferred 14 clusters with 5 clusters populated
In stage 11: transferred 12 clusters with 5 clusters populated
In stage 12: transferred 17 clusters with 5 clusters populated
In stage 13: transferred 25 clusters with 5 clusters populated
In stage 14: transferred 18 clusters with 5 clusters populated
In stage 15: transferred 20 clusters with 5 clusters populated
In

In [39]:
# def cluster_importance(mgp):
#     n_z_w = mgp.cluster_word_distribution
#     beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
#     phi = [{} for i in range(K)]        
#     for z in range(K):
#         for w in n_z_w[z]:
#             phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
#     return phi
# phi = cluster_importance(mgp)

In [40]:
# doc_count = np.array(mgp.cluster_doc_count)
# print('Number of documents per topic :', doc_count)

# # Topics sorted by the number of documents they are allocated to
# top_index = doc_count.argsort()[-50:][::-1]
# print('Most important clusters (by number of docs inside):', top_index)

# def top_words(cluster_word_distribution, top_cluster, values):
#     for cluster in top_cluster:
#         sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
#         print("Cluster {} : {}".format(cluster,sort_dicts))

Number of documents per topic : [21 15 27 20 28]
Most important clusters (by number of docs inside): [4 2 0 3 1]


In [41]:
# def top_words(cluster_word_distribution, top_cluster, values):
#     freq_dict = {}
#     for cluster in top_cluster:
#         sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
#         freq_dict[str(cluster)] = sort_dicts
# #        print("Cluster {} : {}".format(cluster,sort_dicts))
#     return freq_dict

In [42]:
# top_words(mgp.cluster_word_distribution, top_index, 5)

{'4': [('khan', 7),
  ('painting', 4),
  ('krishna', 4),
  ('mumbai', 4),
  ('hyderabad', 4)],
 '2': [('show', 6), ('new', 5), ('modi', 4), ('pm_modi', 4), ('firework', 4)],
 '0': [('road', 4), ('delhi', 4), ('bjp', 4), ('link', 3), ('fall', 3)],
 '3': [('rss', 5), ('man', 4), ('flag', 4), ('indian', 4), ('garden', 4)],
 '1': [('flag', 9),
  ('national', 5),
  ('day', 4),
  ('national_flag', 3),
  ('light', 3)]}

In [43]:
# word_counts = top_words(mgp.cluster_word_distribution, top_index, 5)

dict

In [154]:
# with open ("wordcounts_wk35.json", "w") as fp:
#     json.dump(word_counts, fp)

## Cluster labelling

In [281]:
# data_cleaned follows the order of `eng + translations` (English headlines
# first, then the translated ones, i.e. the order of `eng + non_eng`), which
# is not the row order of clean_df. Attach tokens by headline, not position.
assert len(data_cleaned) == len(eng) + len(non_eng)
tokens_by_headline = dict(zip(eng + non_eng, data_cleaned))
clean_df = clean_df.assign(
    tokens=[tokens_by_headline[headline] for headline in clean_df["headline"]]
)

In [291]:
# Assign cluster label to each article headline
# `y` is ordered like data_cleaned, i.e. like `eng + non_eng` (English
# headlines first, then the non-English ones), not like the rows of clean_df.
# Look the label up by headline so each story gets its own cluster.
assert len(y) == len(eng) + len(non_eng)
cluster_by_headline = dict(zip(eng + non_eng, y))
clean_df = clean_df.assign(
    cluster=[cluster_by_headline[headline] for headline in clean_df["headline"]]
)

In [292]:
clean_df.head(5)[["headline", "cluster", "postURL"]]

Unnamed: 0,headline,cluster,postURL
0,Video of personal dispute in Hyderabad shared ...,3,https://www.altnews.in/video-of-personal-dispu...
1,"No, this is not Facebook’s Ankhi Das cutting t...",4,https://www.altnews.in/fact-check-image-of-amb...
2,Video from West Bengal passed off as communal ...,0,https://www.altnews.in/video-of-angry-mob-from...
3,Video of Delhi cop beating minor is not an old...,1,https://www.altnews.in/video-of-delhi-cop-beat...
4,Congress shares 2012 image of PM Modi with duc...,0,https://www.altnews.in/congress-shares-2012-im...


In [293]:
clean_df["cluster"].value_counts()

0    39
2    33
1    30
3    29
4    23
Name: cluster, dtype: int64

In [285]:
# Save model
# with open("wk35_mgp.model", "wb") as f:
#     pickle.dump(mgp, f)
#     f.close()

In [299]:
clean_df.columns

Index(['_id', 'author', 'date', 'date_accessed', 'date_updated', 'docs',
       'domain', 'headline', 'postID', 'postURL', 'tokens', 'cluster'],
      dtype='object')

In [303]:
# Create dictionary of clusters and headlines
# Keys are 1-based (cluster label + 1). This equals the old running counter
# whenever every cluster is populated, but stays tied to the label if one is
# empty. int() keeps the keys plain Python ints so json.dump accepts them.
# The loop variable is `group`, so the raw `df` frame is no longer overwritten.
headlines_dict = {}
for label, group in clean_df.groupby(by="cluster"):
    headlines_dict[int(label) + 1] = [
        {"url": url, "headline": headline}
        for url, headline in zip(group["postURL"], group["headline"])
    ]


In [304]:
headlines_dict[2][:2]

[{'url': 'https://www.altnews.in/video-of-delhi-cop-beating-minor-is-not-an-old-video-of-armed-chain-snatcher/',
  'headline': 'Video of Delhi cop beating minor is not an old video of armed chain snatcher'},
 {'url': 'https://www.altnews.in/bjp-shares-old-imf-data-to-make-misleading-claim-about-indian-gdp-growth-projection/',
  'headline': 'BJP shares old IMF data to make misleading claim about Indian GDP growth projection'}]

In [305]:
headlines_dict[1][0:2]

[{'url': 'https://www.altnews.in/video-of-angry-mob-from-west-bengal-shared-as-banglore-with-false-communal-angle/',
  'headline': 'Video from West Bengal passed off as communal violence in Bengaluru'},
 {'url': 'https://www.altnews.in/congress-shares-2012-image-of-pm-modi-with-ducks-as-pr-during-covid-19/',
  'headline': 'Congress shares 2012 image of PM Modi with ducks as PR during COVID-19'}]

## Visualise MGP model

In [None]:
# Sorted so the term order (and therefore the saved JSON) is the same on
# every run; iterating a set of strings gives no stable order.
vocabulary = sorted(vocab)
# Per-document cluster probabilities and document lengths
doc_topic_dists = [mgp.score(doc) for doc in data_cleaned]
doc_lengths = [len(doc) for doc in data_cleaned]
# Corpus-wide frequency of every term, listed in vocabulary order
term_counts_map = {}
for doc in data_cleaned:
    for term in doc:
        term_counts_map[term] = term_counts_map.get(term, 0) + 1
term_counts = [term_counts_map[term] for term in vocabulary]

# Cluster-term matrix: each row is a cluster's word counts normalised to sum to 1
matrix = []
for cluster in mgp.cluster_word_distribution:
    total = sum(cluster.values())
    if total == 0:
        # An empty cluster has no word counts; use a uniform row instead of
        # dividing by zero.
        row = [1 / len(vocabulary)] * len(vocabulary)
    else:
        row = [cluster.get(term, 0) / total for term in vocabulary]
    matrix.append(row)

vis_data = pyLDAvis.prepare(topic_term_dists=matrix, doc_topic_dists=doc_topic_dists, doc_lengths=doc_lengths, 
                            vocab=vocabulary, R=10, term_frequency=term_counts, sort_topics=False)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)

In [306]:
pyLDAvis.save_json(vis_data, "wk36.json")

## Add headline links and article count to json

In [307]:
# Reload the saved pyLDAvis JSON so extra fields can be added to it
with open("wk36.json", "r") as f:
    data = json.load(f)

In [308]:
data["per_cluster_headlines"] = headlines_dict

In [309]:
data["number_of_articles"] = len(clean_df)

In [310]:
# Save file that will be used in the themes dashboard
with open("wk36.json", "w") as f:
    json.dump(data, f)