In [None]:
import pandas as pd
import networkx as nx
from itertools import combinations
from collections import defaultdict
import operator
import matplotlib.pyplot as plt
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# Load spaCy's small English pipeline once; later cells run claims through it and read `doc.ents`.
nlp = spacy.load('en_core_web_sm')


In [None]:
# check out the data
data = pd.read_csv('./input/snopes.csv')

# TODO: remove unnecessary columns

# make sure the data are strings (Pandas converts into numeric sometimes)
# NOTE: astype(str) also turns missing values into the literal string 'nan'
data['claim'] = data['claim'].astype(str)

# Display the data without truncating long claims.
# `None` means "no limit"; the older `-1` spelling was deprecated in pandas 1.0
# and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)
data.head(20)

# TODO: process the data

In [None]:
# Remove duplicate claims (not really needed since they were dropped already,
# so this is expected to leave the provided file unchanged)
claims = data['claim'].drop_duplicates().tolist()

# TODO (optional): lowercase and strip whitespace before de-duplicating
# to see the impact on the result

# Use spaCy to process the data. The tagger and parser are disabled because
# the rest of the notebook only reads the named entities (doc.ents).
corpus = list(nlp.pipe(claims, disable=['tagger', 'parser']))

# the result should be in the 'corpus' variable
print('Number of claims: ', len(corpus))

# 3122 is the claim count expected by the exercise for the provided CSV
assert len(corpus) == 3122, "Wrong processing of the corpus"

In [None]:
# Print the first few claims, along with the entities identified
# TODO (use corpus variable)


In [None]:
# Calculate the number of times each entity appears in the corpus
# Format: dictionary {word(str): count(int)}
all_ents = defaultdict(int)

for doc in corpus:
    for ent in doc.ents:
        # Key on the entity text so identical mentions in different claims
        # are counted together (same string form as used for claim_ents below)
        all_ents[str(ent)] += 1

print('Number of distinct entities: ', len(all_ents))

In [None]:
# TODO: Update this list of sentences and see the results of the pipeline
# TODO: Find variants of sentences that are well/wrongly tagged
my_sents = [
    "This is a sentence without named Entities",
    "This is a sentence without named Entities... apparently",
]

# Run each sentence through the pipeline and show it followed by its entities
for doc in nlp.pipe(my_sents):
    print(doc, doc.ents, sep='\n')


In [None]:
# Identify the most popular entities (most popular = most frequent)
# sorted_ents is a list of (entity, count) tuples, highest count first
sorted_ents = sorted(all_ents.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_ents[:20])

In [None]:
# How many ents appear per claim?
ents_per_claim = [len(doc.ents) for doc in corpus]

# Average number of entities per claim
print('Average number of entities per claim: ', np.mean(ents_per_claim))

# Draw the distribution with one bar per integer count:
# bin edges sit halfway between consecutive integers (-0.5, 0.5, 1.5, ...)
bins = np.arange(max(ents_per_claim, default=0) + 2) - 0.5
plt.hist(ents_per_claim, bins=bins)

plt.title('Entities per claim')
plt.xlabel('Number of entities')
plt.ylabel('Number of claims')
plt.show()

# Getting the co-occurrences of identified entities

In [None]:
# This function counts cooccurrences of entities
# Inputs: list of list of entities (one list per claim, unpacked as *inputs)
# Outputs: a dict of dict: result[w1][w2] == {'count': n}
def coocurrence(*inputs):
    """Count how often two distinct entities appear in the same claim.

    Args:
        *inputs: one iterable of entity strings per claim.

    Returns:
        defaultdict(dict): nested mapping where ``result[w1][w2]`` is
        ``{'count': n}`` and ``n`` is the number of claims mentioning both
        ``w1`` and ``w2``. The mapping is symmetric (``result[w2][w1]`` holds
        the same count) and never pairs an entity with itself.
    """
    com = defaultdict(int)
    for named_entities in inputs:
        # Build co-occurrence matrix.
        # De-duplicate first so an entity repeated inside one claim is neither
        # paired with itself nor counted twice for that claim; sorting keeps
        # the insertion order (and therefore the output order) deterministic.
        for w1, w2 in combinations(sorted(set(named_entities)), 2):
            com[w1, w2] += 1
            com[w2, w1] += 1  # store both directions so lookups are symmetric

    # create the cooc dict
    result = defaultdict(dict)
    for (w1, w2), count in com.items():
        if w1 != w2:  # defensive guard; self-pairs cannot occur after de-duplication
            result[w1][w2] = {'count': count}
    return result

In [None]:
# Function to filter out co-occurrences seen fewer than min_count times
# Input: entities (dict of dict, result[w1][w2] == {'count': n}), min_count (int)
# Output: dict of dict with the same shape, keeping only pairs with count >= min_count
def filter_ents_by_min_count(entities, min_count):
    """Keep only the co-occurrence pairs seen at least ``min_count`` times.

    Args:
        entities: nested mapping ``{w1: {w2: {'count': n}}}``.
        min_count: minimum count (inclusive) a pair needs to be kept.

    Returns:
        dict: a new nested mapping with the same shape as ``entities``.
        First-level keys whose pairs are all below ``min_count`` are omitted.
        The input mapping is not modified.
    """
    cooc_entities_filtered = {}
    for k1, e in entities.items():
        # ">=" so that a pair seen exactly min_count times is kept, matching
        # the documented "filter out ... less than min_count" contract.
        ents_over_x_count = {k2: v for k2, v in e.items() if v['count'] >= min_count}
        if ents_over_x_count:  # ie. Not empty
            cooc_entities_filtered[k1] = ents_over_x_count
    return cooc_entities_filtered

In [None]:
# Steps:
# 1. make a list of the entities in each claim
# 2. calculate co-occurrences of entities
# 3. filter out rarely co-occurring entities with filter_ents_by_min_count
# 4. print the most frequent co-occurring entities
min_count = 2

# 1. One list of entity strings per claim
claim_ents = [[str(ent) for ent in doc.ents] for doc in corpus]

# Only claims mentioning at least two entities can contribute a pair
multi_ent_claims = [ents for ents in claim_ents if len(ents) >= 2]

# 2. Co-occurrence dict built from those claims
cooc_entities = coocurrence(*multi_ent_claims)

# 3. Drop the rare pairs
filtered_entities = filter_ents_by_min_count(cooc_entities, min_count)

# 4. Total co-occurrence count per entity, summed over all its partners.
#    Kept as a defaultdict(int) so unknown entities look up as 0.
cooc_sum = defaultdict(int)
for entity, partners in filtered_entities.items():
    cooc_sum[entity] += sum(link['count'] for link in partners.values())

sorted_cooc = sorted(cooc_sum.items(), key=lambda item: item[1], reverse=True)
print('Most frequent Cooccurring entities:')
sorted_cooc[:20]

# Graphing the co-occurrence relationships

In [None]:
# Getting the data - eg top 30, including only ents with min weight 2
top_n = 30
min_count = 2
figsize = (20, 15)
scale_nodes = lambda x: (x * 30) + 1
scale_edges = lambda x: 15 * x

# Uses the names defined by the co-occurrence cell: cooc_entities, sorted_cooc, cooc_sum
filtered_entities = filter_ents_by_min_count(cooc_entities, min_count)

# Keep the top_n entities by summed co-occurrence count. The membership check
# avoids a KeyError if min_count here is stricter than the one used to build
# sorted_cooc.
top_cooccur = [x[0] for x in sorted_cooc[:top_n]]
graph_edges = {k: filtered_entities[k] for k in top_cooccur if k in filtered_entities}

# Build the graph of these top co-occurrences
graph = nx.from_dict_of_dicts(graph_edges)
pos = nx.kamada_kawai_layout(graph)

# Normalise the edge counts to [0, 1], then scale them to line widths.
# The edge list is materialised once so widths stay aligned with drawn edges.
edges = [(u, v) for u, v in graph.edges() if u != v]
counts = [graph[u][v]['count'] for u, v in edges]
if counts:
    lowest, highest = min(counts), max(counts)
    span = highest - lowest
    # When every edge has the same count there is nothing to normalise:
    # draw them all at full width instead of dividing by zero.
    weights = [(c - lowest) / span if span else 1.0 for c in counts]
else:
    weights = []
weights = list(map(scale_edges, weights))

# Scale node sizes by summed co-occurrence count (at least 1 so every node is visible)
sum_weights = [max(cooc_sum.get(n, 0), 1) for n in graph.nodes]
sum_weights = list(map(scale_nodes, sum_weights))

plt.figure(figsize=figsize)

nx.draw_networkx_edges(graph, pos, edgelist=edges, alpha=0.2, width=weights)
nx.draw_networkx_nodes(graph, pos, alpha=0.2, node_size=sum_weights)
nx.draw_networkx_labels(graph, pos)

plt.xticks([])
plt.yticks([])

plt.title('Top coocurrences of named entities in Snopes claims')
plt.show()

# Extra: create a wordcloud picture

In [None]:
# Build a wordcloud picture from all the detected named entities
from wordcloud import WordCloud
from collections import Counter
from PIL import Image

# Flatten the per-claim entity lists into one list
word_list = [entity for entities in claim_ents for entity in entities]

# Number of occurrences of each entity
word_count_dict = Counter(word_list)

# Make the wordcloud, passing the PNG (as an array) as the mask
mask = np.array(Image.open("images/twitter_mask.png"))
cloud_options = dict(
    background_color="white",
    max_words=100,
    width=600,
    height=300,
    mask=mask,
    contour_width=5,
    contour_color="skyblue",
)
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(word_count_dict)

# Show the wordcloud
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Or save the file
# plt.savefig('wordcloud.png', bbox_inches='tight')
# plt.close()