# Create text graphs
* This notebook takes the newsela/britannica folder and the newsela/britannica.csv file and:
    1. Annotates the files with DBpedia Spotlight (output: csv/newsela_with_annotations.csv or csv/britannica_with_annotations.csv)
    2. Removes outlier annotations on a test set using the Sematch package.
    3. Groups the annotated nodes by sentence and by paragraph.
    4. Creates the text graphs as described in Stajner and Hulpus (2015).
    
* The final output of running this specific notebook is csv/britannica_with_graphs.csv
* The notebook can be run on different data sets: newsela_with_graphs.csv, britannica_semrel_with_graphs.csv and britannica_sematch_with_graphs.csv were all created with it.

#### Import requirements

In [1]:
from ast import literal_eval
import codecs
from collections import defaultdict
import itertools
import networkx as nx
import nltk
nltk.download('wordnet')
nltk.download('wordnet_ic')
from nltk import word_tokenize
import numpy as np
import pandas as pd
import requests
from sematch.semantic.similarity import EntitySimilarity #changed the semantic/sparql.py file 
                                                         #to be compatible with python 3
import spacy
import spotlight
import time
from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /home/edit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to /home/edit/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!


#### Define helper functions

In [2]:
def load_data(file_name):
    '''
    reads a CSV file into a DataFrame
    the first column of the file is used as the index,
    so no additional index column is created
    '''
    return pd.read_csv(file_name, index_col=0)

In [3]:
def annotate_text(text):
    '''
    function that sends a text to the DBpedia Spotlight web service and returns
    the annotation as described in https://pypi.org/project/pyspotlight/
    (a list with one dictionary per recognised entity)
    confidence value is set based on experiment and literature described in the paper
    the "URI" values of the dictionaries are used for further experiments

    text: the text to annotate; it is converted with str() before sending
    returns: the Spotlight annotation, or the placeholder [0] when the request
             or the annotation fails for any reason
    '''
    try:
        return spotlight.annotate(
            "https://api.dbpedia-spotlight.org/en/annotate",
            str(text),
            confidence=0.35,
            support=20,
        )

    # [0] marks a missing annotation when the service throws an error.
    # Only Exception is caught so that Ctrl-C (KeyboardInterrupt) still stops
    # a long annotation run instead of being turned into a placeholder.
    except Exception:
        return [0]

In [4]:
def annotate_df(series_of_file_paths):
    '''
    function that takes the series of file paths as seen in the newsela.csv
    and returns a list of annotations that can be added to the DF

    series_of_file_paths: iterable of paths; any "IM/" substring is removed
                          from a path before the file is opened
    returns: a list with one annotate_text() result per path, in input order
    '''
    list_of_annotations = []
    for path in tqdm(series_of_file_paths):
        # undecodable bytes are dropped instead of raising (errors="ignore")
        with codecs.open(path.replace("IM/", ""), "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        # annotate after the file is closed, so no handle is held during the web request
        list_of_annotations.append(annotate_text(text))

    return list_of_annotations

In [5]:
def create_graph(annotation, max_path_length=4):
    '''
    function that creates a graph from the URIs of an annotation created by annotate_text()
    part 1 of the graph creation as described by Stajner and Hulpus (2015)
    starting from the annotated nodes, the graph is expanded layer by layer
    using the DBpedia JSON files under "http://dbpedia.org/data/"

    annotation: iterable of DBpedia resource URIs (strings)
    max_path_length: set to 4 as in the paper; max_path_length-1 expansion layers are run
    returns: a networkx Graph whose edges carry the linking predicate in the "prop" attribute

    nodes whose JSON cannot be fetched or parsed are skipped silently
    only the first value of each predicate is considered
    '''
    same_as = "http://www.w3.org/2002/07/owl#sameAs"
    old_annotation = []

    graph = nx.Graph()
    for _ in range(max_path_length - 1):
        for node in annotation:
            # nodes of the previous layer were already expanded
            if node in old_annotation:
                continue
            #check dbpedia entries only, 'category' and 'list' give meaningless results
            if not ("resource" in node and node[7:9] == "db"
                    and "Category" not in node and "List" not in node):
                continue
            try:
                # everything after "/resource/" is the entity name; this avoids the
                # double slash ("/data//Name.json") the fixed slice node[27:] produced
                resource_name = node.split("/resource/", 1)[1]
                data = requests.get("http://dbpedia.org/data/" + resource_name + ".json").json()
                item = data[node]
                keys = [key for key in item if item[key][0]["type"] == "uri"]
                for key in keys:
                    value = item[key][0]["value"]
                    if "resource" in value and value[7:9] == "db":
                        if key != same_as:  #####exclude non-english entries
                            graph.add_edge(node, value, prop=key)
            except Exception:
                # best effort: a node that cannot be resolved is left out of the graph;
                # KeyboardInterrupt is deliberately not caught
                continue

        old_annotation = annotation
        annotation = set(graph.nodes())

    return graph

In [6]:
def select_related_nodes(graph, annotation, max_path_length=4):
    '''
    part 2 of the graph creation as described by Stajner and Hulpus (2015)
    only keep nodes that lie on a simple path of at most max_path_length edges
    between two of the original nodes present in the text

    graph: networkx graph as returned by create_graph() (edges carry a "prop" attribute)
    annotation: the URIs of an annotation as returned by the annotate_text() function
    max_path_length: cutoff passed to all_simple_paths; 4 as in the paper
    returns: (limited_text_graph, isolates) - the reduced networkx Graph and the list
             of annotated nodes that ended up without any edge
    '''
    limited_text_graph = nx.Graph()
    nx_graph = graph.to_undirected()

    # every annotated node is part of the result, connected or not
    limited_text_graph.add_nodes_from(annotation)

    # dict.fromkeys removes repeated pairs like set() did, but keeps first-seen
    # order so that the edge order of the result is the same on every run
    annotation_pairs = dict.fromkeys(itertools.combinations(annotation, 2))
    for source, target in annotation_pairs:
        if source not in nx_graph or target not in nx_graph:
            continue
        for path in nx.all_simple_paths(nx_graph, source, target, cutoff=max_path_length):
            for first, second in zip(path, path[1:]):
                # read the predicate from the undirected copy, so the lookup
                # works in both directions of the edge
                limited_text_graph.add_edge(
                    first,
                    second,
                    prop=nx_graph.get_edge_data(first, second)["prop"],
                )

    #these were not part of the graph so far
    isolates = list(nx.isolates(limited_text_graph))

    return limited_text_graph, isolates

In [7]:
# spaCy pipeline, loaded once at module level; return_nodes_per_sent() uses it to split texts into sentences
nlp = spacy.load("en_core_web_sm")

def return_nodes_per_sent(path, anno):
    '''
    based on the columns 'path' and 'annotation' from the DF
    returns a list of sets, where each set contains the node URIs found in one sentence
    used to average the feature values per sentence

    path: path of the text file; any "IM/" substring is removed before opening
    anno: string representation of the annotation list (parsed with literal_eval)

    entries of the annotation that are not dictionaries (e.g. the [0] placeholder
    of a failed annotation) are ignored, so such a text yields only empty sets
    NOTE(review): a surface form is compared with single tokens only, so
    surface forms spanning several tokens never match
    '''
    with codecs.open(path.replace("IM/",""), "r", encoding='utf-8', errors='ignore') as f:
        text = f.read()

    # parse the annotation once and index the URIs by surface form,
    # instead of re-parsing the string for every token
    uris_by_surface_form = {}
    for dic in literal_eval(anno):
        if isinstance(dic, dict):
            uris_by_surface_form.setdefault(dic["surfaceForm"], set()).add(dic["URI"])

    all_nodes = []
    for sent in nlp(text).sents:
        sentence_nodes = set()
        for word in sent:
            sentence_nodes.update(uris_by_surface_form.get(str(word), ()))
        all_nodes.append(sentence_nodes)

    return all_nodes

In [8]:
def return_nodes_per_para(path, anno):
    '''
    based on the columns 'path' and 'annotation' from the DF
    returns a list of sets, where each set contains the node URIs found in one paragraph
    used to average the feature values per paragraph

    path: path of the text file; any "IM/" substring is removed before opening
    anno: string representation of the annotation list (parsed with literal_eval)

    a paragraph is one line of the file (split on newline characters), so empty
    lines contribute empty sets
    entries of the annotation that are not dictionaries (e.g. the [0] placeholder
    of a failed annotation) are ignored
    NOTE(review): a surface form is compared with single tokens only, so
    surface forms spanning several tokens never match
    '''
    with codecs.open(path.replace("IM/",""), "r", encoding='utf-8', errors='ignore') as f:
        text = f.read()

    # parse the annotation once and index the URIs by surface form,
    # instead of re-parsing the string for every token
    uris_by_surface_form = {}
    for dic in literal_eval(anno):
        if isinstance(dic, dict):
            uris_by_surface_form.setdefault(dic["surfaceForm"], set()).add(dic["URI"])

    all_nodes = []
    for para in text.split('\n'):
        para_nodes = set()
        for word in word_tokenize(para):
            para_nodes.update(uris_by_surface_form.get(str(word), ()))
        all_nodes.append(para_nodes)

    return all_nodes

#### Load and annotate data

In [9]:
def load_and_annotate_df(df_path, folder='britannica/'):
    '''
    loads the CSV at df_path and adds an "annotations" column with the
    Spotlight annotation of every text file listed in its "path" column

    df_path: path of the CSV file (read with load_data)
    folder: prefix prepended to every value of the "path" column
    returns: the DataFrame with the new "annotations" column
    '''
    df = load_data(df_path)
    # the column is assigned directly; filling it with the str type first had no effect
    df["annotations"] = annotate_df(folder + df["path"])
    return df

In [14]:
# Annotate every text listed in csv/britannica.csv (one Spotlight request per file, so this cell is slow)
df = load_and_annotate_df('csv/britannica.csv')
df.head()

100%|██████████| 117/117 [03:28<00:00,  1.78s/it]


Unnamed: 0,path,name,score,level,annotations
0,african_penguin_kids,african-penguin,kids,2,[{'URI': 'http://dbpedia.org/resource/African_...
1,african_penguin_scholars,african-penguin,scholars,0,[{'URI': 'http://dbpedia.org/resource/African_...
2,african_penguin_students,african-penguin,students,1,[{'URI': 'http://dbpedia.org/resource/African_...
3,bald_eagle_kids,bald-eagle,kids,2,[{'URI': 'http://dbpedia.org/resource/Bald_eag...
4,bald_eagle_scholars,bald-eagle,scholars,0,[{'URI': 'http://dbpedia.org/resource/Bald_eag...


In [19]:
# print the index of every row whose annotation is the [0] placeholder
# that annotate_text() returns when the annotation failed
for row_label in df.index:
    annotation = df['annotations'][row_label]
    if annotation == [0]:
        print(row_label)

In [14]:
#if needed, this function can annotate the missing texts, based on the index from previous cell
def annotate_missing_text(ind, folder='newsela/'):
    '''
    re-runs the annotation for one row of the global DF and stores the result in place

    ind: index label of the row in the global df
    folder: prefix prepended to the row's "path" value
    returns: None (df is modified in place)
    '''
    file_name = df.at[ind, "path"]
    # same decoding as the other readers in this notebook: UTF-8, undecodable bytes dropped
    with open(folder + file_name, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    # .at writes a single cell directly; the chained form df['annotations'][ind] = ...
    # may write to a temporary copy and leave df unchanged
    df.at[ind, 'annotations'] = annotate_text(text)

In [10]:
#annotate_missing_text(977)

In [62]:
#save DF with the annotations; the annotation lists are written as text,
#which is why later cells parse them with literal_eval
df.to_csv('csv/britannica_with_annotations.csv')

<hr style="background-color: rgb(0,0,0); height: 15.0px;"/>

#### Outlier removal with Sematch

In [10]:
# reload the annotated DF from disk, so this section can be run without re-annotating
df = load_data('csv/britannica_with_annotations.csv')

In [11]:
# Sematch similarity object; remove_outliers() calls sim.relatedness() on pairs of URIs
sim = EntitySimilarity()

In [96]:
def remove_outliers(annotation, threshold=0.2): ##########0.2 based on manual testing
    '''
    scores every annotated URI by its mean Sematch relatedness to all other
    annotated URIs and keeps the URIs whose mean is above the threshold

    annotation: string representation of the annotation list (parsed with literal_eval)
    threshold: minimal mean relatedness a URI needs to be kept
    returns: (relatedness, cleaned_annotations) - a defaultdict mapping each URI to its
             mean relatedness, and the list of URIs with a mean above the threshold

    a URI that is annotated several times takes part in the pairs several times,
    so repeated mentions weigh more in the means
    '''
    uris = [dic["URI"] for dic in literal_eval(annotation)]

    relatedness = defaultdict(list)
    # sim.relatedness() is slow, so each distinct pair is queried only once
    # (assumes the score of a given pair does not change between calls)
    pair_scores = {}

    for pair in tqdm(list(itertools.combinations(uris, 2))):
        if pair not in pair_scores:
            pair_scores[pair] = sim.relatedness(pair[0], pair[1])
        rel = pair_scores[pair]

        relatedness[pair[0]].append(rel)
        relatedness[pair[1]].append(rel)

    cleaned_annotations = []
    # iterate over a snapshot of the keys, the values are replaced inside the loop
    for key in list(relatedness):
        relatedness[key] = np.mean(relatedness[key])
        if relatedness[key] > threshold:
            cleaned_annotations.append(key)

    return relatedness, cleaned_annotations

* Testing for threshold value - two examples from Newsela data

In [52]:
# threshold experiment on the document at index 0
# NOTE(review): the saved output below lists Newsela entities, so df presumably held the Newsela data when this ran
test = remove_outliers(df["annotations"][0])

100%|██████████| 1485/1485 [25:19<00:00,  1.04it/s]


In [57]:
test[0]

defaultdict(list,
            {'http://dbpedia.org/resource/Police_dog': 0.5090655987614584,
             'http://dbpedia.org/resource/Texas': 0.6880030870285047,
             'http://dbpedia.org/resource/Brittany': 0.38944078906160096,
             'http://dbpedia.org/resource/Golden_Retriever': 0.40973838803481033,
             'http://dbpedia.org/resource/Dog': 0.5503373487867458,
             'http://dbpedia.org/resource/New_York_City': 0.8153091972845675,
             'http://dbpedia.org/resource/Sleep': 0.2990742300267429,
             'http://dbpedia.org/resource/Firefighter': 0.507604486291097,
             'http://dbpedia.org/resource/Flag_of_the_United_States': 0.5323630461139134,
             'http://dbpedia.org/resource/Volunteer_fire_department': 0.2323340662724604,
             'http://dbpedia.org/resource/Family': 0.7492827212252995,
             'http://dbpedia.org/resource/Fixed-wing_aircraft': 0.5302183387945506,
             'http://dbpedia.org/resource/Hotel': 0.723

In [55]:
df["annotations"][0]

[{'URI': 'http://dbpedia.org/resource/Police_dog',
  'support': 510,
  'types': '',
  'surfaceForm': 'K9',
  'offset': 0,
  'similarityScore': 0.9999995822221044,
  'percentageOfSecondRank': 4.177780900825528e-07},
 {'URI': 'http://dbpedia.org/resource/Texas',
  'support': 136284,
  'types': 'Wikidata:Q3455524,Schema:Place,Schema:AdministrativeArea,DBpedia:Region,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:AdministrativeRegion',
  'surfaceForm': 'Texas',
  'offset': 85,
  'similarityScore': 0.999981639340906,
  'percentageOfSecondRank': 6.308142280775235e-06},
 {'URI': 'http://dbpedia.org/resource/Brittany',
  'support': 7640,
  'types': 'Wikidata:Q486972,Schema:Place,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location',
  'surfaceForm': 'Bretagne',
  'offset': 93,
  'similarityScore': 0.9613999954526835,
  'percentageOfSecondRank': 0.04014978584314553},
 {'URI': 'http://dbpedia.org/resource/Golden_Retriever',
  'support': 407,
  'types': '',
  's

In [58]:
# second threshold experiment, on the document at index 6
test2 = remove_outliers(df["annotations"][6])

100%|██████████| 2775/2775 [1:04:40<00:00,  1.32s/it]


In [59]:
test2[0]

defaultdict(list,
            {'http://dbpedia.org/resource/Food': 0.5074505976921527,
             'http://dbpedia.org/resource/Afghanistan': 0.6520561037556565,
             'http://dbpedia.org/resource/Financial_capital': 0.3119354629114519,
             'http://dbpedia.org/resource/Kabul': 0.4792197772871563,
             'http://dbpedia.org/resource/Hot_dog': 0.435428258362215,
             'http://dbpedia.org/resource/North_America': 0.744968895363626,
             'http://dbpedia.org/resource/Navid': 0.15236097892942538,
             'http://dbpedia.org/resource/Toronto': 0.7730697869700689,
             'http://dbpedia.org/resource/Canada': 0.870624684127026,
             'http://dbpedia.org/resource/Abdullah_of_Saudi_Arabia': 0.49199216822151287,
             'http://dbpedia.org/resource/Hamburger': 0.455370212231134,
             'http://dbpedia.org/resource/Chicken_as_food': 0.3614870481368901,
             'http://dbpedia.org/resource/Mustard_seed': 0.4079663070170393,
    

In [61]:
df["annotations"][6]

[{'URI': 'http://dbpedia.org/resource/Food',
  'support': 9216,
  'types': '',
  'surfaceForm': 'food',
  'offset': 19,
  'similarityScore': 0.9883228701429909,
  'percentageOfSecondRank': 0.011812835279352781},
 {'URI': 'http://dbpedia.org/resource/Afghanistan',
  'support': 48460,
  'types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country',
  'surfaceForm': 'Afghanistan',
  'offset': 62,
  'similarityScore': 0.9999989505746321,
  'percentageOfSecondRank': 1.0485084921499918e-06},
 {'URI': 'http://dbpedia.org/resource/Financial_capital',
  'support': 1291,
  'types': '',
  'surfaceForm': 'capital',
  'offset': 74,
  'similarityScore': 0.6093330899527369,
  'percentageOfSecondRank': 0.6410369074282207},
 {'URI': 'http://dbpedia.org/resource/Kabul',
  'support': 8993,
  'types': 'Wikidata:Q515,Wikidata:Q486972,Schema:Place,Schema:City,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:City',


* Testing for graphs - examples from Britannica

In [None]:
# run the outlier removal on the first 45 documents only
# .copy() makes df_test an independent frame, so adding a column does not write to a slice of df
df_test = df[:45].copy()
clean_results = []
for ind, annotation in df_test['annotations'].items():
    try:
        # NOTE(review): remove_outliers returns a (relatedness, cleaned_annotations) tuple
        # and the whole tuple is stored, as before - confirm that this is intended
        clean_results.append(remove_outliers(annotation))
    except Exception as error:
        # report the failing row and leave its cell empty; Ctrl-C is not swallowed
        print(ind, repr(error))
        clean_results.append(None)
df_test['clean_anno'] = clean_results

In [50]:
# save the test set together with its 'clean_anno' column
df_test.to_csv('csv/britannica_sematch.csv')

<hr style="background-color: rgb(0,0,0); height: 20.0px;"/>

#### Add nodes per sentences and paragraph to DF

In [26]:
#if used for outlier removal, pass anno_col='clean_anno' instead of the default 'annotations'
def add_nodes_per_sent(df, name, folder='britannica/', anno_col='annotations'):
    '''
    adds a "sentences" column to the DF that holds, for every text,
    the result of return_nodes_per_sent() (one set of node URIs per sentence)

    df: DataFrame with a "path" column and the annotation column
    name: not used; kept so that existing calls keep working
    folder: prefix prepended to every value of the "path" column
    anno_col: name of the column that holds the annotations
    returns: the same DataFrame (modified in place)
    '''
    df["sentences"] = None
    for ind in df.index:
        # .at writes one cell directly; the chained form df["sentences"][ind] = ...
        # may write to a temporary copy and leave df unchanged
        df.at[ind, "sentences"] = return_nodes_per_sent(folder + df.at[ind, 'path'], df.at[ind, anno_col])
    return df

In [29]:
# folder is '' here: in the frame displayed below, the "path" values already start with "britannica/"
df = add_nodes_per_sent(df, 'britannica','')
df.head()

Unnamed: 0,path,name,score,level,annotations,sentences,paragraphs,graph,sel_graph,isolates,sel_graph_data,cleaned_anno,cleaned_anno_
0,britannica/albatross_kids,albatross,kids,2,[{'URI': 'http://dbpedia.org/resource/Flying_a...,"[{}, {}, {}, {}, {}, {}, {http://dbpedia.org/r...",[{'http://dbpedia.org/resource/Flying_and_glid...,,"[('http://dbpedia.org/resource/Species', 'http...",['http://dbpedia.org/resource/Flying_and_glidi...,"[('http://dbpedia.org/resource/Species', 'http...","['http://dbpedia.org/resource/Species', 'http:...",[{'URI': 'http://dbpedia.org/resource/Species'...
1,britannica/albatross_scholars,albatross,scholars,0,[{'URI': 'http://dbpedia.org/resource/Albatros...,"[{http://dbpedia.org/resource/Family, http://d...",[{'http://dbpedia.org/resource/Dutch_language'...,,"[('http://dbpedia.org/resource/Albatross', 'ht...",['http://dbpedia.org/resource/Flying_and_glidi...,"[('http://dbpedia.org/resource/Albatross', 'ht...","['http://dbpedia.org/resource/Albatross', 'htt...",[{'URI': 'http://dbpedia.org/resource/Albatros...
2,britannica/albatross_students,albatross,students,1,[{'URI': 'http://dbpedia.org/resource/Gliding'...,"[{http://dbpedia.org/resource/Albatross}, {htt...","[{'http://dbpedia.org/resource/Poetry', 'http:...",,"[('http://dbpedia.org/resource/Albatross', 'ht...","['http://dbpedia.org/resource/Gliding', 'http:...","[('http://dbpedia.org/resource/Albatross', 'ht...","['http://dbpedia.org/resource/Albatross', 'htt...",[{'URI': 'http://dbpedia.org/resource/Albatros...
3,britannica/canary_kids,canary,kids,2,[{'URI': 'http://dbpedia.org/resource/Domestic...,"[{http://dbpedia.org/resource/Domestic_canary,...",[{'http://dbpedia.org/resource/Atlantic_canary...,,[('http://dbpedia.org/resource/Domestic_canary...,"['http://dbpedia.org/resource/Coast', 'http://...",[('http://dbpedia.org/resource/Domestic_canary...,['http://dbpedia.org/resource/Domestic_canary'...,[{'URI': 'http://dbpedia.org/resource/Domestic...
4,britannica/canary_scholars,canary,scholars,0,[{'URI': 'http://dbpedia.org/resource/Serinus'...,"[{http://dbpedia.org/resource/Serinus, http://...","[{'http://dbpedia.org/resource/Finch', 'http:/...",,"[('http://dbpedia.org/resource/Serinus', 'http...","['http://dbpedia.org/resource/Species', 'http:...","[('http://dbpedia.org/resource/Serinus', 'http...","['http://dbpedia.org/resource/Serinus', 'http:...",[{'URI': 'http://dbpedia.org/resource/Serinus'...


In [32]:
def add_nodes_per_para(df, name, folder='britannica/', anno_col='cleaned_anno_'):
    '''
    adds a "paragraphs" column to the DF that holds, for every text,
    the result of return_nodes_per_para() (one set of node URIs per paragraph)

    df: DataFrame with a "path" column and the annotation column
    name: not used; kept so that existing calls keep working
    folder: prefix prepended to every value of the "path" column
    anno_col: name of the column that holds the annotations
              (the default 'cleaned_anno_' is the column this function always used)
    returns: the same DataFrame (modified in place)
    '''
    df["paragraphs"] = None
    for ind in df.index:
        # .at writes one cell directly; the chained form df["paragraphs"][ind] = ...
        # may write to a temporary copy and leave df unchanged
        df.at[ind, "paragraphs"] = return_nodes_per_para(folder + df.at[ind, 'path'], df.at[ind, anno_col])
    #df.to_csv('csv/' + name + '_with_paragraphs.csv')
    return df

In [33]:
# folder is '' here: in the frame displayed below, the "path" values already start with "britannica/"
df = add_nodes_per_para(df, 'britannica', '')
df.head()

Unnamed: 0,path,name,score,level,annotations,sentences,paragraphs,graph,sel_graph,isolates,sel_graph_data,cleaned_anno,cleaned_anno_
0,britannica/albatross_kids,albatross,kids,2,[{'URI': 'http://dbpedia.org/resource/Flying_a...,"[{}, {}, {}, {}, {}, {}, {http://dbpedia.org/r...","[{}, {}, {http://dbpedia.org/resource/Family, ...",,"[('http://dbpedia.org/resource/Species', 'http...",['http://dbpedia.org/resource/Flying_and_glidi...,"[('http://dbpedia.org/resource/Species', 'http...","['http://dbpedia.org/resource/Species', 'http:...",[{'URI': 'http://dbpedia.org/resource/Species'...
1,britannica/albatross_scholars,albatross,scholars,0,[{'URI': 'http://dbpedia.org/resource/Albatros...,"[{http://dbpedia.org/resource/Family, http://d...","[{http://dbpedia.org/resource/Mollymawk, http:...",,"[('http://dbpedia.org/resource/Albatross', 'ht...",['http://dbpedia.org/resource/Flying_and_glidi...,"[('http://dbpedia.org/resource/Albatross', 'ht...","['http://dbpedia.org/resource/Albatross', 'htt...",[{'URI': 'http://dbpedia.org/resource/Albatros...
2,britannica/albatross_students,albatross,students,1,[{'URI': 'http://dbpedia.org/resource/Gliding'...,"[{http://dbpedia.org/resource/Albatross}, {htt...","[{http://dbpedia.org/resource/Syllable, http:/...",,"[('http://dbpedia.org/resource/Albatross', 'ht...","['http://dbpedia.org/resource/Gliding', 'http:...","[('http://dbpedia.org/resource/Albatross', 'ht...","['http://dbpedia.org/resource/Albatross', 'htt...",[{'URI': 'http://dbpedia.org/resource/Albatros...
3,britannica/canary_kids,canary,kids,2,[{'URI': 'http://dbpedia.org/resource/Domestic...,"[{http://dbpedia.org/resource/Domestic_canary,...","[{http://dbpedia.org/resource/Bird, http://dbp...",,[('http://dbpedia.org/resource/Domestic_canary...,"['http://dbpedia.org/resource/Coast', 'http://...",[('http://dbpedia.org/resource/Domestic_canary...,['http://dbpedia.org/resource/Domestic_canary'...,[{'URI': 'http://dbpedia.org/resource/Domestic...
4,britannica/canary_scholars,canary,scholars,0,[{'URI': 'http://dbpedia.org/resource/Serinus'...,"[{http://dbpedia.org/resource/Serinus, http://...","[{http://dbpedia.org/resource/Serinus, http://...",,"[('http://dbpedia.org/resource/Serinus', 'http...","['http://dbpedia.org/resource/Species', 'http:...","[('http://dbpedia.org/resource/Serinus', 'http...","['http://dbpedia.org/resource/Serinus', 'http:...",[{'URI': 'http://dbpedia.org/resource/Serinus'...


In [None]:
#save df with sentences and paragraphs; this file is reloaded at the start of the text graph section
df.to_csv('csv/britannica_with_units.csv')

<hr style="background-color: rgb(0,0,0); height: 20.0px;"/>

#### Add text graphs to DF

In [67]:
# reload the DF with sentences and paragraphs; list-valued columns come back as strings
df = load_data('csv/britannica_with_units.csv')

In [95]:
def add_text_graphs_to_df(df, name, start=None, stop=None):
    '''
    builds the text graph for the rows of the DF and stores the results in four columns:
      "graph"          - the graph returned by create_graph()
      "sel_graph"      - the edges of the graph returned by select_related_nodes()
      "isolates"       - the annotated nodes without any edge in the selected graph
      "sel_graph_data" - the same edges together with their data (the properties)

    df: DataFrame with an "annotations" column holding the annotation lists as strings
    name: used for the output file 'csv/<name>_with_graphs.csv'
    start, stop: optional positional row range to process, e.g. to work in batches;
                 by default every row is processed (a fixed 220:240 range was
                 hard-coded before, which skipped every row of shorter frames)
    returns: the same DataFrame (modified in place); it is also written to the output file

    the four columns are reset at the start of every call
    '''
    #initiate columns
    for column in ("graph", "sel_graph", "isolates", "sel_graph_data"):
        df[column] = None

    output_path = 'csv/' + name + '_with_graphs.csv'

    for ind in df.index[start:stop]:
        uris = [dic["URI"] for dic in literal_eval(df.at[ind, "annotations"])]
        graph = create_graph(uris)
        # the path search is expensive, so it is run once and both results are reused
        selected_graph, isolates = select_related_nodes(graph, uris)
        selected_edges = selected_graph.edges()

        # .at writes one cell directly; chained assignment may write to a temporary copy
        df.at[ind, "graph"] = graph
        df.at[ind, "sel_graph"] = selected_edges
        df.at[ind, "isolates"] = isolates
        #save properties as well
        df.at[ind, "sel_graph_data"] = selected_edges.data()

        # checkpoint on every even index label, so a crash loses little work
        if ind % 2 == 0:
            df.to_csv(output_path)
            print("DF processed up to index ", ind)

    df.to_csv(output_path)
    return df

In [None]:
# build the text graphs and write the result to csv/britannica_with_graphs.csv
add_text_graphs_to_df(df, 'britannica')