# Add features to the DF
* This notebook contains code that calculates the graph-based feature values.
* It demonstrates values on Britannica.
* It demonstrates statistically significant differences on both Britannica and Newsela.
* The files britannica/newsela/britannica_semrel/britannica_sematch_with_features.csv in the csv folder were created with this notebook.
* The last section contains outlier removal based on the exclusivity-based semantic relatedness.

#### Import requirements

In [1]:
from ast import literal_eval
from collections import defaultdict
import itertools
import networkx
from networkx.algorithms import average_clustering
from networkx.algorithms.cluster import clustering
from networkx.algorithms import shortest_paths, pagerank
from networkx.algorithms.components import connected_components
from networkx.classes.function import density
from networkx import convert
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

#### Helper functions

In [2]:
def load_data(file_name):
    df = pd.read_csv(file_name, index_col=0)
    return df

#### Functions to calculate feature values

* Single-node

In [3]:
def node_degree(graph, isolates):
    values = []
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))
    for g in networkx.connected_component_subgraphs(graph):
        values.append(np.mean([degree for (node, degree) in list(g.degree())]))
    return np.mean(values)

In [4]:
def clustering_coef(graph, isolates):
    graph = convert.from_edgelist((literal_eval(graph)))    
    graph.add_nodes_from(literal_eval(isolates))
    return np.mean(list(clustering(graph).values()))

In [5]:
def av_pagerank(graph, isolates):
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))
    return np.mean(list(pagerank(graph).values()))    

* Pairwise

In [6]:
def pairwise_distance_per_unit(graph, sentences, isolates): #second argument can be paragraphs or sentences
    
    values = []
    sentences=(literal_eval(sentences.replace("{","[").replace("}","]").
                            replace(", set()", "").replace("[set(), ", "[").replace(", set()]", "]")))
        
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))
    
    for sentence in sentences:
        subgraph = networkx.Graph()
        subgraph.add_nodes_from([node for node in set(sentence)])
        subgraph.add_edges_from([(node1,node2) for (node1,node2) in graph.edges() if node1 in subgraph.nodes() 
                                or node2 in subgraph.nodes()])
        
        for g in networkx.connected_component_subgraphs(subgraph):            
            values.append((networkx.average_shortest_path_length(g)))
            
    return np.mean(values)

In [7]:
def pairwise_semrel(graph, sentences, isolates, path_length=5, decay_factor=0.25): 
    #second argument can be paragraphs or sentences
    #based on Hulpus et al. (2015)
    
    values = []
    sentences=(literal_eval(sentences.replace("{","[").replace("}","]").
                           replace(", set()", "").replace("[set(), ", "[").replace(", set()]", "]")))
        
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))

    graph_prop = (graph.edges.data())

    
    
    for sentence in (sentences):
        
        graph_relatedness = defaultdict(float)
        exclusivities = defaultdict(float)
        
        subgraph = networkx.Graph()
        subgraph.add_nodes_from([node for node in set(sentence)])
        subgraph.add_edges_from([(node1,node2) for (node1,node2) in graph.edges() if node1 in subgraph.nodes() 
                                or node2 in subgraph.nodes()])

        for (node1, node2) in subgraph.edges():   #two related nodes can have more properties connecting them
                # for each property connecting two nodes
                prop = [tup[2]["prop"] for tup in graph_prop if tup[0]==node1 and tup[1]==node2]
                #their exclusivity depends on how many other edges bear the same property
                if (len([tup for tup in graph_prop if (tup[0]==node1 or tup[1]==node1) and tup[2]["prop"] in prop]) +  
                                   len([tup for tup in graph_prop if (tup[1]==node2 or tup[0]==node2) and tup[2]["prop"] in prop])) != 0: 
              
                    exclusivity = 1 / (len([tup for tup in graph_prop if (tup[0]==node1 or tup[1]==node1) and tup[2]["prop"] in prop]) +  
                                       len([tup for tup in graph_prop if (tup[1]==node2 or tup[0]==node2) and tup[2]["prop"] in prop]))  

                    exclusivities[((node1,node2))] += exclusivity

        for node1, node2 in (list(itertools.combinations(list(subgraph.nodes()), 2))): 
                #the relatedness is calculated for each pair of nodes
                relatedness = 0                                                  
                sum_of_exc = 0

                paths = list(networkx.simple_paths.all_simple_paths(subgraph, node1, node2, path_length))
                for path in paths:  #if there is a path of length at most path_lenght between them                
                    for related_nodes in list(zip(path, path[1:])):   #we check all connecting edges                    
                        sum_of_exc += exclusivities[(related_nodes[0], related_nodes[1])] 

                if sum_of_exc != 0:               
                    weight = 1 /  sum_of_exc #weight is for a path                             
                    length = (len(path))

                    relatedness += (weight * decay_factor ** length) #relatedness is for node pair

                graph_relatedness[(node1,node2)]=(relatedness)
        
        if (list(graph_relatedness.values())) != []:
            values.append(np.mean(list(graph_relatedness.values())))
        
    return np.mean(values)

* Global

In [8]:
def graph_conncomp_per_unit(graph, sentences, isolates): #second argument can be paragraphs or sentences
    
    values = []
    sentences=(literal_eval(sentences.replace("{","[").replace("}","]").
                            replace(", set()", "").replace("[set(), ", "[").replace(", set()]", "]")))
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))

    
    for sentence in (sentences):
        subgraph = networkx.Graph()
        subgraph.add_nodes_from([node for node in set(sentence)])
        subgraph.add_edges_from([(node1,node2) for (node1,node2) in graph.edges() if node1 in subgraph.nodes() 
                                or node2 in subgraph.nodes()])
        
        values.append(len(list(connected_components(subgraph))))

    return np.mean(values)

In [9]:
def clustering_coef_per_unit(graph, sentences, isolates): ##second argument can be paragraphs or sentences
    
    values = []
    sentences=(literal_eval(sentences.replace("{","[").replace("}","]").
                            replace(", set()", "").replace("[set(), ", "[").replace(", set()]", "]")))
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))
    
    for sentence in sentences:
        subgraph = networkx.Graph()
        subgraph.add_nodes_from([node for node in set(sentence)])
        subgraph.add_edges_from([(node1,node2) for (node1,node2) in graph.edges() if node1 in subgraph.nodes() 
                                or node2 in subgraph.nodes()])

        for g in networkx.connected_component_subgraphs(subgraph):
            values.append(len(list(clustering(g).values())))

    return np.mean(values)    

In [10]:
def graph_density_per_unit(graph, sentences, isolates): #second argument can be paragraphs or sentences
    
    values = []
    sentences=(literal_eval(sentences.replace("{","[").replace("}","]").
                            replace(", set()", "").replace("[set(), ", "[").replace(", set()]", "]")))
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))
    
    for sentence in sentences:
        subgraph = networkx.Graph()

        subgraph.add_nodes_from([node for node in set(sentence)])
        subgraph.add_edges_from([(node1,node2) for (node1,node2) in graph.edges() if node1 in subgraph.nodes() 
                                or node2 in subgraph.nodes()])
        
        values.append(density(subgraph))

    return np.mean(values)    

In [11]:
def graph_density(graph, isolates):
    graph = convert.from_edgelist((literal_eval(graph)))
    graph.add_nodes_from(literal_eval(isolates))
    return density(graph)

<hr style="rgb(0,0,0);height: 15.0px;"/>

#### Add feature values to DF:  Britannica demo

In [31]:
df_graph = load_data('csv/britannica_with_graphs.csv')

In [13]:
def add_features_to_df(df, name):
    feat_list = ['node_degree', 'clustering_coef', 'av_pagerank', 'pairwise_distance_per_sent', 
             'graph_conncomp_per_sent', 'clustering_coef_per_sent', 'graph_density_per_sent', 
             'graph_density', 'graph_conncomp_per_para', 'clustering_coef_per_para', 'graph_density_per_para', 
             'pairwise_distance_per_para', 'pairwise_semrel_per_sent', 'pairwise_semrel_per_para']

    for feat in feat_list:
        df[feat] = float 
    
    for ind, row in tqdm_notebook(df.iterrows()):
    
        df['node_degree'][ind] = node_degree(row['sel_graph'], row['isolates'])
        df['clustering_coef'][ind] = clustering_coef(row['sel_graph'], row['isolates'])
        df['av_pagerank'][ind] = av_pagerank(row['sel_graph'], row['isolates'])
        
        df['pairwise_distance_per_sent'][ind] = pairwise_distance_per_unit(row['sel_graph'], row["sentences"], row['isolates'])
        df['pairwise_distance_per_para'][ind] = pairwise_distance_per_unit(row['sel_graph'], row["paragraphs"], row['isolates'])
        
        df['graph_conncomp_per_sent'][ind] = graph_conncomp_per_unit(row['sel_graph'], row["sentences"], row['isolates'])
        df['graph_conncomp_per_para'][ind] = graph_conncomp_per_unit(row['sel_graph'], row["paragraphs"], row['isolates'])

        df['clustering_coef_per_sent'][ind] = clustering_coef_per_unit(row['sel_graph'], row["sentences"], row['isolates'])
        df['clustering_coef_per_para'][ind] = clustering_coef_per_unit(row['sel_graph'], row["paragraphs"], row['isolates'])

        df['graph_density_per_sent'][ind] = graph_density_per_unit(row['sel_graph'], row["sentences"], row['isolates'])
        df['graph_density_per_para'][ind] = graph_density_per_unit(row['sel_graph'], row["paragraphs"], row['isolates'])
        
        df['graph_density'][ind] = graph_density(row['sel_graph'], row['isolates'])
        
        df['pairwise_semrel_per_sent'][ind]=pairwise_semrel(row['sel_graph_data'], row["sentences"], row['isolates'])
        df['pairwise_semrel_per_para'][ind]=pairwise_semrel(row['sel_graph_data'], row["paragraphs"], row['isolates'])
    
    df.to_csv("csv/" + name + "_with_features.csv")
        
    return df

In [14]:
df = add_features_to_df(df_graph, 'britannica')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




<hr style="rgb(0,0,0);height: 15.0px;"/>

#### Show results: Britannica

In [39]:
#in case levels are not in order
#df.sort_values(['name', 'score'], ascending=[True, False], inplace=True)
#df.reset_index(inplace=True)
#df['level']=df.index%5
#df.head()
#df.to_csv('csv/newsela_with_features.csv')

In [32]:
df = load_data('csv/britannica_with_features.csv')

In [33]:
df.groupby('level').count()

Unnamed: 0_level_0,path,name,score,annotations,sentences,paragraphs,graph,sel_graph,isolates,sel_graph_data,...,graph_conncomp_per_sent,clustering_coef_per_sent,graph_density_per_sent,graph_density,graph_conncomp_per_para,clustering_coef_per_para,graph_density_per_para,pairwise_distance_per_para,pairwise_semrel_per_sent,pairwise_semrel_per_para
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,59,59,59,59,59,59,0,59,59,59,...,59,59,59,59,59,59,59,59,59,59
1,59,59,59,59,59,59,0,59,59,59,...,59,59,59,59,59,59,59,59,59,59
2,59,59,59,59,59,59,0,59,59,59,...,59,59,59,59,59,59,59,59,59,59


In [35]:
for feat in ['node_degree', 'clustering_coef', 'av_pagerank', 'pairwise_distance_per_sent', 
             'graph_conncomp_per_sent', 'clustering_coef_per_sent', 'graph_density_per_sent', 
             'graph_density', 'graph_conncomp_per_para', 'clustering_coef_per_para', 'graph_density_per_para', 
             'pairwise_distance_per_para', 'pairwise_semrel_per_sent', 'pairwise_semrel_per_para']:
    
        #convert to numeric for future operations
        df[feat] = df[feat].apply(pd.to_numeric)

In [36]:
#check if it worked
df.dtypes

path                           object
name                           object
score                          object
level                           int64
annotations                    object
sentences                      object
paragraphs                     object
graph                         float64
sel_graph                      object
isolates                       object
sel_graph_data                 object
node_degree                   float64
clustering_coef               float64
av_pagerank                   float64
pairwise_distance_per_sent    float64
graph_conncomp_per_sent       float64
clustering_coef_per_sent      float64
graph_density_per_sent        float64
graph_density                 float64
graph_conncomp_per_para       float64
clustering_coef_per_para      float64
graph_density_per_para        float64
pairwise_distance_per_para    float64
pairwise_semrel_per_sent      float64
pairwise_semrel_per_para      float64
dtype: object

#### Higher = simpler:
* pagerank
* node_degree
* density
* semrel

In [37]:
#display average feature values where higher=simpler
results = df.groupby('level')[['node_degree', 'graph_density_per_sent', 'graph_density_per_para',
                               "av_pagerank", 'graph_density', 'pairwise_semrel_per_sent',
                              'pairwise_semrel_per_para']].mean()
results

Unnamed: 0_level_0,node_degree,graph_density_per_sent,graph_density_per_para,av_pagerank,graph_density,pairwise_semrel_per_sent,pairwise_semrel_per_para
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.524435,0.181135,0.078909,0.006019,0.020829,0.065031,0.034302
1,0.502462,0.222466,0.080971,0.007857,0.025506,0.06924,0.034576
2,0.652699,0.249132,0.143085,0.015344,0.048913,0.072682,0.037052


In [38]:
#display changes in feature values where higher=simpler
for feat in ['node_degree', 'graph_density_per_sent', 'graph_density_per_para', "av_pagerank",  'graph_density']:
    
    feat_df = pd.DataFrame(0, index=[0,1,2,3,4], columns=[0,1,2,3,4])
    for ind,row in enumerate(results[feat]):
        for compare_ind in range(len(results[feat])):
            if compare_ind<=ind:
                feat_df[ind][compare_ind] += (results[feat][ind]*100 / results[feat][compare_ind])-100
    
    print(feat)
    print(feat_df.head())
    print('\n')

node_degree
   0  1   2  3  4
0  0 -4  24  0  0
1  0  0  29  0  0
2  0  0   0  0  0
3  0  0   0  0  0
4  0  0   0  0  0


graph_density_per_sent
   0   1   2  3  4
0  0  22  37  0  0
1  0   0  11  0  0
2  0   0   0  0  0
3  0   0   0  0  0
4  0   0   0  0  0


graph_density_per_para
   0  1   2  3  4
0  0  2  81  0  0
1  0  0  76  0  0
2  0  0   0  0  0
3  0  0   0  0  0
4  0  0   0  0  0


av_pagerank
   0   1    2  3  4
0  0  30  154  0  0
1  0   0   95  0  0
2  0   0    0  0  0
3  0   0    0  0  0
4  0   0    0  0  0


graph_density
   0   1    2  3  4
0  0  22  134  0  0
1  0   0   91  0  0
2  0   0    0  0  0
3  0   0    0  0  0
4  0   0    0  0  0




#### Lower = simpler:
* clustering_coef
* pairwise distance
* conncomp

In [39]:
#display average values where lower=simpler
df.groupby('level')[['clustering_coef', 'clustering_coef_per_sent', 'clustering_coef_per_para',
                               'pairwise_distance_per_sent', 'pairwise_distance_per_para', 
                               'graph_conncomp_per_sent', 'graph_conncomp_per_para']].mean()

Unnamed: 0_level_0,clustering_coef,clustering_coef_per_sent,clustering_coef_per_para,pairwise_distance_per_sent,pairwise_distance_per_para,graph_conncomp_per_sent,graph_conncomp_per_para
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.119313,7.134127,7.76649,1.164052,1.072889,2.318567,6.409737
1,0.110549,6.337198,6.826522,1.150274,1.076909,1.924071,5.073012
2,0.115842,4.715268,5.010809,1.025489,0.998849,1.567241,3.43755


In [40]:
results = df.groupby('level')[['clustering_coef', 'clustering_coef_per_sent', 'clustering_coef_per_para',
                               'pairwise_distance_per_sent', 'pairwise_distance_per_para', 
                               'graph_conncomp_per_sent', 'graph_conncomp_per_para']].mean()
results = results.sort_values('level', ascending=False).reset_index(drop=True)

In [41]:
#display changes in feature values where lower=simpler
for feat in ['clustering_coef', 'clustering_coef_per_sent', 'clustering_coef_per_para',
                               'pairwise_distance_per_sent', 'pairwise_distance_per_para', 
                               'graph_conncomp_per_sent', 'graph_conncomp_per_para']:
    
    feat_df = pd.DataFrame(0, index=[0,1,2,3,4], columns=[0,1,2,3,4])
    for ind,row in enumerate(results[feat]):
        for compare_ind in range(len(results[feat])):
             if compare_ind<=ind:
                feat_df[ind][compare_ind] += (results[feat][compare_ind]*100 / results[feat][ind])-100

    print(feat)
    print(feat_df.head())
    print('\n')

clustering_coef
   0  1  2  3  4
0  0  4 -2  0  0
1  0  0 -7  0  0
2  0  0  0  0  0
3  0  0  0  0  0
4  0  0  0  0  0


clustering_coef_per_sent
   0   1   2  3  4
0  0 -25 -33  0  0
1  0   0 -11  0  0
2  0   0   0  0  0
3  0   0   0  0  0
4  0   0   0  0  0


clustering_coef_per_para
   0   1   2  3  4
0  0 -26 -35  0  0
1  0   0 -12  0  0
2  0   0   0  0  0
3  0   0   0  0  0
4  0   0   0  0  0


pairwise_distance_per_sent
   0   1   2  3  4
0  0 -10 -11  0  0
1  0   0  -1  0  0
2  0   0   0  0  0
3  0   0   0  0  0
4  0   0   0  0  0


pairwise_distance_per_para
   0  1  2  3  4
0  0 -7 -6  0  0
1  0  0  0  0  0
2  0  0  0  0  0
3  0  0  0  0  0
4  0  0  0  0  0


graph_conncomp_per_sent
   0   1   2  3  4
0  0 -18 -32  0  0
1  0   0 -17  0  0
2  0   0   0  0  0
3  0   0   0  0  0
4  0   0   0  0  0


graph_conncomp_per_para
   0   1   2  3  4
0  0 -32 -46  0  0
1  0   0 -20  0  0
2  0   0   0  0  0
3  0   0   0  0  0
4  0   0   0  0  0




<hr style="rgb(0,0,0);height: 15.0px;"/>

#### Check which differences are statistically significant

#### Britannica

In [42]:
features = ['node_degree', 'clustering_coef', 'av_pagerank', 'pairwise_distance_per_sent', 
             'graph_conncomp_per_sent', 'clustering_coef_per_sent', 'graph_density_per_sent', 
             'graph_density', 'graph_conncomp_per_para', 'clustering_coef_per_para', 'graph_density_per_para', 
             'pairwise_distance_per_para', 'pairwise_semrel_per_sent', 'pairwise_semrel_per_para']

In [43]:
statistically_significant = []

In [48]:
for feat in features:    
    scholar = df[feat].loc[(df['level'] == 0)]
    student = df[feat].loc[(df['level'] == 1)]
    kid = df[feat].loc[(df['level'] == 2)]
    
    print("p-values for feature: ", feat)
    print('kids vs. students: ', "{0:.2f}".format(wilcoxon(kid, student)[1]))
    if wilcoxon(kid, student)[1] < 0.01: statistically_significant.append((feat, 'kid-student'))
    print('kids vs. scholars: ',  "{0:.2f}".format(wilcoxon(kid, scholar)[1]))
    if wilcoxon(kid, scholar)[1] < 0.01: statistically_significant.append((feat, 'kid-scholar'))
    print('student vs. scholars: ', "{0:.2f}".format(wilcoxon(student, scholar)[1]))
    if wilcoxon(student, scholar)[1] < 0.01: statistically_significant.append((feat, 'student-scholar'))
    print('\n')

p-values for feature:  node_degree
kids vs. students:  0.01
kids vs. scholars:  0.00
student vs. scholars:  0.57


p-values for feature:  clustering_coef
kids vs. students:  0.58
kids vs. scholars:  0.70
student vs. scholars:  0.62


p-values for feature:  av_pagerank
kids vs. students:  0.00
kids vs. scholars:  0.00
student vs. scholars:  0.01


p-values for feature:  pairwise_distance_per_sent
kids vs. students:  0.00
kids vs. scholars:  0.00
student vs. scholars:  0.53


p-values for feature:  graph_conncomp_per_sent
kids vs. students:  0.00
kids vs. scholars:  0.00
student vs. scholars:  0.00


p-values for feature:  clustering_coef_per_sent
kids vs. students:  0.00
kids vs. scholars:  0.00
student vs. scholars:  0.03


p-values for feature:  graph_density_per_sent
kids vs. students:  0.02
kids vs. scholars:  0.00
student vs. scholars:  0.00


p-values for feature:  graph_density
kids vs. students:  0.00
kids vs. scholars:  0.00
student vs. scholars:  0.01


p-values for feature:  

In [49]:
print("Statistically siginicant differences:")
for tup in statistically_significant:
    print('Feature ', tup[0], ' for pair ', tup[1])

Statistically siginicant differences:
Feature  node_degree  for pair  kid-student
Feature  node_degree  for pair  kid-scholar
Feature  av_pagerank  for pair  kid-student
Feature  av_pagerank  for pair  kid-scholar
Feature  av_pagerank  for pair  student-scholar
Feature  pairwise_distance_per_sent  for pair  kid-student
Feature  pairwise_distance_per_sent  for pair  kid-scholar
Feature  graph_conncomp_per_sent  for pair  kid-student
Feature  graph_conncomp_per_sent  for pair  kid-scholar
Feature  graph_conncomp_per_sent  for pair  student-scholar
Feature  clustering_coef_per_sent  for pair  kid-student
Feature  clustering_coef_per_sent  for pair  kid-scholar
Feature  graph_density_per_sent  for pair  kid-scholar
Feature  graph_density_per_sent  for pair  student-scholar
Feature  graph_density  for pair  kid-student
Feature  graph_density  for pair  kid-scholar
Feature  graph_conncomp_per_para  for pair  kid-student
Feature  graph_conncomp_per_para  for pair  kid-scholar
Feature  graph_c

#### Newsela

In [50]:
df = load_data('csv/newsela_with_features.csv')

In [54]:
features = ['node_degree', 'clustering_coef', 'av_pagerank', 'pairwise_distance_per_sent', 
             'graph_conncomp_per_sent', 'clustering_coef_per_sent', 'graph_density_per_sent', 
             'pairwise_semrel_per_sent']


statistically_significant = []

In [55]:
for feat in features:
    levels = {'level0' : df[feat].loc[(df['level'] == 0)],
    'level1' : df[feat].loc[(df['level'] == 1)],
    "level2" : df[feat].loc[(df['level'] == 2)],
    'level3' : df[feat].loc[(df['level'] == 3)],
    'level4' : df[feat].loc[(df['level'] == 4)]}
        
    for (levela, levelb) in itertools.combinations(levels, 2):
        print("p-values for feature: ", feat)
        print(str(levela) + ' vs. ', str(levelb), "{0:.2f}".format(wilcoxon(levels[levela], levels[levelb])[1]))
        if wilcoxon(levels[levela], levels[levelb])[1] < 0.01: statistically_significant.append((feat, (levela,levelb)))


p-values for feature:  node_degree
level0 vs.  level1 0.11
p-values for feature:  node_degree
level0 vs.  level2 0.00
p-values for feature:  node_degree
level0 vs.  level3 0.00
p-values for feature:  node_degree
level0 vs.  level4 0.00
p-values for feature:  node_degree
level1 vs.  level2 0.01
p-values for feature:  node_degree
level1 vs.  level3 0.01
p-values for feature:  node_degree
level1 vs.  level4 0.00
p-values for feature:  node_degree
level2 vs.  level3 0.05
p-values for feature:  node_degree
level2 vs.  level4 0.00
p-values for feature:  node_degree
level3 vs.  level4 0.00
p-values for feature:  clustering_coef
level0 vs.  level1 0.14
p-values for feature:  clustering_coef
level0 vs.  level2 0.92
p-values for feature:  clustering_coef
level0 vs.  level3 0.80
p-values for feature:  clustering_coef
level0 vs.  level4 0.66
p-values for feature:  clustering_coef
level1 vs.  level2 0.03
p-values for feature:  clustering_coef
level1 vs.  level3 0.04
p-values for feature:  clusterin

In [53]:
print("Statistically siginicant differences:")
for tup in statistically_significant:
    print('Feature ', tup[0], ' for pair ', tup[1])

Statistically siginicant differences:
Feature  node_degree  for pair  ('level0', 'level2')
Feature  node_degree  for pair  ('level0', 'level3')
Feature  node_degree  for pair  ('level0', 'level4')
Feature  node_degree  for pair  ('level1', 'level3')
Feature  node_degree  for pair  ('level1', 'level4')
Feature  node_degree  for pair  ('level2', 'level4')
Feature  node_degree  for pair  ('level3', 'level4')
Feature  av_pagerank  for pair  ('level0', 'level1')
Feature  av_pagerank  for pair  ('level0', 'level2')
Feature  av_pagerank  for pair  ('level0', 'level3')
Feature  av_pagerank  for pair  ('level0', 'level4')
Feature  av_pagerank  for pair  ('level1', 'level2')
Feature  av_pagerank  for pair  ('level1', 'level3')
Feature  av_pagerank  for pair  ('level1', 'level4')
Feature  av_pagerank  for pair  ('level2', 'level3')
Feature  av_pagerank  for pair  ('level2', 'level4')
Feature  av_pagerank  for pair  ('level3', 'level4')
Feature  pairwise_distance_per_sent  for pair  ('level0', 'le

<hr style="rgb(0,0,0);height: 15.0px;"/>

#### Exclusivity-based semantic relatedness used for outlier removal
* the clean_graph function is a modified version of pairwise_semrel that returns cleaned annotations
* running the below cells adds a 'cleaned_anno' column to the DF
* was tested on the first 45 texts of britannica
* saved as britannica_semrel.csv

In [2]:
def clean_graph(graph, isolates, path_length=5, decay_factor=0.25, threshold=0.00001): 
    #second argument can be paragraphs or sentences
    #based on Hulpus et al. (2015)
    
    cleaned_annotations=[]
    values = []
        
    graph = convert.from_edgelist((literal_eval(graph)))

    graph_prop = (graph.edges.data())

    graph_relatedness = defaultdict(list)
           
    exclusivities = defaultdict(float)
        
    for (node1, node2) in graph.edges():   #two related nodes can have more properties connecting them
                # for each property connecting two nodes
                prop = [tup[2]["prop"] for tup in graph_prop if tup[0]==node1 and tup[1]==node2]
                #their exclusivity depends on how many other edges bear the same property
                if (len([tup for tup in graph_prop if (tup[0]==node1 or tup[1]==node1) and tup[2]["prop"] in prop]) +  
                                   len([tup for tup in graph_prop if (tup[1]==node2 or tup[0]==node2) and tup[2]["prop"] in prop])) != 0: 
              
                    exclusivity = 1 / (len([tup for tup in graph_prop if (tup[0]==node1 or tup[1]==node1) and tup[2]["prop"] in prop]) +  
                                       len([tup for tup in graph_prop if (tup[1]==node2 or tup[0]==node2) and tup[2]["prop"] in prop])-1)  
                    #print(node1,node2,exclusivity)
                    
                    exclusivities[((node1,node2))] += exclusivity
        
    for node1, node2 in (list(itertools.combinations(list(graph.nodes()), 2))): 
                #the relatedness is calculated for each pair of nodes
                relatedness = 0                                                  
                sum_of_exc = 0

                paths = list(networkx.simple_paths.all_simple_paths(graph, node1, node2, 5))
                for path in paths:  #if there is a path of length at most path_lenght between them                
                    for related_nodes in list(zip(path, path[1:])):   #we check all connecting edges                    
                        sum_of_exc += exclusivities[(related_nodes[0], related_nodes[1])] 

                if sum_of_exc != 0:               
                    weight = 1 /  (1/sum_of_exc) #weight is for a path                             
                    length = (len(path))

                    relatedness += (weight * decay_factor ** length) #relatedness is for node pair
                  
               # print(node1,node2,relatedness)
                graph_relatedness[node1].append(relatedness)
                graph_relatedness[node2].append(relatedness)
  
    #if (list(graph_relatedness.values())) != []:
         #   values.append(np.mean(list(graph_relatedness.values())))
    
    for key in graph_relatedness:
        if np.mean(graph_relatedness[key]) > threshold:
                cleaned_annotations.append(key)  
            
    return cleaned_annotations

In [None]:
cleaned_anno = []

for ind,row in df[:45].iterrows():
    print(ind)
    g = df.iloc[ind]['sel_graph_data']
    s = df.iloc[ind]['sentences']
    i = df.iloc[ind]['isolates']   
    cleaned_anno.append(clean_graph(g,s,i))

In [None]:
df['cleaned_anno']=cleaned_anno

In [None]:
df.to_csv('csv/britannica_semrel.csv')