In [71]:
# for pretty code
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [72]:
# imports
import numpy as np
import pandas as pd
import statsmodels.api as sm

from newsrelations.query_db.relation_query import DBQueryHandlerCoocc
from newsrelations.helper_classes.synonym_handler import SynonymHandler
from newsrelations.metrics.distances import DistanceMeasure

<IPython.core.display.Javascript object>

In [90]:
# input variables

# file names of the relation models to compare; each name is passed together
# with RELATION_MODELS_PATH to the model loaders, and the first entry is used
# as the reference model by the table-building functions
RELATION_MODELS = [
    "RMadvanced_2008_politics_nytimes.sqlite",
    "RM_2008_politics_nytimes.sqlite",
    "RM_2008-2010_politics_foxnews.sqlite",
]

# NOTE(review): presumably a sanity-check pair of closely related models;
# not referenced by any cell visible here -- confirm it is still needed
test_RM_similar = [
    "RMadvanced_2008_politics_nytimes.sqlite",
    "RM_2008_politics_nytimes.sqlite",
]
# path to the directory of all relation models
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere
RELATION_MODELS_PATH = "/home/jonas/.local/share/NewsRelations/models/secondTry/"
# number of entities you want to compare (passed as the `cutoff` of the top co-occurrence query)
NO_ENTITIES = 5
# reference entity for the single-topic contingency table
TOPIC_OF_INTEREST = "united_states"

# reference entities for the topic-list contingency table;
# the first entry determines which co-occurring entities become the columns
TOPIC_OF_INTEREST_LIST = ["united_states", "germany", "china", "colombia", "namibia"]

<IPython.core.display.Javascript object>

In [84]:
def build_contingency_table_from_single_topic(
    relation_models_path, relation_models, topic_of_interest, no_entities=10
):
    """
    Build a contingency table (models x entities) of co-occurrence counts for one topic.

    The first entry of [relation_models] is the reference model: it is queried for its
    top co-occurrences of [topic_of_interest] (with cutoff = [no_entities]) and those
    entities become the columns of the table. For every remaining model, the count for
    an entity is the number of relations returned by ``select_relations`` between the
    lower-cased topic and the lower-cased entity.

    input:  relation_models_path = str (directory containing the model files)
            relation_models = list (model file names, reference model first)
            topic_of_interest = str
            no_entities = int (standard 10)

    output: contingency_table = pandas DataFrame [rows = different models, columns = entities]
    """
    reference_name = str(relation_models[0])

    # the reference model decides which entities are compared
    measure = DistanceMeasure(relation_models_path, reference_name)
    ranked = measure.get_top_co_occurrences(
        topic_of_interest, cutoff=no_entities, e1_is_synset=0, e2_is_synset=0
    )

    # each ranked item is indexed as (entity, count)
    reference_counts = np.array([item[1] for item in ranked])
    entities = [item[0] for item in ranked]

    # built entity-major first (one column per model), flipped at the end
    table = pd.DataFrame(reference_counts, index=entities, columns=[reference_name])

    for model_name in relation_models[1:]:
        handler = DBQueryHandlerCoocc(relation_models_path, model_name)
        # one count per entity: how many relations this model holds for the pair
        table[str(model_name)] = [
            len(
                list(
                    handler.select_relations(
                        e1=topic_of_interest.lower(),
                        e2=entity.lower(),
                        e1_is_synset=0,
                        e2_is_synset=0,
                    )
                )
            )
            for entity in table.index
        ]

    # rows = models, columns = entities
    return table.transpose()

<IPython.core.display.Javascript object>

In [75]:
# models-by-entities table for the single reference topic
contingency_table = build_contingency_table_from_single_topic(
    relation_models_path=RELATION_MODELS_PATH,
    relation_models=RELATION_MODELS,
    topic_of_interest=TOPIC_OF_INTEREST,
    no_entities=NO_ENTITIES,
)

contingency_table

Unnamed: 0,washington,u.s.,bush,iraq,russia
RMadvanced_2008_politics_nytimes.sqlite,524,416,406,312,259
RM_2008_politics_nytimes.sqlite,489,411,379,294,260
RM_2008-2010_politics_foxnews.sqlite,82,129,53,34,8


<IPython.core.display.Javascript object>

In [85]:
def build_contingency_table_from_topic_list(
    relation_models_path, relation_models, topic_of_interest_list, no_entities=10
):
    """
    Build a contingency table of co-occurrence counts for several topics across models.

    The first entry of [relation_models] is queried for its top co-occurrences of the
    first topic in [topic_of_interest_list] (with cutoff = [no_entities]); those entities
    become the columns of the table. Then, for every model (including the first one) and
    every topic, the count for an entity is the number of relations returned by
    ``select_relations`` between the lower-cased topic and the lower-cased entity.
    Rows are labelled "<topic> (<model position>)", positions starting at 0.

    input:  relation_models_path = str (directory containing the model files)
            relation_models = list (model file names, reference model first)
            topic_of_interest_list = list (topics, the first one selects the entities)
            no_entities = int (standard 10)

    output: contingency_table = pandas DataFrame [rows = topic/model pairs, columns = entities]
    """
    seed_topic = topic_of_interest_list[0]

    # the first model and first topic decide which entities are compared
    measure = DistanceMeasure(relation_models_path, str(relation_models[0]))
    ranked = measure.get_top_co_occurrences(
        seed_topic, cutoff=no_entities, e1_is_synset=0, e2_is_synset=0
    )

    # each ranked item is indexed as (entity, count); this seed column carries the
    # same label as the first column written in the loop below, which overwrites it
    seed_counts = np.array([item[1] for item in ranked])
    entities = [item[0] for item in ranked]
    table = pd.DataFrame(seed_counts, index=entities, columns=[f"{seed_topic} (0)"])

    for model_idx, model_name in enumerate(relation_models):
        handler = DBQueryHandlerCoocc(relation_models_path, model_name)

        for topic in topic_of_interest_list:
            # one count per entity: how many relations this model holds for the pair
            table[f"{topic} ({model_idx})"] = [
                len(
                    list(
                        handler.select_relations(
                            e1=topic.lower(),
                            e2=entity.lower(),
                            e1_is_synset=0,
                            e2_is_synset=0,
                        )
                    )
                )
                for entity in table.index
            ]

    # rows = topic/model pairs, columns = entities
    return table.transpose()

<IPython.core.display.Javascript object>

In [91]:
# topic/model-by-entities table for the whole topic list
contingency_table_from_list = build_contingency_table_from_topic_list(
    relation_models_path=RELATION_MODELS_PATH,
    relation_models=RELATION_MODELS,
    topic_of_interest_list=TOPIC_OF_INTEREST_LIST,
    no_entities=NO_ENTITIES,
)

contingency_table_from_list

Unnamed: 0,washington,u.s.,bush,iraq,russia
united_states (0),524,416,406,312,259
germany (0),66,60,58,39,95
china (0),110,107,92,35,114
colombia (0),22,10,21,6,5
namibia (0),2,0,0,1,1
united_states (1),489,411,379,294,260
germany (1),65,60,58,38,97
china (1),107,110,86,34,113
colombia (1),20,10,19,5,5
namibia (1),2,0,0,1,1


<IPython.core.display.Javascript object>

In [92]:
def chi_squared(contingency_table):
    """
    This function conducts a chi-squared test of independence between the different rows of a contingency table
    
    input: contingency_table
    
    ouput: None
    """
    contingency_table = sm.stats.Table(contingency_table)
    results = contingency_table.test_nominal_association()
    
    """
    # orig contingency table
    print("Original contingency table:")
    print(contingency_table.table_orig)
    # expected values
    print("\nExpected values:")
    print(contingency_table.fittedvalues)
    # residual 
    print("\nChi-square contributions:")
    print(contingency_table.chi2_contribs)
    print("\nResults:")
    """
    print(results)
    
    
    results = contingency_table.test_ordinal_association()
    #print("\nOrdinal test results:")
    #print(results)
    
    return


<IPython.core.display.Javascript object>

In [93]:
# chi-squared test on the single-topic table (rows = models, columns = entities)
chi_squared(contingency_table)

df          8
pvalue      1.1324274851176597e-14
statistic   83.14424052352445


<IPython.core.display.Javascript object>

In [94]:
# chi-squared test on the topic-list table (rows = topic/model pairs, columns = entities)
chi_squared(contingency_table_from_list)

df          56
pvalue      0.0
statistic   316.6076887916231


<IPython.core.display.Javascript object>