In [1]:
# Load the nb_black notebook extension (used here for pretty, consistently formatted code cells).
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# imports
import numpy as np
import pandas as pd
import statsmodels.api as sm

from newsrelations.query_db.relation_query import DBQueryHandlerCoocc
from newsrelations.helper_classes.synonym_handler import SynonymHandler
from newsrelations.metrics.distances import DistanceMeasure

<IPython.core.display.Javascript object>

In [15]:
# input variables

# Relation-model files to compare. The first entry is the reference model
# that all other models are compared with (see build_contingency_table).
RELATION_MODELS = [
    "RMadvanced_2008_politics_nytimes.sqlite",
    "RM_2008_politics_nytimes.sqlite",
    "RM_2008-2010_politics_foxnews.sqlite",
]
# Location of the relation-model files listed above.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
RELATION_MODELS_PATH = "/home/jonas/.local/share/NewsRelations/models/secondTry/"

# Number of top co-occurring entities to extract from the reference model.
NO_ENTITIES = 3
# Entity whose co-occurrences are compared across the models.
TOPIC_OF_INTEREST = "united_states"

<IPython.core.display.Javascript object>

In [16]:
def build_contingency_table(
    RELATION_MODELS_PATH, RELATION_MODELS, TOPIC_OF_INTEREST, NO_ENTITIES=10
):
    """
    Build a contingency table of co-occurrence counts for TOPIC_OF_INTEREST across several relation models.

    The first entry of RELATION_MODELS is the reference model. Its top NO_ENTITIES co-occurrences
    (from DistanceMeasure.get_top_co_occurrences) provide the entity labels and the first row of counts.
    For every remaining model, the count for an entity is the number of items returned by
    DBQueryHandlerCoocc.select_relations for the lower-cased topic and the lower-cased entity.

    input:  RELATION_MODELS_PATH = str
            RELATION_MODELS = list (reference model first)
            TOPIC_OF_INTEREST = str
            NO_ENTITIES = int (default 10)

    output: contingency_table = pandas DataFrame [rows = different models, columns = entities]
    """
    reference_model = str(RELATION_MODELS[0])

    # the reference model decides which entities end up in the table
    distance_measure = DistanceMeasure(RELATION_MODELS_PATH, reference_model)
    top_co_occurrences = distance_measure.get_top_co_occurrences(
        TOPIC_OF_INTEREST, cutoff=NO_ENTITIES, e1_is_synset=0, e2_is_synset=0
    )

    # each result item is used as (entity label, count)
    reference_counts = np.array([pair[1] for pair in top_co_occurrences])
    entity_labels = [pair[0] for pair in top_co_occurrences]

    # built entity-by-model first (one column per model), transposed on return
    table = pd.DataFrame(
        reference_counts, index=entity_labels, columns=[reference_model]
    )

    def count_relations(handler, entity):
        # number of relations stored between the topic and this entity
        matches = handler.select_relations(
            e1=TOPIC_OF_INTEREST.lower(),
            e2=entity.lower(),
            e1_is_synset=0,
            e2_is_synset=0,
        )
        return len(list(matches))

    # every non-reference model contributes one more column of counts
    for model in RELATION_MODELS[1:]:
        handler = DBQueryHandlerCoocc(RELATION_MODELS_PATH, model)
        table[str(model)] = [
            count_relations(handler, entity) for entity in table.index
        ]

    # flip to the documented layout: one row per model, one column per entity
    return table.T

<IPython.core.display.Javascript object>

In [17]:
# Build the model-by-entity co-occurrence table from the input variables defined above.
contingency_table = build_contingency_table(
    RELATION_MODELS_PATH, RELATION_MODELS, TOPIC_OF_INTEREST, NO_ENTITIES
)

<IPython.core.display.Javascript object>

In [18]:
def chi_squared(contingency_table):
    """
    Run association tests on a contingency table and print a report.

    The table is wrapped in a statsmodels ``Table``. The report prints, in order:
    the original table, the fitted (expected) values, the per-cell chi-square
    contributions, the result of ``test_nominal_association`` and finally the
    result of ``test_ordinal_association``.

    input: contingency_table

    output: None (everything is printed)
    """
    table = sm.stats.Table(contingency_table)
    nominal_result = table.test_nominal_association()

    # (heading, Table attribute) pairs; each attribute is looked up only when
    # its section is printed
    sections = (
        ("Original contingency table:", "table_orig"),
        ("\nExpected values:", "fittedvalues"),
        ("\nChi-square contributions:", "chi2_contribs"),
    )
    for heading, attribute in sections:
        print(heading)
        print(getattr(table, attribute))

    print("\nResults:")
    print(nominal_result)

    ordinal_result = table.test_ordinal_association()
    print("\nOrdinal test results:")
    print(ordinal_result)


<IPython.core.display.Javascript object>

In [19]:
# Print the association-test report for the contingency table built above.
chi_squared(contingency_table)

Original contingency table:
                                         washington  u.s.  bush
RMadvanced_2008_politics_nytimes.sqlite         524   416   406
RM_2008_politics_nytimes.sqlite                 489   411   379
RM_2008-2010_politics_foxnews.sqlite             82   129    53

Expected values:
                                         washington        u.s.        bush
RMadvanced_2008_politics_nytimes.sqlite  510.166147  445.405331  390.428522
RM_2008_politics_nytimes.sqlite          484.771547  423.234337  370.994116
RM_2008-2010_politics_foxnews.sqlite     100.062305   87.360332   76.577362

Chi-square contributions:
                                         washington       u.s.      bush
RMadvanced_2008_politics_nytimes.sqlite    0.375124   1.941318  0.621038
RM_2008_politics_nytimes.sqlite            0.036883   0.353655  0.172763
RM_2008-2010_politics_foxnews.sqlite       3.260437  19.847245  7.259221

Results:
df          4
pvalue      7.932297215740292e-07
statistic   33.86

<IPython.core.display.Javascript object>