In [None]:
# for pretty code
%load_ext nb_black

# Imports and functions

In [None]:
# imports
import numpy as np
import pandas as pd
import statsmodels.api as sm
import multiprocessing as mp

from newsrelations.query_db.relation_query import DBQueryHandlerCoocc
from newsrelations.helper_classes.synonym_handler import SynonymHandler
from newsrelations.metrics.distances import DistanceMeasure

In [None]:
def build_contingency_table_from_single_topic(
    relation_models_path, relation_models, topic_of_interest, no_entities=10
):
    """
    Build a contingency table of co-occurrence counts for one topic across several relation models.

    The first entry of [relation_models] is the reference model. Its top [no_entities]
    co-occurrences for [topic_of_interest] (as returned by get_top_co_occurrences) provide the
    entities of the table and the values of the reference row. For every remaining model the
    row holds, per entity, the number of items select_relations() yields for the lower-cased
    topic and the lower-cased entity.

    input:  relation_models_path = str (location of the relation models)
            relation_models = list (model names, reference model first)
            topic_of_interest = str
            no_entities = int (standard 10)

    output: contingency_table = pandas DataFrame [rows = different models, columns = entities]
    """
    reference_name = str(relation_models[0])

    # the reference model decides which entities end up in the table
    reference = DistanceMeasure(relation_models_path, reference_name)
    ranked = reference.get_top_co_occurrences(
        topic_of_interest, cutoff=no_entities, e1_is_synset=0, e2_is_synset=0
    )

    # entities on the index for now (one column per model); transposed before returning
    # NOTE(review): each item of `ranked` is presumably an (entity, count) pair - confirm
    contingency_table = pd.DataFrame(
        np.array([item[1] for item in ranked]),
        index=[item[0] for item in ranked],
        columns=[reference_name],
    )

    # every other model contributes one column: number of relations found per entity
    for model in relation_models[1:]:
        handler = DBQueryHandlerCoocc(relation_models_path, model)
        contingency_table[str(model)] = [
            len(
                list(
                    handler.select_relations(
                        e1=topic_of_interest.lower(),
                        e2=entity.lower(),
                        e1_is_synset=0,
                        e2_is_synset=0,
                    )
                )
            )
            for entity in contingency_table.index
        ]

    # rows = models, columns = entities
    return contingency_table.transpose()

In [None]:
def explain_models(relation_models):
    """
    Print every model of [relation_models] together with its position in the list,
    one line per model in the form "model(<index>): <model>".

    Input: relation_models = list

    Output: None
    """
    for position, model in enumerate(relation_models):
        print(f"model({position}): {model!s}")
    return None

In [None]:
def build_contingency_table_from_topic_list(
    relation_models_path, relation_models, topic_of_interest_list, no_entities=10
):
    """
    Build a contingency table of co-occurrence counts for a list of topics across several
    relation models.

    The first model in [relation_models] is the reference model. Its top [no_entities]
    co-occurrences for the FIRST topic of [topic_of_interest_list] provide the entities of
    the table. Afterwards every model (the reference model included) is queried for every
    topic; each value is the number of items select_relations() yields for the lower-cased
    topic and the lower-cased entity.

    The models and their indices are printed via explain_models() so that the row labels
    "<topic> (<model index>)" can be resolved.

    input:  relation_models = list
            relation_models_path = str
            topic_of_interest_list = list
            no_entities = int (standard 10)

    output: contingency_table = pandas DataFrame
            [rows = one per topic and model, labelled "<topic> (<model index>)", columns = entities]
    """
    # print the legend for the model indices used in the row labels
    explain_models(relation_models)

    # the reference model decides which entities end up in the table
    reference = DistanceMeasure(relation_models_path, str(relation_models[0]))
    ranked = reference.get_top_co_occurrences(
        topic_of_interest_list[0], cutoff=no_entities, e1_is_synset=0, e2_is_synset=0
    )

    # seed the table with the reference values; this column ("<first topic> (0)") is
    # overwritten in the loop below, so effectively only the entity index is kept
    contingency_table = pd.DataFrame(
        np.array([item[1] for item in ranked]),
        index=[item[0] for item in ranked],
        columns=[str(topic_of_interest_list[0]) + " (" + str(0) + ")"],
    )

    for model_idx, model in enumerate(relation_models):
        handler = DBQueryHandlerCoocc(relation_models_path, model)

        for topic in topic_of_interest_list:
            # number of relations between this topic and every entity of the table
            counts = [
                len(
                    list(
                        handler.select_relations(
                            e1=topic.lower(),
                            e2=entity.lower(),
                            e1_is_synset=0,
                            e2_is_synset=0,
                        )
                    )
                )
                for entity in contingency_table.index
            ]
            contingency_table[f"{topic!s} ({model_idx})"] = counts

    # rows = topic/model combinations, columns = entities
    return contingency_table.transpose()

In [None]:
def compare_entity_lists(
    relation_models_path, relation_models, topic_list1, topic_list2
):
    """
    Build a contingency table of co-occurrence counts between two entity lists
    [topic_list1] and [topic_list2] for every model in [relation_models].

    Each value is the number of items select_relations() yields for a lower-cased entity of
    topic_list1 and a lower-cased entity of topic_list2. The models and their indices are
    printed via explain_models() so that the row labels "<topic1> (<model index>)" can be
    resolved.

    input:  relation_models = list
            relation_models_path = str
            topic_list1 = list
            topic_list2 = list

    output: contingency_table = pandas DataFrame
            [rows = one per topic_list1 entity and model, columns = topic_list2 entities]
    """
    # print the legend for the model indices used in the row labels
    explain_models(relation_models)

    # entities of topic_list2 on the index for now; transposed before returning
    contingency_table = pd.DataFrame(index=topic_list2)

    for model_idx, model in enumerate(relation_models):
        handler = DBQueryHandlerCoocc(relation_models_path, model)

        for topic1 in topic_list1:
            # number of relations between topic1 and every entity of topic_list2
            counts = [
                len(
                    list(
                        handler.select_relations(
                            e1=topic1.lower(),
                            e2=topic2.lower(),
                            e1_is_synset=0,
                            e2_is_synset=0,
                        )
                    )
                )
                for topic2 in topic_list2
            ]
            contingency_table[f"{topic1!s} ({model_idx})"] = counts

    return contingency_table.transpose()

In [None]:
def chi_squared(contingency_table, print_orig = False, print_expect = False, print_chi_contr = False):
    """
    This function conducts a chi-squared test of independence between the different rows of a contingency table
    
    input: contingency_table
    
    ouput: None
    """
    contingency_table = sm.stats.Table(contingency_table)
    results = contingency_table.test_nominal_association()
    
    
    # orig contingency table
    if print_orig == True:
        print("Original contingency table:")
        print(contingency_table.table_orig)
    # expected values
    if print_expect == True:
        print("\nExpected values:")
        print(contingency_table.fittedvalues)
    # chi-squared contributions
    if print_chi_contr == True:
        print("\nChi-square contributions:")
        print(contingency_table.chi2_contribs)
    
    # results
    print("\nResults:")
    print(results)
    
    
    results = contingency_table.test_ordinal_association()
    #print("\nOrdinal test results:")
    #print(results)
    
    return


# Variables

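Variable setups for the different model runs (all scraped from commoncrawl.com). Each setup cell below overwrites `RELATION_MODELS_PATH` and `RELATION_MODELS`, so only the setup that was executed last is in effect; with "Run All" that is the Fourth Try.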

In [None]:
# First Try
# general tryout of the NewsRelations library

# directory that holds the relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/firstTry"

# model file names
RELATION_MODELS = ["model.sqlite"]

In [None]:
# Second Try
# models from news sources with different biases
#
# -year:    2008 (for foxnews 2008-2010)
# -domain:  politics
# -sources: NYT & foxnews

# directory that holds the relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/secondTry"

# model file names (the first entry is used as the reference model by the table builders)
RELATION_MODELS = [
    "RMadvanced_2008_politics_nytimes.sqlite",
    "RM_2009_politics_nytimes.sqlite",
    "RM_2008-2010_politics_foxnews.sqlite"
]



In [None]:
# Third Try
# models from news sources with different biases
#
# -year:    2011
# -domain:  news
# -sources: Reuters & National Public Radio (NPR)

# directory that holds the relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/thirdTry"

# model file names (the first entry is used as the reference model by the table builders)
RELATION_MODELS = ["RM_2011_news_reuters.sqlite", "RM_2011_news_npr.sqlite"]



In [None]:
# Fourth Try
# two models per political slant, for comparing slant coherence within the
# different directions. Time slots with gapless news reporting were chosen.
#
# -period:  2011-01-01 - 2011-03-31
# -domain:  news
# -sources (left):   New York Times (NYT) 637, Washington Post (WP) 508
#          (center): National Public Radio (NPR) 109, Reuters (RET) 300
#          (right):  FoxNews (FN) 2735, Newsmax (NM) 180

# directory that holds the relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/fourthTry"

# short aliases per slant: l = left, c = center, r = right
l1 = "RM_2011_news_NYT.sqlite"
l2 = "RM_2011_news_WP.sqlite"
c1 = "RM_2011_news_NPR.sqlite"
c2 = "RM_2011_news_RET.sqlite"
r1 = "RM_2011_news_FN.sqlite"
r2 = "RM_2011_news_NM.sqlite"

# model file names, ordered left - center - right
RELATION_MODELS = [l1, l2, c1, c2, r1, r2]

General global variables

In [None]:
# number of top co-occurring entities to compare
NO_ENTITIES = 10

# single reference entity
TOPIC_OF_INTEREST = "united_states"

# entities the reference entities are compared against
# (passed as topic_list2 to compare_entity_lists)
TOPIC_OF_INTEREST_LIST2 = ["united_states", "germany", "russia", "india", "china"]

# list of reference entities
# (passed as topic_list1 to compare_entity_lists and as topic list to
# build_contingency_table_from_topic_list)
TOPIC_OF_INTEREST_LIST = ["united_states"]

# Tests

In [None]:
# Pairwise comparison: each right-leaning model (reference, first entry of a pair)
# against every left-leaning and center model.
# The pairs live under their own name so that RELATION_MODELS keeps holding the flat
# list of model names that the following cells slice and pass on.
MODEL_PAIRS = [
    [r1, l1],
    [r1, l2],
    [r1, c1],
    [r1, c2],
    [r2, l1],
    [r2, l2],
    [r2, c1],
    [r2, c2],
]

for model_pair in MODEL_PAIRS:
    # label the output so the individual test results can be told apart
    print("\n" + " vs. ".join(model_pair))

    contingency_table = build_contingency_table_from_single_topic(
        RELATION_MODELS_PATH, model_pair, TOPIC_OF_INTEREST, NO_ENTITIES
    )

    chi_squared(contingency_table)

In [None]:
# contingency table for a single reference entity across all models in RELATION_MODELS
# (the first model in the list acts as the reference model)
contingency_table = build_contingency_table_from_single_topic(
    relation_models_path=RELATION_MODELS_PATH,
    relation_models=RELATION_MODELS,
    topic_of_interest=TOPIC_OF_INTEREST,
    no_entities=NO_ENTITIES,
)

chi_squared(contingency_table)

### Test coherence of co-occurrences within the same slant group

In [None]:
# liberal: the two left-leaning models (l1 = NYT, l2 = WP) of the Fourth Try setup.
# They are named explicitly instead of slicing RELATION_MODELS, because that global is
# reassigned by other cells and its positions are not guaranteed.

contingency_table = build_contingency_table_from_single_topic(
    RELATION_MODELS_PATH, [l1, l2], TOPIC_OF_INTEREST, NO_ENTITIES,
)

chi_squared(contingency_table, print_orig=True)

In [None]:
# center: the two center models (c1 = NPR, c2 = RET) of the Fourth Try setup.
# They are named explicitly instead of slicing RELATION_MODELS, because that global is
# reassigned by other cells and its positions are not guaranteed.

contingency_table = build_contingency_table_from_single_topic(
    RELATION_MODELS_PATH, [c1, c2], TOPIC_OF_INTEREST, NO_ENTITIES,
)

chi_squared(contingency_table, print_orig=True)

In [None]:
# conservative: the two right-leaning models (r1 = FN, r2 = NM) of the Fourth Try setup.
# They are named explicitly instead of slicing RELATION_MODELS, because that global is
# reassigned by other cells and its positions are not guaranteed.

contingency_table = build_contingency_table_from_single_topic(
    RELATION_MODELS_PATH, [r1, r2], TOPIC_OF_INTEREST, NO_ENTITIES,
)

chi_squared(contingency_table, print_orig=True)

In [None]:
# liberal: the two left-leaning models (l1 = NYT, l2 = WP) of the Fourth Try setup.
# They are named explicitly instead of slicing RELATION_MODELS, because that global is
# reassigned by other cells and its positions are not guaranteed.

contingency_table = compare_entity_lists(
    RELATION_MODELS_PATH,
    [l1, l2],
    TOPIC_OF_INTEREST_LIST,
    TOPIC_OF_INTEREST_LIST2,
)

chi_squared(
    contingency_table, print_orig=True, print_expect=False, print_chi_contr=False
)

In [None]:
# center: the two center models (c1 = NPR, c2 = RET) of the Fourth Try setup.
# They are named explicitly instead of slicing RELATION_MODELS, because that global is
# reassigned by other cells and its positions are not guaranteed.
contingency_table = compare_entity_lists(
    RELATION_MODELS_PATH,
    [c1, c2],
    TOPIC_OF_INTEREST_LIST,
    TOPIC_OF_INTEREST_LIST2,
)

chi_squared(
    contingency_table, print_orig=True, print_expect=False, print_chi_contr=False
)

In [None]:
# conservative: the two right-leaning models (r1 = FN, r2 = NM) of the Fourth Try setup.
# They are named explicitly instead of slicing RELATION_MODELS, because that global is
# reassigned by other cells and its positions are not guaranteed.
contingency_table = compare_entity_lists(
    RELATION_MODELS_PATH,
    [r1, r2],
    TOPIC_OF_INTEREST_LIST,
    TOPIC_OF_INTEREST_LIST2,
)

chi_squared(
    contingency_table, print_orig=True, print_expect=False, print_chi_contr=False
)

In [None]:
# contingency table for a list of reference entities across all models in RELATION_MODELS
contingency_table = build_contingency_table_from_topic_list(
    relation_models_path=RELATION_MODELS_PATH,
    relation_models=RELATION_MODELS,
    topic_of_interest_list=TOPIC_OF_INTEREST_LIST,
    no_entities=NO_ENTITIES,
)

chi_squared(contingency_table)

In [None]:
# contingency table between two entity lists across all models in RELATION_MODELS

contingency_table = compare_entity_lists(
    relation_models_path=RELATION_MODELS_PATH,
    relation_models=RELATION_MODELS,
    topic_list1=TOPIC_OF_INTEREST_LIST,
    topic_list2=TOPIC_OF_INTEREST_LIST2,
)

chi_squared(contingency_table)

In [None]:
    # initialize DistanceMeasure with reference-model
# l1 is the New York Times model of the Fourth Try setup. It is named explicitly because
# RELATION_MODELS is reassigned by other cells and its positions are not guaranteed.
dm = DistanceMeasure(RELATION_MODELS_PATH, str(l1))

# extract top NO_ENTITIES entities
topNYT = dm.get_top_co_occurrences(
    TOPIC_OF_INTEREST, cutoff=NO_ENTITIES, e1_is_synset=0, e2_is_synset=0
)

# r1 is the FoxNews model of the Fourth Try setup; RELATION_MODELS[1] is the
# Washington Post model there, which did not match the name topFN.
dm = DistanceMeasure(RELATION_MODELS_PATH, str(r1))

# extract top NO_ENTITIES entities
topFN = dm.get_top_co_occurrences(
    TOPIC_OF_INTEREST, cutoff=NO_ENTITIES, e1_is_synset=0, e2_is_synset=0
)


print(topNYT)
print(topFN)
