In [1]:
# for pretty code
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# imports
import numpy as np
import pandas as pd
import statsmodels.api as sm

from newsrelations.query_db.relation_query import DBQueryHandlerCoocc
from newsrelations.helper_classes.synonym_handler import SynonymHandler
from newsrelations.metrics.distances import DistanceMeasure

<IPython.core.display.Javascript object>

In [26]:
def build_contingency_table_from_single_topic(
    relation_models_path, relation_models, topic_of_interest, no_entities=10
):
    """
    Build a contingency table of co-occurrence counts for one topic across several
    relation models (generated with relation_miner.py).

    The first entry of [relation_models] is the reference model. Its top
    [no_entities] entities co-occurring with [topic_of_interest] define the
    entities of the table, and the counts it reports fill the reference row.
    For every remaining model the number of relations found between the
    lower-cased topic and each lower-cased entity is counted.

    input:  relation_models_path = str
            relation_models = list (reference model first)
            topic_of_interest = str
            no_entities = int (default 10)

    output: contingency_table = pandas DataFrame [rows = models, columns = entities]
    """

    def relation_count(handler, entity):
        # number of relations the handler yields for (topic, entity)
        matches = handler.select_relations(
            e1=topic_of_interest.lower(),
            e2=entity.lower(),
            e1_is_synset=0,
            e2_is_synset=0,
        )
        return len(list(matches))

    # the reference model supplies both the entity ranking and the first column
    reference_name = str(relation_models[0])
    reference = DistanceMeasure(relation_models_path, reference_name)
    ranked = reference.get_top_co_occurrences(
        topic_of_interest, cutoff=no_entities, e1_is_synset=0, e2_is_synset=0
    )
    reference_counts = np.array([pair[1] for pair in ranked])
    entities = [pair[0] for pair in ranked]
    table = pd.DataFrame(reference_counts, index=entities, columns=[reference_name])

    # one additional column per comparison model
    for model in relation_models[1:]:
        handler = DBQueryHandlerCoocc(relation_models_path, model)
        table[str(model)] = [relation_count(handler, entity) for entity in table.index]

    # models become rows, entities become columns
    return table.transpose()

<IPython.core.display.Javascript object>

In [27]:
# Build the single-topic contingency table and show it as the cell output.
# NOTE(review): RELATION_MODELS_PATH, RELATION_MODELS, TOPIC_OF_INTEREST and
# NO_ENTITIES are assigned in the "input variables" cell further down, so that
# cell has to be executed before this one on a fresh kernel.
contingency_table = build_contingency_table_from_single_topic(
    RELATION_MODELS_PATH, RELATION_MODELS, TOPIC_OF_INTEREST, NO_ENTITIES
)

contingency_table

Unnamed: 0,washington,u.s.,bush,iraq,russia
RMadvanced_2008_politics_nytimes.sqlite,524,416,406,312,259
RM_2008-2010_politics_foxnews.sqlite,82,129,53,34,8


<IPython.core.display.Javascript object>

In [6]:
def build_contingency_table_from_topic_list(
    relation_models_path, relation_models, topic_of_interest_list, no_entities=10
):
    """
    Build a contingency table of co-occurrence counts for a list of topics across
    several relation models (generated with relation_miner.py).

    The first entry of [relation_models] is the reference model. Its top
    [no_entities] entities co-occurring with the first topic of
    [topic_of_interest_list] define the entities of the table. Every count in
    the table, including those of the reference model, is the number of
    relations found between the lower-cased topic and the lower-cased entity in
    the respective model.

    Rows are labelled "<topic> (<model index>)", where the model index is the
    position of the model in [relation_models].

    input:  relation_models_path = str
            relation_models = list (reference model first)
            topic_of_interest_list = list (first topic selects the entities)
            no_entities = int (default 10)

    output: contingency_table = pandas DataFrame
            [rows = one per (model, topic), columns = entities]
    """
    # the reference model is only used to pick the entities to compare
    reference = DistanceMeasure(relation_models_path, str(relation_models[0]))
    top = reference.get_top_co_occurrences(
        topic_of_interest_list[0], cutoff=no_entities, e1_is_synset=0, e2_is_synset=0
    )

    # Start from the entity index alone: the counts reported by the ranking are
    # not stored, because the loop below fills the column for (first topic,
    # model 0) itself and would overwrite them anyway.
    contingency_table = pd.DataFrame(index=[entry[0] for entry in top])

    # one column per (model, topic) pair; model_idx identifies the model in the label
    for model_idx, model in enumerate(relation_models):
        db_handler = DBQueryHandlerCoocc(relation_models_path, model)

        for topic in topic_of_interest_list:
            contingency_table[f"{topic!s} ({model_idx})"] = [
                len(
                    list(
                        db_handler.select_relations(
                            e1=topic.lower(),
                            e2=entity.lower(),
                            e1_is_synset=0,
                            e2_is_synset=0,
                        )
                    )
                )
                for entity in contingency_table.index
            ]

    # transpose so that (topic, model) pairs are rows and entities are columns
    return contingency_table.transpose()

<IPython.core.display.Javascript object>

In [8]:
def explain_contingency_table_from_list(relation_models):
    """
    Print every model of [relation_models] together with its position in the
    list, one line per model, in the form "model (<index>): <model>".

    Input: relation_models = list

    Output: None
    """
    for position, model in enumerate(relation_models):
        print(f"model ({position}): {model!s}")

<IPython.core.display.Javascript object>

In [18]:
# Build the multi-topic contingency table, print the model index legend and
# show the table as the cell output.
# NOTE(review): the saved output of this cell is a NameError for
# build_contingency_table_from_topic_list; run the cell defining it first. The
# constants used here are assigned in the "input variables" cell further down.
contingency_table_from_list = build_contingency_table_from_topic_list(
    RELATION_MODELS_PATH, RELATION_MODELS, TOPIC_OF_INTEREST_LIST, NO_ENTITIES
)

explain_contingency_table_from_list(RELATION_MODELS)
contingency_table_from_list

NameError: name 'build_contingency_table_from_topic_list' is not defined

<IPython.core.display.Javascript object>

In [20]:
def compare_entity_lists(
    relation_models_path, relation_models, topic_list1, topic_list2
):
    """
    Build a contingency table that counts, for every relation model, the relations
    found between each entity of [topic_list1] and each entity of [topic_list2].

    The model index legend is printed first. Rows of the result are labelled
    "<topic_list1 entity> (<model index>)"; entities are lower-cased for the
    lookup.

    input:  relation_models_path = str
            relation_models = list
            topic_list1 = list
            topic_list2 = list

    output: contingency_table = pandas DataFrame
            [rows = one per (model, topic_list1 entity), columns = topic_list2 entities]
    """
    # print which index belongs to which model
    explain_contingency_table_from_list(relation_models)

    # columns are added one by one; topic_list2 labels the rows until the transpose
    table = pd.DataFrame(index=topic_list2)

    for model_idx, model in enumerate(relation_models):
        handler = DBQueryHandlerCoocc(relation_models_path, model)

        for source_topic in topic_list1:
            counts = [
                len(
                    list(
                        handler.select_relations(
                            e1=source_topic.lower(),
                            e2=target_topic.lower(),
                            e1_is_synset=0,
                            e2_is_synset=0,
                        )
                    )
                )
                for target_topic in topic_list2
            ]
            table[f"{source_topic!s} ({model_idx})"] = counts

    # (model, topic) pairs become rows, topic_list2 entities become columns
    return table.transpose()

<IPython.core.display.Javascript object>

In [21]:
# Compare the two entity lists across all models and show the resulting table.
# NOTE(review): the saved output below lists three models, while RELATION_MODELS
# in the "input variables" cell contains two, so the output looks stale.
# This also rebinds `contingency_table`, which other cells assign as well.
contingency_table = compare_entity_lists(
    RELATION_MODELS_PATH,
    RELATION_MODELS,
    TOPIC_OF_INTEREST_LIST,
    TOPIC_OF_INTEREST_LIST2,
)
contingency_table


model (0): RMadvanced_2008_politics_nytimes.sqlite
model (1): RM_2008_politics_nytimes.sqlite
model (2): RM_2008-2010_politics_foxnews.sqlite


Unnamed: 0,united_states,united_states.1,united_states.2,united_states.3,united_states.4
united_states (0),617,617,617,617,617
germany (0),142,142,142,142,142
russia (0),259,259,259,259,259
india (0),117,117,117,117,117
china (0),243,243,243,243,243
united_states (1),582,582,582,582,582
germany (1),143,143,143,143,143
russia (1),260,260,260,260,260
india (1),110,110,110,110,110
china (1),237,237,237,237,237


<IPython.core.display.Javascript object>

In [22]:
def chi_squared(contingency_table):
    """
    This function conducts a chi-squared test of independence between the different rows of a contingency table
    
    input: contingency_table
    
    ouput: None
    """
    contingency_table = sm.stats.Table(contingency_table)
    results = contingency_table.test_nominal_association()
    
    
    # orig contingency table
    print("Original contingency table:")
    print(contingency_table.table_orig)
    # expected values
    print("\nExpected values:")
    print(contingency_table.fittedvalues)
    # residual 
    print("\nChi-square contributions:")
    print(contingency_table.chi2_contribs)
    print("\nResults:")
    
    print(results)
    
    
    results = contingency_table.test_ordinal_association()
    #print("\nOrdinal test results:")
    #print(results)
    
    return


<IPython.core.display.Javascript object>

# Tests

In [35]:
# input variables
# NOTE(review): cells further up in this notebook already read these constants,
# so on a fresh kernel this cell has to be executed before them.

# relation models to compare; the first entry is used as the reference model
# TODO: later read the relation models from paths passed on the command line
RELATION_MODELS = [
    "RMadvanced_2008_politics_nytimes.sqlite",
    "RM_2008-2010_politics_foxnews.sqlite",
]

# path to the directory of all relation models
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere
RELATION_MODELS_PATH = "/home/jonas/.local/share/NewsRelations/models/secondTry/"
# number of entities you want to compare
NO_ENTITIES = 5
# reference entity
TOPIC_OF_INTEREST = "united_states"

# reference entity list
TOPIC_OF_INTEREST_LIST = ["united_states", "germany", "russia", "india", "china"]
# co-occurrence entity list (currently five copies of the same entity)
TOPIC_OF_INTEREST_LIST2 = [
    "united_states",
    "united_states",
    "united_states",
    "united_states",
    "united_states",
]

<IPython.core.display.Javascript object>

In [36]:
# testing variables
# two models built from the same newspaper (nytimes), used for the
# "similar newspapers" test below; the first entry is the reference model
test_RM_similar = [
    "RMadvanced_2008_politics_nytimes.sqlite",
    "RM_2008_politics_nytimes.sqlite",
]


<IPython.core.display.Javascript object>

In [37]:
# test for different newspapers - nytimes vs. foxnews
# builds the single-topic table for RELATION_MODELS and prints the chi-squared
# report; rebinds `contingency_table`
contingency_table = build_contingency_table_from_single_topic(
    RELATION_MODELS_PATH, RELATION_MODELS, TOPIC_OF_INTEREST, NO_ENTITIES
)

chi_squared(contingency_table)



Original contingency table:
                                         washington  u.s.  bush  iraq  russia
RMadvanced_2008_politics_nytimes.sqlite         524   416   406   312     259
RM_2008-2010_politics_foxnews.sqlite             82   129    53    34       8

Expected values:
                                         washington        u.s.        bush  \
RMadvanced_2008_politics_nytimes.sqlite  522.582996  469.979757  395.817814   
RM_2008-2010_politics_foxnews.sqlite      83.417004   75.020243   63.182186   

                                              iraq      russia  
RMadvanced_2008_politics_nytimes.sqlite  298.37247  230.246964  
RM_2008-2010_politics_foxnews.sqlite      47.62753   36.753036  

Chi-square contributions:
                                         washington       u.s.      bush  \
RMadvanced_2008_politics_nytimes.sqlite    0.003842   6.199872  0.261931   
RM_2008-2010_politics_foxnews.sqlite       0.024071  38.840372  1.640920   

                               

<IPython.core.display.Javascript object>

In [34]:
# test for similar newspapers - nytimes vs. nytimes advanced
# same procedure as the previous test, but with the two nytimes models from
# test_RM_similar; rebinds `contingency_table`
contingency_table = build_contingency_table_from_single_topic(
    RELATION_MODELS_PATH, test_RM_similar, TOPIC_OF_INTEREST, NO_ENTITIES
)

chi_squared(contingency_table)

Original contingency table:
                                         washington  u.s.  bush  iraq  russia
RMadvanced_2008_politics_nytimes.sqlite         524   416   406   312     259
RM_2008_politics_nytimes.sqlite                 489   411   379   294     260

Expected values:
                                         washington      u.s.     bush  \
RMadvanced_2008_politics_nytimes.sqlite    517.8456  422.7624  401.292   
RM_2008_politics_nytimes.sqlite            495.1544  404.2376  383.708   

                                             iraq    russia  
RMadvanced_2008_politics_nytimes.sqlite  309.7872  265.3128  
RM_2008_politics_nytimes.sqlite          296.2128  253.6872  

Chi-square contributions:
                                         washington      u.s.      bush  \
RMadvanced_2008_politics_nytimes.sqlite    0.073143  0.108170  0.055235   
RM_2008_politics_nytimes.sqlite            0.076495  0.113127  0.057766   

                                             iraq    russi

<IPython.core.display.Javascript object>

In [39]:
# initialize DistanceMeasure with the reference model (RELATION_MODELS[0])
dm = DistanceMeasure(RELATION_MODELS_PATH, str(RELATION_MODELS[0]))

# extract the top 10 co-occurring entities (the cutoff is hard-coded here,
# NO_ENTITIES is not used)
topNYT = dm.get_top_co_occurrences(
    TOPIC_OF_INTEREST, cutoff=10, e1_is_synset=0, e2_is_synset=0
)

# same query against the second model; `dm` is rebound to it
dm = DistanceMeasure(RELATION_MODELS_PATH, str(RELATION_MODELS[1]))

# extract the top 10 co-occurring entities of the second model
topFN = dm.get_top_co_occurrences(
    TOPIC_OF_INTEREST, cutoff=10, e1_is_synset=0, e2_is_synset=0
)


# print both rankings so they can be compared
print(topNYT)
print(topFN)


[('washington', 524), ('u.s.', 416), ('bush', 406), ('iraq', 312), ('russia', 259), ('barack_obama', 246), ('china', 243), ('george_w._bush', 242), ('afghanistan', 219), ('europe', 213)]
[('obama', 136), ('u.s.', 129), ('washington', 82), ('white_house', 75), ('congress', 72), ('america', 59), ('barack_obama', 55), ('bush', 53), ('senate', 46), ('afghanistan', 39)]


<IPython.core.display.Javascript object>