In [None]:
# for pretty code
%load_ext nb_black
%matplotlib inline

# Imports and functions

In [None]:
# imports
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook, tnrange, tqdm
from matplotlib.pyplot import figure
from newsrelations.query_db.relation_query import DBQueryHandlerCoocc
from newsrelations.helper_classes.synonym_handler import SynonymHandler
from newsrelations.metrics.distances import DistanceMeasure

In [None]:
def build_contingency_table_from_single_topic(
    relation_models_path, relation_models, topic_of_interest, no_entities=10
):
    """
    Build a contingency table of co-occurrence counts for one topic [topic_of_interest].

    The first model in [relation_models] is the reference model: its top [no_entities]
    entities co-occurring with the topic define the entities of the table. Every
    remaining model is then queried for the number of relations it holds between the
    topic and each of these entities.

    input:  relation_models_path = str
            relation_models = list (models generated with relation_miner.py)
            topic_of_interest = str
            no_entities = int (standard 10)

    output: contingency_table = pandas DataFrame [rows = different models, columns = entities]
    """
    # flags forwarded unchanged to every query (e1 -> 0, e2 -> 1)
    topic_synset_flag = 0
    entity_synset_flag = 1

    # the reference model decides which entities end up in the table
    reference = DistanceMeasure(relation_models_path, str(relation_models[0]))
    top_pairs = reference.get_top_co_occurrences(
        topic_of_interest,
        cutoff=no_entities,
        e1_is_synset=topic_synset_flag,
        e2_is_synset=entity_synset_flag,
    )

    # reference counts become the first column, one row per entity
    reference_counts = np.array([pair[1] for pair in top_pairs])
    entities = [pair[0] for pair in top_pairs]
    contingency_table = pd.DataFrame(
        reference_counts, index=entities, columns=[str(relation_models[0])]
    )

    # every other model contributes one more column of relation counts
    for model in relation_models[1:]:
        handler = DBQueryHandlerCoocc(relation_models_path, model)
        contingency_table[str(model)] = [
            len(
                list(
                    handler.select_relations(
                        e1=topic_of_interest.lower(),
                        e2=entity.lower(),
                        e1_is_synset=topic_synset_flag,
                        e2_is_synset=entity_synset_flag,
                    )
                )
            )
            for entity in contingency_table.index
        ]

    # models as rows, entities as columns
    return contingency_table.transpose()

In [None]:
def explain_models(relation_models):
    """
    Print one line per model of the relation_models list, prefixed with the
    model's position in the list (starting at 0).

    Input: relation_models = list

    Output: None
    """
    for position, model in enumerate(relation_models):
        print("model({!s}): {!s}".format(position, model))
    return

In [None]:
def build_contingency_table_from_topic_list(
    relation_models_path, relation_models, topic_of_interest_list, no_entities=10
):
    """
    Build a contingency table of co-occurrence counts for several topics and models.

    The first model in [relation_models] is the reference model: its top [no_entities]
    entities co-occurring with the first topic of [topic_of_interest_list] define the
    entities of the table. Afterwards every model (the reference model included) is
    queried for every topic, which yields one row labelled "<topic> (<model idx>)"
    per topic/model combination. The model indices are printed up front.

    input:  relation_models_path = str
            relation_models = list (models generated with relation_miner.py)
            topic_of_interest_list = list
            no_entities = int (standard 10)

    output: contingency_table = pandas DataFrame
            [rows = "<topic> (<model idx>)", columns = entities]
    """
    # print models with idx so the row labels can be resolved
    explain_models(relation_models)

    # the reference model decides which entities end up in the table
    reference = DistanceMeasure(relation_models_path, str(relation_models[0]))
    top_pairs = reference.get_top_co_occurrences(
        topic_of_interest_list[0], cutoff=no_entities, e1_is_synset=0, e2_is_synset=0
    )

    # seed column; the loop below writes the same label again for model 0
    reference_counts = np.array([pair[1] for pair in top_pairs])
    entities = [pair[0] for pair in top_pairs]
    contingency_table = pd.DataFrame(
        reference_counts,
        index=entities,
        columns=["{!s} ({!s})".format(topic_of_interest_list[0], 0)],
    )

    # one column per (topic, model) combination
    for model_idx, model in enumerate(relation_models):
        handler = DBQueryHandlerCoocc(relation_models_path, model)

        for topic in topic_of_interest_list:
            contingency_table["{!s} ({!s})".format(topic, model_idx)] = [
                len(
                    list(
                        handler.select_relations(
                            e1=topic.lower(),
                            e2=entity.lower(),
                            e1_is_synset=0,
                            e2_is_synset=0,
                        )
                    )
                )
                for entity in contingency_table.index
            ]

    # topic/model combinations as rows, entities as columns
    return contingency_table.transpose()

In [None]:
def compare_entity_lists(
    relation_models_path, relation_models, topic_list1, topic_list2
):
    """
    Build a contingency table that relates every entity of [topic_list1] to every
    entity of [topic_list2], separately for each model in [relation_models].
    The model indices used in the row labels are printed up front.

    input:  relation_models_path = str
            relation_models = list
            topic_list1 = list
            topic_list2 = list

    output: contingency_table = pandas DataFrame
            [rows = "<topic1> (<model idx>)", columns = entities of topic_list2]
    """
    # print models with idx so the row labels can be resolved
    explain_models(relation_models)

    # entities of the second list form the index until the final transpose
    contingency_table = pd.DataFrame(index=topic_list2)

    for model_idx, model in enumerate(relation_models):
        handler = DBQueryHandlerCoocc(relation_models_path, model)

        # one column of relation counts per (topic1, model) combination
        for topic1 in topic_list1:
            contingency_table["{!s} ({!s})".format(topic1, model_idx)] = [
                len(
                    list(
                        handler.select_relations(
                            e1=topic1.lower(),
                            e2=topic2.lower(),
                            e1_is_synset=0,
                            e2_is_synset=0,
                        )
                    )
                )
                for topic2 in topic_list2
            ]

    # topic1/model combinations as rows, topic_list2 entities as columns
    return contingency_table.transpose()

In [None]:
def chi_squared(contingency_table, print_orig = False, print_expect = False, print_chi_contr = False):
    """
    This function conducts a chi-squared test of independence between the different rows of a contingency table
    
    input: contingency_table
    
    ouput: None
    """
    contingency_table = sm.stats.Table(contingency_table)
    results = contingency_table.test_nominal_association()
    
    
    # orig contingency table
    if print_orig == True:
        print("Original contingency table:")
        print(contingency_table.table_orig)
    # expected values
    if print_expect == True:
        print("\nExpected values:")
        print(contingency_table.fittedvalues)
    # chi-squared contributions
    if print_chi_contr == True:
        print("\nChi-square contributions:")
        print(contingency_table.chi2_contribs)
    
    # results
    print("\nResults:")
    print(results)

   
    return


In [None]:
def do_chi_squared_comparison(
    relation_models_path, relation_models, topic_of_interest, no_entities
):
    """
    This function extracts chi-squared test results for all combinations of news-outlets
    
    input: contingency_table
    
    ouput: None
    """

    # extracting a contingency table from a single reference entity
    i = 0
    j = 0

    df_results = pd.DataFrame()

    for i in range(len(relation_models)):
        for j in range(len(relation_models)):
            models = []
            models = [relation_models[i]] + [relation_models[j]]

            contingency_table = build_contingency_table_from_single_topic(
                relation_models_path, models, topic_of_interest, no_entities
            )

            contingency_table = sm.stats.Table(contingency_table)
            results = contingency_table.test_nominal_association()

            df_results[
                str(relation_models[i][-10:-7])
                + " - "
                + str(relation_models[j][-10:-7])
            ] = [
                results.statistic,
                results.pvalue,
            ]

    df_results = df_results.transpose()
    df_results = df_results.rename(columns={0: "chi_sq", 1: "p_value"})
    df_results

    return df_results

# Variables

Variable setups for the different model runs (all articles were scraped from Common Crawl, commoncrawl.org)

In [None]:
# First try
# General tryout of the NewsRelations library.
# NOTE: the later setup cells assign the same two names again, so only the
# most recently executed setup cell takes effect.

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/firstTry"

# model name
RELATION_MODELS = [
    "model.sqlite",
]

In [None]:
# Second try
# Models from news sources with different biases.
#
# -year:    2008 (for foxnews 2008-2010)
# -domain:  politics
# -sources: NYT & foxnews
# NOTE(review): the list below also contains a 2009 NYT model - confirm the
# year range stated above.

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/secondTry"

# model names
RELATION_MODELS = [
    "RMadvanced_2008_politics_nytimes.sqlite",
    "RM_2009_politics_nytimes.sqlite",
    "RM_2008-2010_politics_foxnews.sqlite",
]



In [None]:
# Third try
# Models from news sources with different biases.
#
# -year:    2011
# -domain:  news
# -sources: Reuters & National Public Radio

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/thirdTry"

# model names
RELATION_MODELS = [
    "RM_2011_news_reuters.sqlite",
    "RM_2011_news_npr.sqlite",
]



In [None]:
# Fourth try
# Two models per slant direction, taken from news sources with different biases,
# for comparing slant coherence within and between the directions.
# Time slots with gapless news reporting were chosen.
#
# -year:    2011-01-01 - 2011-03-31
# -domain:  news
# -sources(left):    New York Times (NYT) 637, Washington Post (WP) 508
#         (center):  National Public Radio (NPR) 109, Reuters (RET) 300
#         (right):   FoxNews (FN) 2735, Newsmax (NM) 180
# (the number after each outlet is presumably its article count - TODO confirm)

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/fourthTry"

# model names, ordered left, left, center, center, right, right
RELATION_MODELS = [
    "RM_2011_news_NYT.sqlite",
    "RM_2011_news_WP.sqlite",
    "RM_2011_news_NPR.sqlite",
    "RM_2011_news_RET.sqlite",
    "RM_2011_news_FN.sqlite",
    "RM_2011_news_NM.sqlite",
]


# Final datasets

Final datasets used in the paper

In [None]:
# Dataset for parameter estimation
# Used for the estimation of the parameters "number of entities" and
# "topic_of_interest".
#
# -year:    2012
# -domain:  news
# -sources(left):    Huffington Post (HFP) 4909, New York Times (NYT) 2541,
#         (center):  CNN (CNN) 2491, Reuters (RET) 2135
#         (right):   FoxNews (FXN) 3784, (WSJ) 1215
# (the number after each outlet is presumably its article count - TODO confirm)

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/calibrationDataset"

# model names, ordered left, left, center, center, right, right
RELATION_MODELS = [
    "RM_2012_news_HFP.sqlite",
    "RM_2012_news_NYT.sqlite",
    "RM_2012_news_CNN.sqlite",
    "RM_2012_news_RET.sqlite",
    "RM_2012_news_FXN.sqlite",
    "RM_2012_news_WSJ.sqlite",
]

In [None]:
# Final dataset
# Final dataset based on the works of Budak (2016), Flaxman (2016) and Groseclose (2015).
#
# -year:    2011
# -domain:  news
# -sources(left):    Huffington Post (HFP) 14876, LA Times (LAT) 445, New York Times (NYT) 11281,
#                    Washington Post (WP) 14814, Daily KOS (DKO) 123
#         (center):  BBC (BBC) 52, CNN (CNN) 2652, Reuters (RET) 16767, Yahoo News (YHN) 211
#         (right):   Chicago Tribune (CTB) 2843, FoxNews (FXN) 6508, NBC (NBC) 3958, USA Today (UST) 171
#                    Wall Street Journal (WSJ) 2522, Breitbart (BBT) 76
# NOTE(review): the Washington Post is abbreviated "WP" above, but its model
# file below uses the code "WPO".

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/finalDataset"

# model names, in alphabetical order of the outlet code (not grouped by slant)
RELATION_MODELS = [
    "RM_2011_news_BBC.sqlite",
    "RM_2011_news_BBT.sqlite",
    "RM_2011_news_CNN.sqlite",
    "RM_2011_news_CTB.sqlite",
    "RM_2011_news_DKO.sqlite",
    "RM_2011_news_FXN.sqlite",
    "RM_2011_news_HFP.sqlite",
    "RM_2011_news_LAT.sqlite",
    "RM_2011_news_NBC.sqlite",
    "RM_2011_news_NYT.sqlite",
    "RM_2011_news_RET.sqlite",
    "RM_2011_news_UST.sqlite",
    "RM_2011_news_WPO.sqlite",
    "RM_2011_news_WSJ.sqlite",
    "RM_2011_news_YHN.sqlite",
]


General global variables

In [None]:
# path to results folder (no trailing slash here)
RESULTS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/results"

# number of entities you want to compare
NO_ENTITIES = 8
# number of entities in test-loops
N = 41

# reference entity
TOPIC_OF_INTEREST = "united_states"

# list of reference entities
TOPIC_OF_INTEREST_LIST2 = [
    "united_states",
    "germany",
    "russia",
    "india",
    "china",
]

# Parameter estimation

## Estimation of "n" - number of entities 

To estimate the optimal n, I examine how stable the p-value of the chi-square test is for different values of n, both within the same slant group and between different slant groups.


In [None]:
# hyper-parameters for the estimation of n
# NOTE: this cell overrides the general global variables of the same name.

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/calibrationDataset"

# model names, ordered left, left, center, center, right, right
RELATION_MODELS = [
    "RM_2012_news_HFP.sqlite",
    "RM_2012_news_NYT.sqlite",
    "RM_2012_news_CNN.sqlite",
    "RM_2012_news_RET.sqlite",
    "RM_2012_news_FXN.sqlite",
    "RM_2012_news_WSJ.sqlite",
]

# path to results folder (with trailing slash)
RESULTS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/results/"

# max number of entities + 1 (exclusive upper bound of the range over n)
NO_ENTITIES = 41

# reference entity
TOPIC_OF_INTEREST = "united_states"

In [None]:
# estimate optimal n - within same slant groups
# Every pair is listed in both orders because the first model of a pair is the
# reference model that determines the top-n entities.
models = [
    [RELATION_MODELS[0], RELATION_MODELS[1]],
    [RELATION_MODELS[1], RELATION_MODELS[0]],
    [RELATION_MODELS[2], RELATION_MODELS[3]],
    [RELATION_MODELS[3], RELATION_MODELS[2]],
    [RELATION_MODELS[4], RELATION_MODELS[5]],
    [RELATION_MODELS[5], RELATION_MODELS[4]],
]

# per-constellation result frames and legend labels, combined once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0)
result_frames = []
pair_labels = []

# loop through model constellation in models list; i is the 1-based column suffix
for i, constellation in enumerate(tqdm(models), start=1):
    # n -> [chi_sq, p_value]
    results_by_n = {}

    # loop through entity numbers until max entity is reached
    for n in range(1, NO_ENTITIES):
        # create SQL query and build contingency table for sm.stats
        contingency_table = build_contingency_table_from_single_topic(
            RELATION_MODELS_PATH, constellation, TOPIC_OF_INTEREST, n
        )
        results = sm.stats.Table(contingency_table).test_nominal_association()
        results_by_n[n] = [results.statistic, results.pvalue]

    result_frames.append(
        pd.DataFrame.from_dict(
            results_by_n,
            orient="index",
            columns=["chi_sq" + str(i), "p_value" + str(i)],
        )
    )

    # label for visualization: the three-letter outlet codes of both models
    pair_labels.append(
        str(constellation[0][-10:-7]) + "-" + str(constellation[1][-10:-7])
    )

# one wide frame (rows = n, two columns per constellation) and one label per constellation
df = pd.concat(result_frames, axis=1)
df_labels = pd.DataFrame(pair_labels)

# save results to csv
df.to_csv("/home/jonas/Documents/GitHub/MasterThesis/results/nSameSlant_All.csv")

In [None]:
# Visualize

# create the figure first so that figsize applies to the axes that are drawn on
fig, ax = plt.subplots(figsize=(20, 10))

# one distinct colour per model pair
colors = ["red", "orange", "green", "yellow", "blue", "purple"]
labels = df_labels[0].tolist()

# p_value columns are numbered from 1 in the order of the model pairs
for k, (label, color) in enumerate(zip(labels, colors), start=1):
    df.plot(kind="line", y="p_value" + str(k), color=color, ax=ax, label=label)

ax.set_ylabel("p-value")
ax.set_xlabel("n")
ax.set_title(
    "Behavior of p-values in respective to number of examined entities\n within same slant groups"
)
ax.legend()
plt.show()

In [None]:
# estimate optimal n - between different slant groups
# RELATION_MODELS is ordered left, left, center, center, right, right, so
# index // 2 is the slant group. All ordered pairs of models from different
# groups are examined; the first model of a pair is the reference model.
models = [
    [RELATION_MODELS[ref_idx], RELATION_MODELS[cmp_idx]]
    for ref_idx in range(6)
    for cmp_idx in range(6)
    if ref_idx // 2 != cmp_idx // 2
]

# per-constellation result frames and legend labels, combined once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0)
result_frames = []
pair_labels = []

# loop through model constellation in models list; i is the 1-based column suffix
for i, constellation in enumerate(tqdm(models), start=1):
    # n -> [chi_sq, p_value]
    results_by_n = {}

    # loop through entity numbers until max entity is reached
    for n in range(1, NO_ENTITIES):
        # create SQL query and build contingency table for sm.stats
        contingency_table = build_contingency_table_from_single_topic(
            RELATION_MODELS_PATH, constellation, TOPIC_OF_INTEREST, n
        )
        results = sm.stats.Table(contingency_table).test_nominal_association()
        results_by_n[n] = [results.statistic, results.pvalue]

    result_frames.append(
        pd.DataFrame.from_dict(
            results_by_n,
            orient="index",
            columns=["chi_sq" + str(i), "p_value" + str(i)],
        )
    )

    # label for visualization: the three-letter outlet codes of both models
    pair_labels.append(
        str(constellation[0][-10:-7]) + "-" + str(constellation[1][-10:-7])
    )

# one wide frame (rows = n, two columns per constellation) and one label per constellation
df = pd.concat(result_frames, axis=1)
df_labels = pd.DataFrame(pair_labels)

# save results to csv
df.to_csv("/home/jonas/Documents/GitHub/MasterThesis/results/nDiffSlant_All.csv")
df

In [None]:
# Visualize

# create the figure first so that figsize applies to the axes that are drawn on
fig, ax = plt.subplots(figsize=(20, 10))

# The pairs come in runs of four per reference model: the colour identifies the
# reference model, the line style the position within the run, so that every
# line is distinguishable.
colors = ["red", "orange", "green", "blue", "yellow", "purple"]
line_styles = ["-", "--", ":", "-."]
labels = df_labels[0].tolist()

# p_value columns are numbered from 1 in the order of the model pairs
for k, label in enumerate(labels, start=1):
    run, position = divmod(k - 1, len(line_styles))
    ax.plot(
        df.index,
        df["p_value" + str(k)],
        color=colors[run % len(colors)],
        linestyle=line_styles[position],
        label=label,
    )

ax.set_xlabel("n")
ax.set_ylabel("p-value")
ax.set_title(
    "Behavior of p-values in respective to number of examined entities \nbetween different slant groups"
)
ax.legend(ncol=4)
plt.show()

## Estimation of optimal number of "topic_of_interest"

To estimate the optimal number of different reference entities, I examine the behavior of the p-value within and between different slant groups under varying reference entities. The entities are derived from XXXX (XXXX) and represent controversial and non-controversial topics.

In [None]:
# hyper-parameters for the estimation of the number of reference entities
# NOTE: this cell overrides the general global variables of the same name.

# directory path of relation models
RELATION_MODELS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/models/calibrationDataset"

# model names, ordered left, left, center, center, right, right
RELATION_MODELS = [
    "RM_2012_news_HFP.sqlite",
    "RM_2012_news_NYT.sqlite",
    "RM_2012_news_CNN.sqlite",
    "RM_2012_news_RET.sqlite",
    "RM_2012_news_FXN.sqlite",
    "RM_2012_news_WSJ.sqlite",
]

# path to results folder (with trailing slash)
RESULTS_PATH = "/home/jonas/Documents/GitHub/MasterThesis/results/"

# number of entities to compare with the reference entity
NO_ENTITIES = 41

# reference entities
# NOTE(review): "white_house" is listed twice - confirm whether a different
# entity was intended for one of the two entries.
TOPIC_OF_INTEREST_LIST = [
    "white_house",
    "obama",
    "bush",
    "NRA",
    "white_house",
    "united_states",
]

# Hypothesis testing 

## Hypothesis 1

1.1 Within the same slant group, the co-occurring entities are independent of the
    news outlet.

1.2 Between different slant groups, the co-occurring entities depend on
    the news outlet.

In [None]:
# compare every model with every model (chi-squared statistic and p-value per pair)

df = do_chi_squared_comparison(
    relation_models_path=RELATION_MODELS_PATH,
    relation_models=RELATION_MODELS,
    topic_of_interest=TOPIC_OF_INTEREST,
    no_entities=NO_ENTITIES,
)

# persist the pairwise results, then display them
df.to_csv("/home/jonas/Documents/GitHub/MasterThesis/results/hyp1.csv")
df