In [14]:
import pandas as pd
from collections import Counter
from math import log, sqrt

In [10]:
def logodds(corpora_dic, bg_counter):
    """Compute a z-scored log-odds ratio for every word of every corpus.

    For each corpus ``i`` in ``corpora_dic`` and each word that is a key of
    that corpus, the word's count in ``i`` is compared with its summed count
    in all the *other* corpora (``j``). The background counts are added to
    both sides as a prior:

        delta = log(f_i + f_bg) - log(n_i + n_bg - (f_i + f_bg))
              - log(f_j + f_bg) + log(n_j + n_bg - (f_j + f_bg))
        z     = delta / sqrt(1 / (f_i + f_bg) + 1 / (f_j + f_bg))

    where ``f`` is the word's count and ``n`` the total count of the corpus.

    Args:
        corpora_dic: mapping of corpus name -> mapping of word -> count
            (e.g. ``collections.Counter`` objects).
        bg_counter: mapping of word -> count for the background corpus.
            Words missing from it are treated as having count 0.

    Returns:
        A dict mapping each corpus name to a ``{word: z}`` dict. Words for
        which the statistic is undefined (any of the four log arguments is
        not strictly positive, e.g. a word absent from both the other corpora
        and the background) are omitted instead of raising a math error.
    """
    corp_size = {name: sum(counts.values()) for name, counts in corpora_dic.items()}
    nbg = sum(bg_counter.values())
    result = {name: {} for name in corpora_dic}

    for name, counts in corpora_dic.items():
        # Everything that depends only on the corpus is computed once here,
        # not once per word.
        ni = corp_size[name]
        nj = sum(size for other, size in corp_size.items() if other != name)
        others = [co for other, co in corpora_dic.items() if other != name]
        scores = result[name]

        for word in counts:
            fi = counts[word]
            fj = sum(co.get(word, 0) for co in others)
            # .get keeps plain-dict backgrounds working for unseen words.
            fbg = bg_counter.get(word, 0)

            # log() and the variance term need all four quantities > 0;
            # skip the word when the statistic is undefined.
            if min(fi + fbg, fj + fbg, ni + nbg - (fi + fbg), nj + nbg - (fj + fbg)) <= 0:
                continue

            oddsratio = (
                log(fi + fbg)
                - log(ni + nbg - (fi + fbg))
                - log(fj + fbg)
                + log(nj + nbg - (fj + fbg))
            )
            variance = 1.0 / (fi + fbg) + 1.0 / (fj + fbg)
            scores[word] = oddsratio / sqrt(variance)

    return result

In [11]:
# Load the news-citations table that every comparison below is built from.
news_citations_df = pd.read_parquet(
    path="../data/intermediate/citation_analysis/news_citations.parquet"
)

In [39]:
# One Counter per model family: how often each domain appears in that family's rows.
gpt_cor = Counter(news_citations_df.query("model_family == 'gpt'")["domain"])
gemini_cor = Counter(news_citations_df.query("model_family == 'gemini'")["domain"])
perplexity_cor = Counter(
    news_citations_df.query("model_family == 'perplexity'")["domain"]
)

In [62]:
# Pooled domain counts for each pair of model families. Each pair serves as the
# "other" corpus when the remaining family is the target, and as a background.
gpt_gemini_rows = news_citations_df.query(
    "model_family == 'gpt' or model_family == 'gemini'"
)
gpt_gemini_cor = Counter(gpt_gemini_rows["domain"])

gpt_ppx_rows = news_citations_df.query(
    "model_family == 'gpt' or model_family == 'perplexity'"
)
gpt_ppx_cor = Counter(gpt_ppx_rows["domain"])

gemini_ppx_rows = news_citations_df.query(
    "model_family == 'gemini' or model_family == 'perplexity'"
)
gemini_ppx_cor = Counter(gemini_ppx_rows["domain"])

# Domain counts over every row, regardless of model family.
all_cor = Counter(news_citations_df["domain"])

In [42]:
# GPT vs. Gemini head-to-head; the background is the pooled counts of the two.
gpt_v_gemini_logodds = logodds(
    corpora_dic={"gpt": gpt_cor, "gemini": gemini_cor},
    bg_counter=gpt_gemini_cor,
)

# One (domain, logodds) table per side, sorted ascending by score.
gpt_scores = gpt_v_gemini_logodds["gpt"]
gpt_v_gemini_gpt_df = pd.DataFrame(
    gpt_scores.items(), columns=["domain", "logodds"]
).sort_values("logodds")

gemini_scores = gpt_v_gemini_logodds["gemini"]
gpt_v_gemini_gemini_df = pd.DataFrame(
    gemini_scores.items(), columns=["domain", "logodds"]
).sort_values("logodds")

In [66]:
def _one_vs_rest(target_name, target_cor, rest_cor, background):
    """Score one model family against the pooled counts of the others.

    Args:
        target_name: key under which the target corpus is passed to ``logodds``.
        target_cor: domain counts of the target family.
        rest_cor: pooled domain counts of the remaining families ("other").
        background: domain counts used as the background corpus.

    Returns:
        ``(scores, ranked)`` where ``scores`` is the raw ``logodds`` result and
        ``ranked`` is a DataFrame with columns ``domain`` and ``logodds`` for
        the target family, sorted from highest to lowest score.
    """
    scores = logodds({target_name: target_cor, "other": rest_cor}, background)
    ranked = pd.DataFrame(
        scores[target_name].items(), columns=["domain", "logodds"]
    ).sort_values(by="logodds", ascending=False)
    return scores, ranked


# Each family vs. the other two pooled, with all citations as the background.
gpt_v_all_logodds, gpt_v_all_df = _one_vs_rest(
    "gpt", gpt_cor, gemini_ppx_cor, all_cor
)
gemini_v_all_logodds, gemini_v_all_df = _one_vs_rest(
    "gemini", gemini_cor, gpt_ppx_cor, all_cor
)
ppx_v_all_logodds, ppx_v_all_df = _one_vs_rest(
    "perplexity", perplexity_cor, gpt_gemini_cor, all_cor
)

In [69]:
# Ten highest-scoring domains for GPT against Gemini + Perplexity pooled.
gpt_v_all_df.head(n=10)

Unnamed: 0,domain,logodds
6,reuters.com,34.089759
10,apnews.com,22.219095
2,ft.com,19.726681
7,axios.com,18.426373
33,time.com,11.430856
13,as.com,10.625586
21,theatlantic.com,8.638108
26,elpais.com,7.665638
9,theguardian.com,6.030787
91,biomedcentral.com,4.636705


In [70]:
# Ten highest-scoring domains for Gemini against GPT + Perplexity pooled.
gemini_v_all_df.head(n=10)

Unnamed: 0,domain,logodds
17,indiatimes.com,8.382392
9,livemint.com,6.624592
29,aljazeera.com,6.510891
52,thehindu.com,6.168107
103,dailymail.co.uk,5.310493
48,independent.co.uk,3.792984
227,history.com,3.584977
19,ndtv.com,3.58109
21,hindustantimes.com,3.490315
8,forbes.com,3.452091


In [71]:
# Ten highest-scoring domains for Perplexity against GPT + Gemini pooled.
ppx_v_all_df.head(n=10)

Unnamed: 0,domain,logodds
669,bbc.com,9.673403
659,nytimes.com,8.405172
162,yahoo.com,7.698068
18,espn.com,6.469246
678,cnn.com,6.069012
702,cnbc.com,5.833757
31,sohu.com,5.803378
41,163.com,5.765142
86,economictimes.com,5.528315
21,techcrunch.com,5.418761
