In [2]:
import pandas as pd
import sqlite3
import pandas as pd
cnx = sqlite3.connect('../database/output/papers.db')

papers = pd.read_sql_query("SELECT * FROM paper", cnx)
authors =  pd.read_sql_query("SELECT * FROM author", cnx)
authors_to_papers =  pd.read_sql_query("SELECT * FROM paper_author", cnx)
keywords_to_papers = pd.read_sql_query("SELECT * FROM paper_keyword", cnx)
keywords =  pd.read_sql_query("SELECT * FROM keyword", cnx)

danny = pd.read_csv("keywords_tokenized_5_or_more_danny.csv")
reconciled = pd.read_csv("diffs_reconciled.csv")


Unnamed: 0,word,frequency,war,disease,markets
0,and,826,0.0,0.0,0.0
1,information,783,0.0,0.0,0.0
2,health,779,0.0,1.0,0.0
3,of,744,0.0,0.0,0.0
4,social,638,0.0,0.0,0.0
...,...,...,...,...,...
2999,antimicrobial,5,,,
3000,tenofovir,5,,,
3001,intraocular,5,,,
3002,harms,5,,,


In [3]:
danny = danny[danny.index < 1500]

In [7]:
reconciled = reconciled[[
    "word",
    "war_reconciled",
    "disease_reconciled",
    "markets_reconciled"
]]

In [8]:
reconciled = reconciled.rename(columns={
    "war_reconciled": "war",
    "disease_reconciled": "disease",
    "markets_reconciled": "markets"
})

In [9]:
merged = danny.merge(reconciled, on="word", how="left", suffixes=("", "_updated"))

In [11]:
for col in ["war", "disease", "markets"]:
    merged[col] = merged[f"{col}_updated"].combine_first(merged[col])
    merged = merged.drop(columns=[f"{col}_updated"])

In [14]:
merged.to_csv('merged.csv')

In [17]:
score_cols = ["war", "disease", "markets"]

# Build dict: { word : {war: x, disease: y, markets: z} }
score_lookup = (
    merged
    .set_index("word")[score_cols]
    .to_dict(orient="index")
)

In [41]:
from nltk import word_tokenize

def score_keyword(kw):
    tokens = word_tokenize(str(kw).lower())

    # initialize accumulators
    totals = {col: 0 for col in score_cols}
    totals["token_count"] = len(tokens)

    # accumulate scores
    for tok in tokens:
        if tok in score_lookup:
            for col in score_cols:
                totals[col] += score_lookup[tok][col]

    return totals


In [42]:
scores = keywords["keyword"].apply(score_keyword)
scores_df = scores.apply(pd.Series)

keywords_scored = pd.concat([keywords, scores_df], axis=1)
keywords_scored

Unnamed: 0,id,keyword,war,disease,markets,token_count
0,1,corporate social responsibility,0.0,0.0,0.0,3.0
1,2,covid-19,0.0,1.0,0.0,1.0
2,3,ethical crisis communication,1.0,0.0,1.0,3.0
3,4,fake news,0.0,0.0,0.0,2.0
4,5,infodemic,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...
40457,40458,tubal sterilization,0.0,0.0,0.0,2.0
40458,40459,ejaculation,0.0,0.0,0.0,1.0
40459,40460,foreign body,1.0,1.0,0.0,2.0
40460,40461,uterine tube sterilization,0.0,1.0,0.0,3.0


In [43]:
keywords_to_papers

Unnamed: 0,paper_id,keyword_id
0,10.1515/9783110752427-031,1
1,10.1515/9783110752427-031,2
2,10.1515/9783110752427-031,3
3,10.1515/9783110752427-031,4
4,10.1515/9783110752427-031,5
...,...,...
222115,10.1016/j.fertnstert.2010.05.031,7072
222116,10.1016/j.fertnstert.2010.05.031,2294
222117,10.1016/j.fertnstert.2010.05.031,3993
222118,10.1016/j.fertnstert.2010.05.031,40462


In [49]:
merged = keywords_to_papers.merge(
    keywords_scored,
    left_on="keyword_id",
    right_on="id",
    how="left"
)

paper_scores = (
    merged.groupby("paper_id")[score_cols]
          .sum()
          .reset_index()
)

token_counts = (
    merged.groupby("paper_id")["token_count"]
          .sum()
          .rename("token_count_total")
          .reset_index()
)


In [50]:
paper_scores = paper_scores.merge(token_counts, on="paper_id", how="left")

for col in score_cols:   # ["war", "disease", "markets"]
    paper_scores[col + "_norm"] = (
        paper_scores[col] / paper_scores["token_count_total"]
    )

In [51]:
paper_scores.to_csv("paper_metaphor_scores.csv")

In [52]:
paper_scores

Unnamed: 0,paper_id,war,disease,markets,token_count_total,war_norm,disease_norm,markets_norm
0,,404.0,1108.0,690.0,11696.0,0.034542,0.094733,0.058995
1,10.1001/amajethics.2023.204,0.0,3.0,3.0,21.0,0.000000,0.142857,0.142857
2,10.1001/amajethics.2023.210,1.0,7.0,1.0,16.0,0.062500,0.437500,0.062500
3,10.1001/amajethics.2023.228,1.0,2.0,1.0,10.0,0.100000,0.200000,0.100000
4,10.1001/jama.2018.16865,2.0,14.0,4.0,52.0,0.038462,0.269231,0.076923
...,...,...,...,...,...,...,...,...
12253,10.9745/GHSP-D-21-00031,2.0,7.0,6.0,47.0,0.042553,0.148936,0.127660
12254,10.9745/GHSP-D-22-00447,0.0,8.0,0.0,21.0,0.000000,0.380952,0.000000
12255,10.9745/GHSP-D-23-00146,0.0,8.0,0.0,23.0,0.000000,0.347826,0.000000
12256,10.9781/ijimai.2020.06.003,0.0,0.0,0.0,8.0,0.000000,0.000000,0.000000


In [53]:
norm_cols = ["war_norm", "disease_norm", "markets_norm"]
def get_top_and_bottom(df, col, n=5):
    top = df.sort_values(col, ascending=False).head(n)
    bottom = df.sort_values(col, ascending=True).head(n)
    return top, bottom
    

In [54]:
results = {}

for col in norm_cols:
    top, bottom = get_top_and_bottom(paper_scores, col, n=5)
    results[col] = {
        "top": top,
        "bottom": bottom
    }

In [55]:
output_frames = []

for col in norm_cols:
    top = paper_scores.sort_values(col, ascending=False).head(5).assign(rank_type=f"{col}_top5")
    bottom = paper_scores.sort_values(col, ascending=True).head(5).assign(rank_type=f"{col}_bottom5")
    output_frames.extend([top, bottom])

summary_df = pd.concat(output_frames, ignore_index=True)

In [56]:
summary_df.to_csv("paper_score_extremes.csv", index=False)


In [58]:
def score_text(text):
    tokens = word_tokenize(str(text).lower())

    # accumulator for scores
    totals = {col: 0 for col in score_cols}
    totals["token_count"] = len(tokens)

    for tok in tokens:
        if tok in score_lookup:     # same lookup dict as keyword scoring
            for col in score_cols:
                totals[col] += score_lookup[tok][col]

    return totals


In [59]:
papers["text"] = papers["title"].fillna("") + " " + papers["abstract"].fillna("")


In [88]:
import numpy as np

paper_scores = papers["text"].apply(score_text)
paper_scores_df = paper_scores.apply(pd.Series)

for col in score_cols:
    paper_scores_df[col + "_norm"] = (
        paper_scores_df[col] / paper_scores_df["token_count"]
    )
    paper_scores_df[col + "_sqrt_norm"] = (
        paper_scores_df[col] / np.sqrt(paper_scores_df["token_count"])
    )


In [89]:
papers_scored = pd.concat([papers, paper_scores_df], axis=1)

In [105]:
papers_scored = papers_scored[papers_scored.abstract != '[No abstract available]']

In [107]:
results = {}

cols_sqrt = ['war_sqrt_norm', 'disease_sqrt_norm', 'markets_sqrt_norm']

# use unnormalized to avoid prioritizing short or nonexistent abstracts
# use normalized to avoid prioritizing long abstracts
# use sqrt norm as an attempt at a middle ground btw prioritizing long and short abstracts
for col in norm_cols:
    top, bottom = get_top_and_bottom(papers_scored, col, n=5)
    results[col] = {"top": top, "bottom": bottom}


In [108]:
frames = []

for col in norm_cols:
    top = papers_scored.sort_values(col, ascending=False).head(5)
    top = top.assign(rank_type=f"{col}_top5")
    
    bottom = papers_scored.sort_values(col, ascending=True).head(5)
    bottom = bottom.assign(rank_type=f"{col}_bottom5")
    
    frames.extend([top, bottom])

summary_df = pd.concat(frames, ignore_index=True)

In [109]:
summary_df.to_csv("abstract_title_scores_top_bottom_no_missing_abstracts.csv")

In [110]:
summary_df

Unnamed: 0,doi,title,abstract,year,community,text,war,disease,markets,token_count,war_norm,war_sqrt_norm,disease_norm,disease_sqrt_norm,markets_norm,markets_sqrt_norm,rank_type
0,10.1080/14742837.2013.807730,Law as Movement Strategy: How the Islamophobia...,Islamophobia is a strategic movement led by a ...,2014,7609,Law as Movement Strategy: How the Islamophobia...,18.0,0.0,1.0,131.0,0.137405,1.572667,0.0,0.0,0.007634,0.08737,war_norm_top5
1,10.1002/9781119289142.ch11,"National Strategy for Combating Terrorism, 200...",This chapter details the 2006 National Strateg...,2017,7202,"National Strategy for Combating Terrorism, 200...",20.0,2.0,9.0,168.0,0.119048,1.543033,0.011905,0.154303,0.053571,0.694365,war_norm_top5
2,10.4324/9781003432630,Memetic War: Online Resistance in Ukraine,Memetic War analyses memetic warfare included ...,2023,1547,Memetic War: Online Resistance in Ukraine Meme...,20.0,0.0,4.0,178.0,0.11236,1.499063,0.0,0.0,0.022472,0.299813,war_norm_top5
3,10.55540/0031-1723.3139,On “The Alt-Right Movement and US National Sec...,This commentary responds to Matthew Valasik an...,2022,3760,On “The Alt-Right Movement and US National Sec...,6.0,0.0,2.0,59.0,0.101695,0.781133,0.0,0.0,0.033898,0.260378,war_norm_top5
4,10.1080/02185377.2021.1979062,COVID-19 vaccination campaign trends and chall...,How successful have countries in Asia been at ...,2021,5170,COVID-19 vaccination campaign trends and chall...,15.0,13.0,8.0,155.0,0.096774,1.204829,0.083871,1.044185,0.051613,0.642575,war_norm_top5
5,10.1177/15579883221094716,Assessing Vasectomy-Related Information on You...,This study aims to critically appraise the qua...,2022,3097,Assessing Vasectomy-Related Information on You...,0.0,6.0,1.0,308.0,0.0,0.0,0.019481,0.341882,0.003247,0.05698,war_norm_bottom5
6,10.1542/peds.2017-2241,How parents of children with cancer learn abou...,Objectives: To determine which prognostic info...,2018,7053,How parents of children with cancer learn abou...,0.0,12.0,1.0,328.0,0.0,0.0,0.036585,0.662589,0.003049,0.055216,war_norm_bottom5
7,10.1109/BdKCSE59280.2023.10339741,Contagion Propagation with Rule-Based Reasonin...,"In recent years, and especially during the on-...",2023,1364,Contagion Propagation with Rule-Based Reasonin...,0.0,5.0,2.0,155.0,0.0,0.0,0.032258,0.40161,0.012903,0.160644,war_norm_bottom5
8,10.1109/JBHI.2020.3032479,Automatically Assessing Quality of Online Heal...,Today Information in the world wide web is ove...,2021,6740,Automatically Assessing Quality of Online Heal...,0.0,8.0,1.0,211.0,0.0,0.0,0.037915,0.550743,0.004739,0.068843,war_norm_bottom5
9,10.22201/iibi.24488321xe.2020.84.58115,Information reliability: Criteria to identify ...,The article presents information reliability c...,2020,6291,Information reliability: Criteria to identify ...,0.0,1.0,0.0,198.0,0.0,0.0,0.005051,0.071067,0.0,0.0,war_norm_bottom5
