How many articles mention contact tracing (vs. entire coronavirus corpus) per country?

In [79]:
import pandas as pd
import pickle
from dateutil import parser

In [4]:
# Load the five article corpora from disk, one DataFrame per corpus.
# NOTE: unpickling can execute arbitrary code; only read pickle files you trust.
us, kor = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("us/data/us_proc.pkl", "korea/data/korean_proc.pkl")
)
taiwan, hk_eng, hk_ch = (
    pd.read_csv(csv_path)
    for csv_path in (
        "taiwan/data/text_proc.csv",
        "hong kong/data/hk_eng_proc.csv",
        "hong kong/data/hk_ch_proc.csv",
    )
)

In [87]:
# Drop rows of `kor` whose date is missing. The explicit .copy() makes `kor` an
# independent frame, so the `.loc[:, col] = ...` assignments made on it in later
# cells write to the frame itself rather than to a filtered slice (which is what
# triggers pandas' SettingWithCopyWarning).
kor = kor[~kor.date.isna()].copy()

In [99]:
def get_week_of_year(date):
    """Return the ISO-8601 week number of ``date``.

    Parameters
    ----------
    date : str, date-like, None or float NaN
        A date string (parsed with ``dateutil.parser.parse``), any object that
        provides ``isocalendar()`` (e.g. ``datetime.date``,
        ``datetime.datetime``), or a missing value.

    Returns
    -------
    int or None
        The ISO week number, or ``None`` when ``date`` is missing
        (``None`` or a float NaN).

    Raises
    ------
    TypeError
        If ``date`` is not a string, not a missing value, and has no
        ``isocalendar()`` method.

    Notes
    -----
    Only the week component is returned; the ISO year is discarded, so dates
    from different years can map to the same week number.
    """
    # NaN is the only float that is not equal to itself; treat it (and None)
    # as "no date" instead of crashing on the missing isocalendar() method.
    if date is None or (isinstance(date, float) and date != date):
        return None
    if isinstance(date, str):
        date = parser.parse(date)
    if not hasattr(date, "isocalendar"):
        raise TypeError(
            f"get_week_of_year expected a date string or date-like object, "
            f"got {type(date).__name__}: {date!r}"
        )
    # isocalendar() yields (ISO year, ISO week number, ISO weekday).
    return date.isocalendar()[1]

In [None]:
# Add a `week_of_year` column to every corpus.
# `us` and `kor` hand their stored date values straight to get_week_of_year.
for frame in (us, kor):
    frame.loc[:, 'week_of_year'] = frame.date.apply(get_week_of_year)

# The corpora read from CSV have their date values parsed with dateutil first.
for frame in (taiwan, hk_eng, hk_ch):
    frame.loc[:, 'week_of_year'] = frame.date.apply(
        lambda raw_date: get_week_of_year(parser.parse(raw_date))
    )

In [123]:
# Flag articles whose processed text matches any contact-tracing phrase.
# Each pattern is a regex alternation ("a|b|c") evaluated by Series.str.contains.
# The same English pattern is used for both English-language corpora.
english_ct_pattern = "contact tracing|contact-tracing|contact tracking|contact-tracking"

# na=False: rows with missing text_proc are flagged False instead of NaN, so the
# resulting column holds only True/False values.
us.loc[:, "contact_tracing"] = us.text_proc.str.contains(english_ct_pattern, na=False)
kor.loc[:, "contact_tracing"] = kor.text_proc.str.contains("역학 조사|확진자 추적|컨택트 트레이싱|추적 조사|접촉자 추적|동선 추적", na=False)
taiwan.loc[:, "contact_tracing"] = taiwan.text_proc.str.contains("接觸 追蹤|持續 追蹤|接觸者 追蹤|追蹤 接觸者", na=False)
hk_ch.loc[:, "contact_tracing"] = hk_ch.text_proc.str.contains("接觸 追蹤|接觸者 追蹤|追蹤 接觸者|流行病學 調查", na=False)
hk_eng.loc[:, "contact_tracing"] = hk_eng.text_proc.str.contains(english_ct_pattern, na=False)


In [130]:
# Number of keyword-matched articles per corpus. Comparing against True (rather
# than using the column as a mask) counts only rows whose flag is exactly True.
us_ct = int((us.contact_tracing == True).sum())
kor_ct = int((kor.contact_tracing == True).sum())
taiwan_ct = int((taiwan.contact_tracing == True).sum())
hk_ch_ct = int((hk_ch.contact_tracing == True).sum())
hk_eng_ct = int((hk_eng.contact_tracing == True).sum())

In [150]:
# Load each corpus' `discrete_dtm.csv` (presumably a per-article topic
# indicator matrix with one `TopicN` column per topic -- TODO confirm).
country = "us"
us_discrete_dtm = pd.read_csv(f"{country}/proc/discrete_dtm.csv")

country = "korea"
kor_discrete_dtm = pd.read_csv(f"{country}/proc/discrete_dtm.csv")

# Hong Kong keeps separate English ("eng") and Chinese ("ch") files under the
# same directory. No trailing slash on the directory name, so the f-strings do
# not produce a doubled "//" in the path.
country = "hong kong"
hk_eng_discrete_dtm = pd.read_csv(f"{country}/proc/eng/discrete_dtm.csv")
hk_ch_discrete_dtm = pd.read_csv(f"{country}/proc/ch/discrete_dtm.csv")

# Taiwan's matrix is not loaded here, so no latent-topic count is computed for
# Taiwan in this notebook (the summary table uses 0 for it).
# country = "taiwan"
# taiwan_discrete_dtm = pd.read_csv(f"{country}/proc/discrete_dtm.csv")

In [156]:
# Sum one hand-picked topic column per corpus to get its latent-topic article
# count. The topic column names are specific to each corpus' topic model
# (presumably the contact-tracing topic of each model -- TODO confirm).
(
    us_latent_articles,
    kor_latent_articles,
    hk_ch_latent_articles,
    hk_eng_latent_articles,
) = (
    sum(dtm[topic_column])
    for dtm, topic_column in (
        (us_discrete_dtm, "Topic75"),
        (kor_discrete_dtm, "Topic214"),
        (hk_ch_discrete_dtm, "Topic91"),
        (hk_eng_discrete_dtm, "Topic39"),
    )
)


In [201]:
# One row per corpus: [keyword-matched articles, latent-topic articles,
# number of topics, total articles]. The topic counts (500 / 100) are
# hard-coded here. Taiwan's latent-topic count of 0 is a placeholder: no
# Taiwan topic matrix is loaded in this notebook, so it was never computed.
ct_articles = [
    [keyword_hits, latent_hits, n_topics, len(corpus)]
    for keyword_hits, latent_hits, n_topics, corpus in (
        (us_ct, us_latent_articles, 500, us),
        (kor_ct, kor_latent_articles, 500, kor),
        (taiwan_ct, 0, 500, taiwan),
        (hk_ch_ct, hk_ch_latent_articles, 500, hk_ch),
        (hk_eng_ct, hk_eng_latent_articles, 100, hk_eng),
    )
]

In [202]:
# Summary table: one row per corpus (same order as the rows of ct_articles),
# one column per raw count.
df = pd.DataFrame(
    ct_articles,
    columns=[
        'contact tracing keyword articles',
        'latent topic articles',
        'num topics',
        'total articles',
    ],
    index=[
        'us',
        'korea',
        'taiwan',
        'hong kong chinese',
        'hong kong english',
    ],
)

In [203]:
# Append the three proportion columns, each expressed as a percentage
# (numerator column / denominator column * 100).
for ratio_label, numerator, denominator in (
    ('P1: keyword articles/total articles',
     'contact tracing keyword articles', 'total articles'),
    ('P2: latent topic articles/total articles',
     'latent topic articles', 'total articles'),
    ('P3: latent topic articles/keyword articles',
     'latent topic articles', 'contact tracing keyword articles'),
):
    df[ratio_label] = df[numerator] / df[denominator] * 100


Proportion 1: Count of articles found by searching keyword / total articles

Proportion 2: Count of articles that scored significantly on the contact tracing latent topic / total articles

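Proportion 3: Count of articles that scored significantly on the contact tracing latent topic / count of articles found by searching for contact tracing keywords (equivalently, Proportion 2 / Proportion 1). Proportion 3 tells us how sensitive the latent topic is relative to a conventional keyword search. Because it compares two counts rather than measuring their overlap, it can exceed 100% (as it does for Hong Kong Chinese), which means the latent topic flagged more articles than the keyword search did.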

In [204]:
# Display the summary table: one row per corpus, raw counts followed by the
# P1-P3 proportion columns (percentages).
df

Unnamed: 0,contact tracing keyword articles,latent topic articles,num topics,total articles,P1: keyword articles/total articles,P2: latent topic articles/total articles,P3: latent topic articles/keyword articles
us,1836,1438,500,51541,3.562213,2.790012,78.32244
korea,2732,1250,500,51194,5.336563,2.441692,45.754026
taiwan,344,0,500,84900,0.405183,0.0,0.0
hong kong chinese,135,230,500,22525,0.599334,1.021088,170.37037
hong kong english,385,136,100,4822,7.984239,2.820406,35.324675
