In [0]:
# ライブラリのインストール
%pip install pycountry

# 2. CORD-19データセットの分析
### COVID-19 Open Research Dataset Challenge (CORD-19) 作業用ノートブック

このノートブックは、CORD-19データセットの分析を容易に始められるようにするための、 [COVID-19 Open Research Dataset Challenge (CORD-19)](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge) に対する作業用ノートブックです。  

<img src="https://miro.medium.com/max/3648/1*596Ur1UdO-fzQsaiGPrNQg.png" width="900"/>

アトリビューション:
* このノートブックで使用されるデータセットのライセンスは、[downloaded dataset](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge/download)に含まれるメタデータcsvに記載されています。
* 2020-03-13のデータセットには以下が含まれています。
  * `comm_use_subset`: 商用利用のサブセット (PMCコンテンツを含む) -- 9000 論文(内3論文は空), 186Mb
  * `noncomm_use_subset`: 非商用利用のサブセット (PMCコンテンツを含む) -- 1973 論文(内1論文は空), 36Mb
  * `biorxiv_medrxiv`: bioRxiv/medRxiv サブセット (ピアレビューされていない準備稿) -- 803 論文, 13Mb
* DatabricksあるいはDatabricksコミュニティエディションを使用する際には、`/databricks-datasets/COVID/CORD-19`からデータセットのコピーを利用できます。
* このノートブックは[CC BY 3.0](https://creativecommons.org/licenses/by/3.0/us/)のライセンスの下で共有することができます。

> **注意**<br>
> このノートブックを実行する前に「1. JSONデータセットの読み込み」を実行して、ファイルを準備してください。

In [0]:
# ユーザーごとに一意のパスになるようにユーザー名をパスに含めます
import re
from pyspark.sql.types import * 

# Read the current user from the tags of the Databricks notebook context.
notebook_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags()
username_raw = notebook_tags.apply('user')

# Strip every run of characters other than A-Z, a-z and 0-9, then lowercase the
# rest, so the value can be used as part of file paths and database names.
non_alnum_run = re.compile('[^A-Za-z0-9]+')
username = non_alnum_run.sub('', username_raw).lower()

print(username)

## Parquetパス変数の設定

`/tmp/<ユーザー名>/COVID/CORD-19/2020-03-13/`にParquetフォーマットで保存されています。

In [0]:
# Per-user base directory of the Parquet copy of the 2020-03-13 CORD-19 snapshot.
cord19_pq_root = f"/tmp/{username}/COVID/CORD-19/2020-03-13"

# Parquet locations of the three subsets, as Python variables.
# NOTE: biorxiv_medrxiv sits one directory deeper than the other two subsets.
comm_use_subset_pq_path = f"{cord19_pq_root}/comm_use_subset.parquet"
noncomm_use_subset_pq_path = f"{cord19_pq_root}/noncomm_use_subset.parquet"
biorxiv_medrxiv_pq_path = f"{cord19_pq_root}/biorxiv_medrxiv/biorxiv_medrxiv.parquet"
json_schema_path = "/databricks-datasets/COVID/CORD-19/2020-03-13/json_schema.txt"

# Export the same paths as environment variables so that %sh cells can read them.
# The values are already str, so they are assigned directly.
import os
os.environ.update({
    'comm_use_subset_pq_path': comm_use_subset_pq_path,
    'noncomm_use_subset_pq_path': noncomm_use_subset_pq_path,
    'biorxiv_medrxiv_pq_path': biorxiv_medrxiv_pq_path,
    'json_schema_path': json_schema_path,
})

## Parquetファイルの読み込み

「1. JSONデータセットの読み込み」で準備したファイルはParquetフォーマットで保存されているので、これらのファイルを読み込むために`spark.read.format("parquet")`を使用します。

In [0]:
# Load each CORD-19 subset from its Parquet location into a Spark DataFrame.
comm_use_subset = spark.read.parquet(comm_use_subset_pq_path)
noncomm_use_subset = spark.read.parquet(noncomm_use_subset_pq_path)
biorxiv_medrxiv = spark.read.parquet(biorxiv_medrxiv_pq_path)

In [0]:
# Count the records of each subset DataFrame, in the order they were loaded.
comm_use_subset_cnt, noncomm_use_subset_cnt, biorxiv_medrxiv_cnt = [
    subset_df.count()
    for subset_df in (comm_use_subset, noncomm_use_subset, biorxiv_medrxiv)
]

# Report the three counts on a single line.
subset_counts = (comm_use_subset_cnt, noncomm_use_subset_cnt, biorxiv_medrxiv_cnt)
print("comm_use_subset: %s, noncomm_use_subset: %s, biorxiv_medrxiv: %s" % subset_counts)

In [0]:
%sh
# Print the JSON schema file. json_schema_path is read from the environment;
# the expansion is quoted so the path is passed to cat as a single argument.
cat "/dbfs${json_schema_path}"

In [0]:
# Print the schema of the commercial-use subset, and expose the DataFrame to
# SQL as the temporary view "comm_use_subset".
comm_use_subset.printSchema()
comm_use_subset.createOrReplaceTempView("comm_use_subset")

## 著者の抽出

論文の地理的位置を特定するために著者のメタデータを抽出し、一時ビュー`paperAuthorLocation`を作成します。

In [0]:
%sql
-- Preview ten papers: id, title, authors, and the whole metadata column.
SELECT
  paper_id,
  metadata.title,
  metadata.authors,
  metadata
FROM comm_use_subset
LIMIT 10

paper_id,title,authors,metadata
9a9f30acc275c64c016770509f9b90eb091e272a,Seasonal recurrence of cowpox virus outbreaks in captive cheetahs (Acinonyx jubatus),"List(List(List(Ree Park-Safari, , List(null, Denmark, null, null, null, Ebeltoft)), , Julia, Stagegaard, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, Germany)), , Andreas, Kurth, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, Germany)), , Daniel, Stern, List(), ), List(List(Robert Koch Institute, , List(null, null, null, null, null, Berlin, Germany)), , Piotr, Dabrowski, List(Wojciech), ), List(List(Finn Pathologists, , List(null, England, null, null, null, Norfolk)), , Ann, Pocknell, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, Germany)), *nitschea@rki.de, Andreas, Nitsche, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, Germany)), , Livia, Schrick, List(), ))","List(List(List(List(Ree Park-Safari, , List(null, Denmark, null, null, null, Ebeltoft)), , Julia, Stagegaard, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, Germany)), , Andreas, Kurth, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, Germany)), , Daniel, Stern, List(), ), List(List(Robert Koch Institute, , List(null, null, null, null, null, Berlin, Germany)), , Piotr, Dabrowski, List(Wojciech), ), List(List(Finn Pathologists, , List(null, England, null, null, null, Norfolk)), , Ann, Pocknell, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, 
Germany)), *nitschea@rki.de, Andreas, Nitsche, List(), ), List(List(Robert Koch Institute, Centre for Biological Threats and Special Pathogens, List(null, null, null, null, null, Berlin, Germany)), , Livia, Schrick, List(), )), Seasonal recurrence of cowpox virus outbreaks in captive cheetahs (Acinonyx jubatus))"
2b4696bf4bc923a139e8508086f854ca52b690d0,UniProt: a worldwide hub of protein knowledge The UniProt Consortium,"List(List(List(null, null, null), , , D506-D515, List(), ))","List(List(List(List(null, null, null), , , D506-D515, List(), )), UniProt: a worldwide hub of protein knowledge The UniProt Consortium)"
14723bfc42908c3f8f522692be41062a7cda62d4,,"List(List(List(null, null, null), , Akio, Adachi, List(), ), List(List(null, null, null), , Béatrice, Nal, List(), ), List(List(null, null, null), , Frederick, Fuller, List(Joseph), ), List(List(null, null, null), , Lv, , List(X), ), List(List(null, null, null), , Zhao, , List(K), ), List(List(null, null, null), , Lan, , List(Y), ), List(List(null, null, null), , Li, , List(Z), ), List(List(null, null, null), , Ding, , List(N), ))","List(List(List(List(null, null, null), , Akio, Adachi, List(), ), List(List(null, null, null), , Béatrice, Nal, List(), ), List(List(null, null, null), , Frederick, Fuller, List(Joseph), ), List(List(null, null, null), , Lv, , List(X), ), List(List(null, null, null), , Zhao, , List(K), ), List(List(null, null, null), , Lan, , List(Y), ), List(List(null, null, null), , Li, , List(Z), ), List(List(null, null, null), , Ding, , List(N), )), )"
bb6a9f522a87a780723faca7cde002ece6dbfb48,A novel reporter system for neutralizing and enhancing antibody assay against dengue virus,"List(List(List(null, null, null), , Ke-Yu, Song, List(), ), List(List(null, null, null), , Hui, Zhao, List(), ), List(List(null, null, null), , †, , List(), ), List(List(null, null, null), , Zhen-You, Jiang, List(), ), List(List(null, null, null), , Xiao-Feng, Li, List(), ), List(List(null, null, null), , Yong-Qiang, Deng, List(), ), List(List(null, null, null), , Tao, Jiang, List(), ), List(List(null, null, null), , Shun-Ya, Zhu, List(), ), List(List(null, null, null), , Pei-Yong, Shi, List(), ), List(List(null, null, null), , Bo, Zhang, List(), ), List(List(null, null, null), , Fu-Chun, Zhang, List(), ), List(List(null, null, null), , E-De, Qin, List(), ), List(List(null, null, null), , Cheng-Feng, Qin, List(), ))","List(List(List(List(null, null, null), , Ke-Yu, Song, List(), ), List(List(null, null, null), , Hui, Zhao, List(), ), List(List(null, null, null), , †, , List(), ), List(List(null, null, null), , Zhen-You, Jiang, List(), ), List(List(null, null, null), , Xiao-Feng, Li, List(), ), List(List(null, null, null), , Yong-Qiang, Deng, List(), ), List(List(null, null, null), , Tao, Jiang, List(), ), List(List(null, null, null), , Shun-Ya, Zhu, List(), ), List(List(null, null, null), , Pei-Yong, Shi, List(), ), List(List(null, null, null), , Bo, Zhang, List(), ), List(List(null, null, null), , Fu-Chun, Zhang, List(), ), List(List(null, null, null), , E-De, Qin, List(), ), List(List(null, null, null), , Cheng-Feng, Qin, List(), )), A novel reporter system for neutralizing and enhancing antibody assay against dengue virus)"
ad4dc41b48d9f6024088cb6c65a51972d928d200,Predicting and Evaluating the Epidemic Trend of Ebola Virus Disease in the 2014-2015 Outbreak and the Effects of Intervention Measures,"List(List(List(null, null, null), , Zuiyuan, Guo, List(), ), List(List(Fourth Military Medical University, , List(null, null, null, null, null, Xi'an)), , Dan, 2☯, List(Xiao), ), List(List(null, null, null), , Dongli, Li, List(), ), List(List(null, null, null), , Xiuhong, Wang, List(), ), List(List(null, null, null), , Yayu, Wang, List(), ), List(List(null, null, null), , Tiecheng, Yan, List(), ), List(List(Shenyang Jianzhu University, , List(null, China, null, null, null, Shenyang, Liaoning Province)), , Zhiqi, Wang, List(), ), List(List(null, null, null), , Guo, , List(Z), ), List(List(null, null, null), , Xiao, Li, List(D), ), List(List(null, null, null), , D, , List(), ), List(List(null, null, null), , Wang, Wang, List(X), ), List(List(null, null, null), , Y, , List(), ), List(List(null, null, null), , Yan, , List(), ))","List(List(List(List(null, null, null), , Zuiyuan, Guo, List(), ), List(List(Fourth Military Medical University, , List(null, null, null, null, null, Xi'an)), , Dan, 2☯, List(Xiao), ), List(List(null, null, null), , Dongli, Li, List(), ), List(List(null, null, null), , Xiuhong, Wang, List(), ), List(List(null, null, null), , Yayu, Wang, List(), ), List(List(null, null, null), , Tiecheng, Yan, List(), ), List(List(Shenyang Jianzhu University, , List(null, China, null, null, null, Shenyang, Liaoning Province)), , Zhiqi, Wang, List(), ), List(List(null, null, null), , Guo, , List(Z), ), List(List(null, null, null), , Xiao, Li, List(D), ), List(List(null, null, null), , D, , List(), ), List(List(null, null, null), , Wang, Wang, List(X), ), List(List(null, null, null), , Y, , List(), ), List(List(null, null, null), , Yan, , List(), )), Predicting and Evaluating the Epidemic Trend of Ebola Virus Disease in the 2014-2015 Outbreak and the Effects of Intervention Measures)"
eb07f3436bd925c0de35ecbe26930536e8bcf200,"Theoretical Biology and Medical Modelling Time variations in the transmissibility of pandemic influenza in Prussia, Germany, from 1918-19","List(List(List(University of Tübingen, , List(Westbahnhofstr. 55, Germany, null, D-72070, null, Tübingen)), nishiura.hiroshi@uni-tuebingen.de, Hiroshi, Nishiura, List(), ))","List(List(List(List(University of Tübingen, , List(Westbahnhofstr. 55, Germany, null, D-72070, null, Tübingen)), nishiura.hiroshi@uni-tuebingen.de, Hiroshi, Nishiura, List(), )), Theoretical Biology and Medical Modelling Time variations in the transmissibility of pandemic influenza in Prussia, Germany, from 1918-19)"
6a1fd48435dc54d1682a0bafd6cdf7142b6b5bc4,Inferring R 0 in emerging epidemics-the effect of common population structure is small,"List(List(List(Stockholm University, , List(null, Sweden, null, null, null, Stockholm)), ptrapman@math.su.se, Pieter, Trapman, List(), ), List(List(University of Nottingham, , List(null, UK, null, null, null, Nottingham)), , Frank, Ball, List(), ), List(List(Sorbonne Paris Cité, UMR 7539, List(null, France, null, null, null, null)), , Jean-Stéphane, Dhersin, List(), ), List(List(Université des Sciences et Technologies de Lille, Laboratoire Paul Painlevé, List(null, France, null, null, null, Villeneuve-d'Ascq)), , Viet, Tran, List(Chi), ), List(List(, Rijksinstituut voor Volksgezondheid en Milieu (RIVM), List(null, The Netherlands, null, null, null, Bilthoven)), , Jacco, Wallinga, List(), ), List(List(Stockholm University, , List(null, Sweden, null, null, null, Stockholm)), , Tom, Britton, List(), ))","List(List(List(List(Stockholm University, , List(null, Sweden, null, null, null, Stockholm)), ptrapman@math.su.se, Pieter, Trapman, List(), ), List(List(University of Nottingham, , List(null, UK, null, null, null, Nottingham)), , Frank, Ball, List(), ), List(List(Sorbonne Paris Cité, UMR 7539, List(null, France, null, null, null, null)), , Jean-Stéphane, Dhersin, List(), ), List(List(Université des Sciences et Technologies de Lille, Laboratoire Paul Painlevé, List(null, France, null, null, null, Villeneuve-d'Ascq)), , Viet, Tran, List(Chi), ), List(List(, Rijksinstituut voor Volksgezondheid en Milieu (RIVM), List(null, The Netherlands, null, null, null, Bilthoven)), , Jacco, Wallinga, List(), ), List(List(Stockholm University, , List(null, Sweden, null, null, null, Stockholm)), , Tom, Britton, List(), )), Inferring R 0 in emerging epidemics-the effect of common population structure is small)"
e3de6d6d50592102725cbe2ee5cb0fe02b851aac,Inhibitory Effect of Resveratrol against Duck Enteritis Virus In Vitro,"List(List(List(null, null, null), , J, Xu, List(), ), List(List(null, null, null), , Z, Yin, List(), ), List(List(null, null, null), , L, Li, List(), ), List(List(null, null, null), , A, Cheng, List(), ), List(List(null, null, null), , R, Jia, List(), ))","List(List(List(List(null, null, null), , J, Xu, List(), ), List(List(null, null, null), , Z, Yin, List(), ), List(List(null, null, null), , L, Li, List(), ), List(List(null, null, null), , A, Cheng, List(), ), List(List(null, null, null), , R, Jia, List(), )), Inhibitory Effect of Resveratrol against Duck Enteritis Virus In Vitro)"
3bac7494aeafbb2f2cff25690dac67e99c120029,Virology Journal Development of a fluorescent quantitative real-time polymerase chain reaction assay for the detection of Goose parvovirus in vivo,"List(List(List(Chongqing Academy of Animal Science, , List(null, China, null, 402460, null, Chongqing, Chongqing)), , Jin-Long, Yang, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , An-Chun, Cheng, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), ming-shuwang-mshwang@163.com, Ming-Shu, Wang, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Kang-Cheng, Pan, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Min, Li, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Yu-Fei, Guo, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Chuan-Feng, Li, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), de-kangzhu-zdk24@163.com, De-Kang, Zhu, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Xiao-Yue, Chen, List(), ))","List(List(List(List(Chongqing Academy of Animal Science, , List(null, China, null, 402460, null, Chongqing, Chongqing)), , Jin-Long, Yang, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , An-Chun, Cheng, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), ming-shuwang-mshwang@163.com, Ming-Shu, Wang, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Kang-Cheng, Pan, List(), ), List(List(Sichuan 
Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Min, Li, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Yu-Fei, Guo, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Chuan-Feng, Li, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), de-kangzhu-zdk24@163.com, De-Kang, Zhu, List(), ), List(List(Sichuan Agricultural University, , List(null, China, null, 625014, null, Yaan, Sichuan)), , Xiao-Yue, Chen, List(), )), Virology Journal Development of a fluorescent quantitative real-time polymerase chain reaction assay for the detection of Goose parvovirus in vivo)"
21083480591a0f89c6c6a1ee575c79ef9eeaeb75,"C-reactive protein, haptoglobin, serum amyloid A and pig major acute phase protein response in pigs simultaneously infected with H1N1 swine influenza virus and Pasteurella multocida","List(List(List(null, null, null), , Małgorzata, Pomorska-Mól, List(), ), List(List(null, null, null), , Iwona, Markowska-Daniel, List(), ), List(List(null, null, null), , Krzysztof, Kwit, List(), ), List(List(null, null, null), , Katarzyna, Stępniewska, List(), ), List(List(null, null, null), , Zygmunt, Pejsak, List(), ))","List(List(List(List(null, null, null), , Małgorzata, Pomorska-Mól, List(), ), List(List(null, null, null), , Iwona, Markowska-Daniel, List(), ), List(List(null, null, null), , Krzysztof, Kwit, List(), ), List(List(null, null, null), , Katarzyna, Stępniewska, List(), ), List(List(null, null, null), , Zygmunt, Pejsak, List(), )), C-reactive protein, haptoglobin, serum amyloid A and pig major acute phase protein response in pigs simultaneously infected with H1N1 swine influenza virus and Pasteurella multocida)"


In [0]:
# Build one row per (paper, author) carrying the author's affiliation location fields.
# The inner subquery "b" explodes metadata.authors into one row per author; it is
# left-joined back to comm_use_subset on paper_id, so a paper without a matching
# exploded row is still kept (with null location columns).
# The outer select pulls the six location fields out of authors.affiliation.location.
paperAuthorLocation = spark.sql("""
select paper_id, 
       title,  
       authors.affiliation.location.addrLine as addrLine, 
       authors.affiliation.location.country as country, 
       authors.affiliation.location.postBox as postBox,
       authors.affiliation.location.postCode as postCode,
       authors.affiliation.location.region as region,
       authors.affiliation.location.settlement as settlement
  from (
    select a.paper_id, a.metadata.title as title, b.authors
      from comm_use_subset a
        left join (
            select paper_id, explode(metadata.authors) as authors from comm_use_subset 
            ) b
           on b.paper_id = a.paper_id  
  ) x
""")
# Expose the result to SQL cells as the temporary view "paperAuthorLocation".
paperAuthorLocation.createOrReplaceTempView("paperAuthorLocation")

## 著者の国データの問題

`authors.affiliation.location.country`には`USA, USA, USA, USA`といったデータが含まれている問題があります。

In [0]:
%sql
-- Show exploded author rows whose affiliation country contains "USA" repeated four times.
SELECT *
FROM (
  SELECT
    paper_id,
    metadata.title AS title,
    explode(metadata.authors) AS authors
  FROM comm_use_subset
) a
WHERE authors.affiliation.location.country LIKE '%USA, USA, USA, USA%'

paper_id,title,authors
2a6a9de82dc0494f32530e1ee8ee7509367a04fd,Building International Genomics Collaboration for Global Health Security,"List(List(Blood Systems Research Institute, Los Alamos National Laboratory, List(null, USA, USA, USA, USA, null, null, NM, Metabiota, Los Alamos)), , Nathan, Wolfe, List(), )"
2a6a9de82dc0494f32530e1ee8ee7509367a04fd,Building International Genomics Collaboration for Global Health Security,"List(List(Blood Systems Research Institute, Los Alamos National Laboratory, List(null, USA, USA, USA, USA, null, null, NM, Metabiota, Los Alamos)), , Paras, Jain, List(), )"
2a6a9de82dc0494f32530e1ee8ee7509367a04fd,Building International Genomics Collaboration for Global Health Security,"List(List(Blood Systems Research Institute, Los Alamos National Laboratory, List(null, USA, USA, USA, USA, null, null, NM, Metabiota, Los Alamos)), , Eric, Delwart, List(), )"
2a6a9de82dc0494f32530e1ee8ee7509367a04fd,Building International Genomics Collaboration for Global Health Security,"List(List(Blood Systems Research Institute, Los Alamos National Laboratory, List(null, USA, USA, USA, USA, null, null, NM, Metabiota, Los Alamos)), hhcui@lanl.gov, Helen, Cui, List(H), )"
2a6a9de82dc0494f32530e1ee8ee7509367a04fd,Building International Genomics Collaboration for Global Health Security,"List(List(Blood Systems Research Institute, Los Alamos National Laboratory, List(null, USA, USA, USA, USA, null, null, NM, Metabiota, Los Alamos)), , Tracy, Erkkila, List(), )"
2a6a9de82dc0494f32530e1ee8ee7509367a04fd,Building International Genomics Collaboration for Global Health Security,"List(List(Blood Systems Research Institute, Los Alamos National Laboratory, List(null, USA, USA, USA, USA, null, null, NM, Metabiota, Los Alamos)), , Patrick, Chain, List(S G), )"
2a6a9de82dc0494f32530e1ee8ee7509367a04fd,Building International Genomics Collaboration for Global Health Security,"List(List(Blood Systems Research Institute, Los Alamos National Laboratory, List(null, USA, USA, USA, USA, null, null, NM, Metabiota, Los Alamos)), , Momchilo, Vuyisich, List(), )"


### データのクレンジング

著者の国データをきれいにしましょう。

### paperAuthorLocationの確認

一時ビュー`paperAuthorLocation`を確認します。

In [0]:
%sql
-- Inspect up to 200 rows of the paperAuthorLocation temporary view.
SELECT *
FROM paperAuthorLocation
LIMIT 200

paper_id,title,addrLine,country,postBox,postCode,region,settlement
e1cb86642107f15ec6d854bebf9c8341d1416ef4,Strand-Exchange Nucleic Acid Circuitry with Enhanced Thermo-and Structure-Buffering Abilities Turns Gene Diagnostics Ultra-Reliable and Environmental Compatible OPEN,,,,,,
e1cb86642107f15ec6d854bebf9c8341d1416ef4,Strand-Exchange Nucleic Acid Circuitry with Enhanced Thermo-and Structure-Buffering Abilities Turns Gene Diagnostics Ultra-Reliable and Environmental Compatible OPEN,,P.R. China,,130022,,"Changchun, Changchun, Jilin"
e1cb86642107f15ec6d854bebf9c8341d1416ef4,Strand-Exchange Nucleic Acid Circuitry with Enhanced Thermo-and Structure-Buffering Abilities Turns Gene Diagnostics Ultra-Reliable and Environmental Compatible OPEN,,United States,,78712,Texas,Austin
e1cb86642107f15ec6d854bebf9c8341d1416ef4,Strand-Exchange Nucleic Acid Circuitry with Enhanced Thermo-and Structure-Buffering Abilities Turns Gene Diagnostics Ultra-Reliable and Environmental Compatible OPEN,,United States,,78712,Texas,Austin
e1cb86642107f15ec6d854bebf9c8341d1416ef4,Strand-Exchange Nucleic Acid Circuitry with Enhanced Thermo-and Structure-Buffering Abilities Turns Gene Diagnostics Ultra-Reliable and Environmental Compatible OPEN,,United States,,78712,Texas,Austin
e1cb86642107f15ec6d854bebf9c8341d1416ef4,Strand-Exchange Nucleic Acid Circuitry with Enhanced Thermo-and Structure-Buffering Abilities Turns Gene Diagnostics Ultra-Reliable and Environmental Compatible OPEN,,P.R. China,,130022,,"Changchun, Changchun, Jilin"
e1cb86642107f15ec6d854bebf9c8341d1416ef4,Strand-Exchange Nucleic Acid Circuitry with Enhanced Thermo-and Structure-Buffering Abilities Turns Gene Diagnostics Ultra-Reliable and Environmental Compatible OPEN,,P.R. China,,130022,,"Changchun, Changchun, Jilin"
b0dcc756c7f2641a8319b96355ec871ba2922f90,Use of an Innovative Web-Based Laboratory Surveillance Platform to Analyze Mixed Infections Between Human Metapneumovirus (hMPV) and Other Respiratory Viruses Circulating in,3030 Hospital Dr. NW,Canada,,T2N 4W4,Alberta,Calgary
b0dcc756c7f2641a8319b96355ec871ba2922f90,Use of an Innovative Web-Based Laboratory Surveillance Platform to Analyze Mixed Infections Between Human Metapneumovirus (hMPV) and Other Respiratory Viruses Circulating in,,,,,,
b0dcc756c7f2641a8319b96355ec871ba2922f90,Use of an Innovative Web-Based Laboratory Surveillance Platform to Analyze Mixed Infections Between Human Metapneumovirus (hMPV) and Other Respiratory Viruses Circulating in,,,,,,


In [0]:
%sql
-- Total rows in paperAuthorLocation versus the number of distinct papers they cover.
SELECT
  count(1),
  count(distinct paper_id) AS papers
FROM paperAuthorLocation

count(1),papers
67671,8997


### 国データの抽出

一時ビュー`paperAuthorLocation`から国データ(`paperCountries`)を抽出します。

In [0]:
# Collect the distinct raw country strings found in paperAuthorLocation and
# expose them to SQL as the temporary view "paperCountries".
paperCountries = spark.sql("""select distinct country from paperAuthorLocation""")
paperCountries.createOrReplaceTempView("paperCountries")

### pycountryの活用

それぞれの国からalpha_3コードを抽出するために`pycountry`を使用します。

In [0]:
# インポート
import pycountry

# Look up the alpha_3 country code of a free-text country name (uses pycountry).
def get_alpha_3(country):
    """Return the alpha_3 code of the best fuzzy match for ``country``.

    Args:
        country: Free-text country name. May be None or blank when the
            source row carries no country.

    Returns:
        The ``alpha_3`` attribute of the first result of
        ``pycountry.countries.search_fuzzy(country)``, or the string ``'-1'``
        when the input is missing/blank or the lookup fails. The sentinel is
        always a string so that it has the same type as a successful result
        and matches SQL filters such as ``alpha_3 <> '-1'``.
    """
    unknown = '-1'
    # Nothing to search for: skip the lookup entirely.
    if not isinstance(country, str) or not country.strip():
        return unknown
    try:
        return pycountry.countries.search_fuzzy(country)[0].alpha_3
    except Exception:
        # Best effort: any lookup failure maps to the sentinel. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate. Nothing is printed here because this runs once per row.
        return unknown

# Register get_alpha_3 as a UDF (user-defined function) under the SQL name
# "get_alpha_3" so that SQL queries can call it.
# NOTE(review): no return type is passed, so PySpark's default applies
# (documented as string) — confirm if a different type is ever needed.
spark.udf.register("get_alpha_3", get_alpha_3)

In [0]:
# from pyspark.sql.functions import pandas_udf, PandasUDFType

# # Use pandas_udf to define a Pandas UDF
# @pandas_udf('double', PandasUDFType.SCALAR)
# # Input/output are both a pandas.Series of doubles

# def pandas_plus_one(v):
#     return v + 1

# df.withColumn('v2', pandas_plus_one(df.v))

In [0]:
%sql
-- Apply the get_alpha_3 UDF to every distinct raw country string.
SELECT
  country,
  get_alpha_3(country) AS alpha_3
FROM paperCountries

country,alpha_3
"Spain, UNITED STATES",-1
Russia,RUS
"The Netherlands, The Netherlands",-1
Jamaica (,-1
"The Netherlands, France",-1
PR China,-1
Sweden,SWE
The Netherlands,NLD
"UK, UK",-1
Republic of Korea,PRK


### 国データのクレンジングのステップ

In [0]:
# Step 1: extract alpha_3 for the countries that can be identified as-is.
# Every distinct raw country string is passed through get_alpha_3.
paperCountries_s01 = spark.sql("""select country, get_alpha_3(country) as alpha_3 from paperCountries""")
# Mark the result for caching and expose it as the temporary view "paperCountries_s01".
paperCountries_s01.cache()
paperCountries_s01.createOrReplaceTempView("paperCountries_s01")

In [0]:
# Step 2: extract alpha_3 for countries that become identifiable once split
# (e.g. "USA, USA, USA", "Sweden, Norway", etc).
# Only step-1 rows whose alpha_3 is '-1' are processed. The inner query removes every
# character other than ASCII letters, commas and spaces, splits the rest on ',' and
# emits one row per fragment; the outer query trims each fragment before calling get_alpha_3.
# NOTE: country_cleansed keeps the untrimmed fragment; only the UDF argument is trimmed here.
paperCountries_s02 = spark.sql("""
select country, splitCountry as country_cleansed, get_alpha_3(ltrim(rtrim(splitCountry))) as alpha_3
  from (
select country, explode(split(regexp_replace(country, "[^a-zA-Z, ]+", ""), ',')) as splitCountry
  from paperCountries_s01
 where alpha_3 = '-1'
 ) x
""")
# Mark the result for caching and expose it as the temporary view "paperCountries_s02".
paperCountries_s02.cache()
paperCountries_s02.createOrReplaceTempView("paperCountries_s02")

In [0]:
# Step 3: extract the countries that are still unidentified after step 1 and step 2.
# The sentinel is compared as the string '-1', like the other steps' filters on alpha_3,
# instead of the integer -1, which would require an implicit string-to-int cast of the column.
# The selected columns are unchanged: raw country, trimmed fragment, and a get_alpha_3 call
# on the untrimmed fragment (left unaliased, as before).
paperCountries_s03 = spark.sql("""select country, ltrim(rtrim(country_cleansed)) as country_cleansed, get_alpha_3(country_cleansed) from paperCountries_s02 where alpha_3 = '-1'""")
# Mark the result for caching and expose it as the temporary view "paperCountries_s03".
paperCountries_s03.cache()
paperCountries_s03.createOrReplaceTempView("paperCountries_s03")

In [0]:
# Step 4: identify the country from the settlement.
# Each still-unidentified entry of paperCountries_s03 is joined back to paperAuthorLocation
# on the raw country string, and get_alpha_3 is applied to the settlement of the matching rows.
paperCountries_s04 = spark.sql("""
select distinct m.country_cleansed, f.settlement, get_alpha_3(f.settlement) as alpha_3
  from paperAuthorLocation f
    inner join paperCountries_s03 m
      on m.country = f.country
""")
# Mark the result for caching and expose it as the temporary view "paperCountries_s04".
paperCountries_s04.cache()
paperCountries_s04.createOrReplaceTempView("paperCountries_s04")

In [0]:
 # Step 5: build the new mapping.
# Keep the distinct (country_cleansed, alpha_3) pairs of step 4 whose alpha_3 is not the '-1' sentinel.
map_country_cleansed = spark.sql("""select distinct country_cleansed, alpha_3 from paperCountries_s04 where alpha_3 <> '-1'""")
# Mark the result for caching and expose it as the temporary view "map_country_cleansed".
map_country_cleansed.cache()
map_country_cleansed.createOrReplaceTempView("map_country_cleansed")

In [0]:
# Step 6: update paperCountries_s03 using the mapping built in step 5.
# The left join is followed by "m.alpha_3 is not null", so only the fragments
# that found an entry in map_country_cleansed remain in the result.
paperCountries_s06 = spark.sql("""
select f.country, f.country_cleansed, m.alpha_3
  from paperCountries_s03 f
    left join map_country_cleansed m
      on m.country_cleansed = f.country_cleansed
 where m.alpha_3 is not null      
""")
# Mark the result for caching and expose it as the temporary view "paperCountries_s06".
paperCountries_s06.cache()
paperCountries_s06.createOrReplaceTempView("paperCountries_s06")

### map_countryの構築 

上のパイプライン処理に基づきmap_countryを構築します。

In [0]:
# Combine the mappings produced by the cleansing steps into one raw-country -> alpha_3 lookup:
#   - step 1: country strings resolved as-is
#   - step 2: country strings resolved after splitting on commas
#   - step 6: country strings resolved through the settlement-based mapping
# "union" (rather than "union all") removes repeated (country, alpha_3) pairs, e.g. when
# several fragments of one raw string resolve to the same code in step 2. Repeated pairs
# would multiply the matching rows of any join against map_country.
# A raw country may still map to several different alpha_3 codes.
map_country = spark.sql("""
select country, alpha_3 from paperCountries_s01 where alpha_3 <> '-1'
union
select country, alpha_3 from paperCountries_s02 where alpha_3 <> '-1'
union
select country, alpha_3 from paperCountries_s06
""")
# Mark the result for caching and expose it as the temporary view "map_country".
map_country.cache()
map_country.createOrReplaceTempView("map_country")

### paperCountryMappedの構築

論文をalpha_3の地理的位置にマップしてすべてをまとめます。

In [0]:
# Attach the alpha_3 code to every paperAuthorLocation row by matching the raw country
# string against map_country. The left outer join keeps rows without a mapping
# (alpha_3 is null for them); a country with several mappings yields several rows.
paperCountryMapped = spark.sql("""
select p.paper_id, p.title, p.addrLine, p.country, p.postBox, p.postCode, p.region, p.settlement, m.alpha_3
 from paperAuthorLocation p
   left outer join map_country m
     on m.country = p.country
""")
# Mark the result for caching and expose it as the temporary view "paperCountryMapped".
paperCountryMapped.cache()
paperCountryMapped.createOrReplaceTempView("paperCountryMapped")

In [0]:
%sql
-- Inspect up to 100 rows of the paperCountryMapped temporary view.
SELECT *
FROM paperCountryMapped
LIMIT 100

paper_id,title,addrLine,country,postBox,postCode,region,settlement,alpha_3
d259c80b55cb69486ef75e66d13fe60688a1e028,Middle East Respiratory Coronavirus Accessory Protein 4a Inhibits PKR-Mediated Antiviral Stress Responses,Campus Universidad Autonoma de Madrid,"Spain, UNITED STATES",,,,Madrid,ESP
d259c80b55cb69486ef75e66d13fe60688a1e028,Middle East Respiratory Coronavirus Accessory Protein 4a Inhibits PKR-Mediated Antiviral Stress Responses,Campus Universidad Autonoma de Madrid,"Spain, UNITED STATES",,,,Madrid,USA
d259c80b55cb69486ef75e66d13fe60688a1e028,Middle East Respiratory Coronavirus Accessory Protein 4a Inhibits PKR-Mediated Antiviral Stress Responses,Campus Universidad Autonoma de Madrid,"Spain, UNITED STATES",,,,Madrid,ESP
d259c80b55cb69486ef75e66d13fe60688a1e028,Middle East Respiratory Coronavirus Accessory Protein 4a Inhibits PKR-Mediated Antiviral Stress Responses,Campus Universidad Autonoma de Madrid,"Spain, UNITED STATES",,,,Madrid,USA
d259c80b55cb69486ef75e66d13fe60688a1e028,Middle East Respiratory Coronavirus Accessory Protein 4a Inhibits PKR-Mediated Antiviral Stress Responses,Campus Universidad Autonoma de Madrid,"Spain, UNITED STATES",,,,Madrid,ESP
d259c80b55cb69486ef75e66d13fe60688a1e028,Middle East Respiratory Coronavirus Accessory Protein 4a Inhibits PKR-Mediated Antiviral Stress Responses,Campus Universidad Autonoma de Madrid,"Spain, UNITED STATES",,,,Madrid,USA
d0a7af58aa5e272f1c7aef4e6908dd3059d9173e,Proteasome inhibition in cancer is associated with enhanced tumor targeting by the adeno-associated virus/phage,,United Kingdom A R,,,,London,GBR
d0a7af58aa5e272f1c7aef4e6908dd3059d9173e,Proteasome inhibition in cancer is associated with enhanced tumor targeting by the adeno-associated virus/phage,,United Kingdom A R,,,,London,GBR
c7d60067e11331d3c5e1f9b1d79e70caacb13f25,,,Utah,,,,Salt Lake City,USA
c7d60067e11331d3c5e1f9b1d79e70caacb13f25,,,Utah,,,,Salt Lake City,USA


### paperCountryMappedの統計情報

In [0]:
# Each query returns a single row: (row count, distinct paper count) for one filter
# over paperCountryMapped.
no_geo_row = spark.sql("select count(1), count(distinct paper_id) from paperCountryMapped where country is null and settlement is null").collect()[0]
some_geo_row = spark.sql("select count(1), count(distinct paper_id) from paperCountryMapped where country is not null or settlement is not null").collect()[0]
alpha_3_row = spark.sql("select count(1), count(distinct paper_id) from paperCountryMapped where alpha_3 is not null").collect()[0]

# Unpack into the (rows, distinct papers) pairs.
ep_no, edp_no = no_geo_row
ep_geo, edp_geo = some_geo_row
ep_a3, edp_a3 = alpha_3_row

# Report only the distinct-paper counts.
for report_line, distinct_papers in (
    ("Distinct Papers with No Geographic Information: %s", edp_no),
    ("Distinct Papers with Some Geographic Information: %s", edp_geo),
    ("Distinct Papers with Identified Alpha_3 codes: %s", edp_a3),
):
    print(report_line % distinct_papers)

## 論文と国のマッピングの可視化

論文ごとの著者の国をマッピングします。一つの論文に対して複数の著者がいる場合にはダブルカウントになることに注意してください。

In [0]:
%sql
-- Number of distinct papers per identified alpha_3 country code.
SELECT
  alpha_3,
  count(distinct paper_id)
FROM paperCountryMapped
WHERE alpha_3 IS NOT NULL
GROUP BY alpha_3

alpha_3,count(DISTINCT paper_id)
PSE,1
HTI,2
LVA,2
POL,37
JAM,2
BRA,124
MOZ,2
CUB,3
JOR,7
FRA,284


# END