In [8]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the records of ``sequence`` while showing a notebook progress widget.

    A label and a progress bar are stacked in a ``VBox`` and displayed once,
    right before iteration begins. The widgets are refreshed on the first
    record and then on every ``every``-th record.

    Args:
        sequence: Iterable to walk through. When ``size`` is not given and
            ``len(sequence)`` raises ``TypeError``, the total is treated as
            unknown and the label shows ``?`` in place of the total.
        every: Refresh interval, counted in records. When a size is known and
            ``every`` is omitted, it is 1 for sizes up to 200 and
            ``int(size / 200)`` otherwise. Required when the size is unknown.
        size: Total number of records; defaults to ``len(sequence)``.
        name: Prefix used in the label text.

    Yields:
        Each record of ``sequence``, unchanged and in order.

    Raises:
        AssertionError: if the size is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Roughly 200 refreshes in total (one every 0.5%) for long sequences.
        every = 1 if size <= 200 else int(size / 200)

    if unknown_length:
        # No total to measure against: show a full bar in the "info" style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            refresh_due = count == 1 or count % every == 0
            if refresh_due and unknown_length:
                caption.value = '{name}: {index} / ?'.format(name=name, index=count)
            elif refresh_due:
                bar.value = count
                caption.value = u'{name}: {index} / {size}'.format(
                    name=name, index=count, size=size
                )
            yield item
    except:
        # Deliberately bare: this also fires when the generator is closed
        # before exhaustion, so an aborted loop turns the bar red as well.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(name=name, index=str(count or '?'))

In [9]:
## Crawl target site: https://www.computer.org (IEEE)

# Libraries used

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from multiprocessing import Pool, Array, Process
import time


In [10]:
# Helper that crawls the link URLs found on one page

def croll_link(next_url, link_name, timeout=30):
    """Return the ``href`` of every element matching a CSS selector on a page.

    Args:
        next_url: Path appended to ``https://www.computer.org`` to form the
            page URL.
        link_name: CSS selector passed to ``soup.select`` to pick the elements
            whose ``href`` attributes are collected.
        timeout: Seconds handed to ``requests.get``. Without it, a stalled
            connection would block the notebook indefinitely.

    Returns:
        list: The ``href`` values in document order. Matched elements that
        carry no ``href`` attribute are skipped, so callers can safely slice
        or concatenate every entry as a string.
    """
    req = requests.get('https://www.computer.org' + next_url, timeout=timeout)
    soup = bs(req.text, 'html.parser')

    hrefs = (anchor.get("href") for anchor in soup.select(link_name))
    return [href for href in hrefs if href is not None]

In [11]:
# Crawl: the site is organised as subject -> year -> issue (2-3 months) -> content

start = time.time()

# Landing page -> links to each computer-science subject
transactionLinks = croll_link('/csdl', 'a.transactionLink')

# First subject -> its volumes (one per year)
volumeLinks = croll_link(transactionLinks[0], 'a.volumeLink')

# First volume -> its issues (months)
issueLinks = croll_link(volumeLinks[0], 'a.issueLink')

# First issue -> every link attached to its content entries
abstractLinks = croll_link(
    issueLinks[0],
    'div.col-xs-12 > div > div > div > button > a',
)

# Keep only the HTML abstract pages. The javascript and pdf links share the
# same tags, so the selector alone cannot tell them apart.
new_absLinks = [link for link in abstractLinks if link[-8:] == 'abs.html']

close = time.time()

print(close - start)


20.603402137756348


In [12]:
# Narrow abstractLinks down to the HTML abstract pages.
# Filtering abstractLinks itself (instead of copying new_absLinks and then
# clearing it) keeps this cell idempotent: re-running it no longer replaces
# abstractLinks with the already-emptied new_absLinks.
abstractLinks = [link for link in abstractLinks if link.endswith('abs.html')]
new_absLinks = []

#abstractLinks

In [13]:
# Contents: download and parse the second abstract page as a sample
sample_url = 'https://www.computer.org' + abstractLinks[1]
req = requests.get(sample_url)
soup = bs(req.text, 'html.parser')

In [14]:
# Crawl the title: text of the first heading matched inside the title block
title_nodes = soup.select('div.abstractTitle > a > h2')
abs_title = title_nodes[0].text
abs_title

'Simulating the Large-Scale Erosion of Genomic Privacy Over Time'

In [15]:
# The link itself carries the date: characters 15-21 of the path.
# NOTE(review): relies on a fixed-length prefix before the date - confirm for other journals.
sample_link = abstractLinks[1]
abs_yymm = sample_link[15:22]
abs_yymm

'2018/05'

In [16]:
# Crawl the contents: text of the first abstract block on the page
abstract_nodes = soup.select('div.abstractText.abstractTextMB')
abs_contents = abstract_nodes[0].text
#abs_contents


## Data set 늘리기


In [17]:
# Merge all months together - they all belong to the same volume.
# Crawling everything looked like too much, so for now only the issues
# (months) of this one volume are combined.

total_absLink = []
for issue_link in issueLinks:
    total_absLink.extend(
        croll_link(issue_link, 'div.col-xs-12 > div > div > div > button > a')
    )

#total_absLink

In [18]:
# Keep only the links whose last eight characters are 'abs.html'
final_absLink = [link for link in total_absLink if link[-8:] == 'abs.html']
#final_absLink

In [22]:
from datetime import datetime

# Second-resolution timestamp of this run; croll_abs uses it in the output file name.
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Wall-clock start, used to time the crawl below.
start = time.time()
def croll_abs(link, timeout=30):
    """Scrape one abstract page and append it as a TSV row to the run's output file.

    The page is fetched from ``https://www.computer.org`` + ``link``. The row
    ``date<TAB>title<TAB>abstract`` is appended to ``./<now>_output.txt``,
    where ``now`` is the module-level run timestamp.

    Args:
        link: Site-relative path of the abstract page. Characters 15-21 of it
            are stored as the date.
            NOTE(review): that slice assumes a fixed-length path prefix - confirm
            before crawling other journals.
        timeout: Seconds handed to ``requests.get`` so a stalled request cannot
            block a pool worker forever.

    Returns:
        dict: The scraped record with keys ``'title'``, ``'abstract'`` and
        ``'date'``. This lets ``pool.map`` collect the records instead of a
        list of ``None``.

    Raises:
        IndexError: if the title or abstract selector matches nothing.
    """
    req = requests.get('https://www.computer.org' + link, timeout=timeout)
    soup = bs(req.text, 'html.parser')
    journal = {}
    journal['title'] = soup.select('div.abstractTitle > a > h2')[0].text
    journal['abstract'] = soup.select('div.abstractText.abstractTextMB')[0].text
    journal['date'] = link[15:22]

    filename = ("./" + "%s" + "_output.txt") % now
    # Abstracts contain non-ASCII punctuation, so pin the encoding rather than
    # depending on the platform default.
    with open(filename, 'a', encoding='utf-8') as f:
        o_data = "%s\t%s\t%s\n" % (journal['date'], journal['title'], journal['abstract'])
        f.write(o_data)

    return journal

# Scrape all abstract pages with four worker processes.
# The context manager terminates the workers once map() has returned, so
# repeated runs of this cell do not leave idle processes behind in the kernel.
with Pool(processes=4) as pool:
    p = pool.map(croll_abs, final_absLink)

close = time.time()
print(close - start)

75.53028893470764


In [39]:
# Build the data frame - 175 rows in total
# NOTE(review): date, title and contents are not defined in the cells shown here - confirm their source.
data = dict(Date=date, Title=title, Contents=contents)

frame = pd.DataFrame(data)
frame

Unnamed: 0,Date,Title,Contents
0,2018/05,GenoPri'16: International Workshop on Genome P...,
1,2018/05,Simulating the Large-Scale Erosion of Genomic ...,The dramatically decreasing costs of DNA seque...
2,2018/05,Protecting Privacy and Security of Genomic Dat...,Re-use of patients’ health records can provide...
3,2018/05,Implementation and Evaluation of an Algorithm ...,We improve the quality of cryptographically pr...
4,2018/05,Guest Editorial for Special Section on the 12t...,
5,2018/05,Identification and Analysis of Key Residues in...,Protein–RNA complexes play important roles in ...
6,2018/05,Unified Deep Learning Architecture for Modelin...,Prediction of the spatial structure or functio...
7,2018/05,Mutli-Features Prediction of Protein Translati...,Post translational modification plays a signif...
8,2018/05,Sequence-Based Prediction of Putative Transcri...,A transcription factor (TF) is a protein that ...
9,2018/05,Prediction of Hot Regions in PPIs Based on Imp...,The hot regions in PPIs are some assembly regi...


In [40]:
# Save the data frame to a file: the index column is written, the header row is not
frame.to_csv("data.csv", index=True, header=False)

In [42]:
# Show the full abstract stored in the row labelled 1
frame.loc[1, 'Contents']

"The dramatically decreasing costs of DNA sequencing have triggered more than a million humans to have their genotypes sequenced. Moreover, these individuals increasingly make their genomic data publicly available, thereby creating privacy threats for themselves and their relatives because of their DNA similarities. More generally, an entity that gains access to a significant fraction of sequenced genotypes might be able to infer even the genomes of unsequenced individuals. In this paper, we propose a simulation-based model for quantifying the impact of continuously sequencing and publicizing personal genomic data on a population's genomic privacy. Our simulation probabilistically models data sharing and takes into account events such as migration and interracial mating. We exemplarily instantiate our simulation with a sample population of 1,000 individuals and evaluate the privacy under multiple settings over 6,000 genomic variants and a subset of phenotype-related variants. Our findi