BeautifulSoup 사용하여 HTML 파싱

In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# 1) 문자열에서 soup을 생성한다.

soup1 = BeautifulSoup("<HTML><HEAD><<headers>></HEAD><<body>></HTML>")

# 2) 로컬 파일에서 soup을 생성한다. -> myDoc.html 파일을 사용

soup2 = BeautifulSoup(open("myDoc.html"))

# 3) 웹문서에서 soup을 생성한다 -> 크롤링의 기초
soup3 = BeautifulSoup(urlopen("http://www.networksciencelab.com/"))

In [3]:
soup1

<html><head></head><body><p>&lt;<headers>&gt;&lt;&gt;</headers></p></body></html>

In [4]:
soup2

<html><head></head><body><p>&lt;<headers>&gt;&lt;&gt;</headers></p></body></html>

In [5]:
soup3

<html>
<head>
<title>My Little Network Science Lab</title>
<link href="style.css" rel="stylesheet" type="text/css"/>
<meta charset="utf-8"/>
</head>
<body>
<h1>My Little Network Science Lab</h1>
<h2>By Dmitry Zinoviev</h2>
<p>
</p><table class="hdr"><tr><td><h3 class="nomargin">Books</h3></td></tr></table>
<p>
<a href="https://pragprog.com/book/dzpyds/data-science-essentials-in-python"><img align="left" border="1" src="https://imagery.pragprog.com/products/490/dzpyds_xlargecover.jpg?1468006361"/></a>
<a href="https://pragprog.com/book/dzcnapy/complex-network-analysis-in-python"><img align="left" border="1" src="https://imagery.pragprog.com/products/541/dzcnapy_xlargecover.jpg?1508250011"/></a>


I am excited to announce my books, "Data Science Essentials in Python. Collect →  Organize →  Explore →  Predict →  Value" (a.k.a. DZPYDS) and "Complex Network Analysis in Python. Recognize → Construct → Visualize → Analyze → Interpret" (a.k.a. DZCNAPY), published by the Pragmatic Bookshelf.
</

find(), find_all() 특정 속성 (href = 하이퍼링크 정보) 값 찾기

In [10]:
# http://www.networkscienecelab.com/ 사이트에서 하이퍼링크 정보 모두 가져오기
with urlopen("http://www.networksciencelab.com/") as doc:
  soup = BeautifulSoup(doc)

# 하이퍼 링크 추출
links = [ (link.string, link["href"])
for link in soup.find_all("a")
if link.has_attr("href") ]
links

# 
# if link.has_attr("href") -> 
# for link in soup.find_all("a") -> a로 시작하는 링크를 모두 찾아

[(None, 'https://pragprog.com/book/dzpyds/data-science-essentials-in-python'),
 (None,
  'https://pragprog.com/book/dzcnapy/complex-network-analysis-in-python'),
 ('DZPYDS', 'https://www.amazon.com/gp/product/1680501844'),
 ('DZCNAPY', 'https://www.amazon.com/gp/product/1680502697'),
 ('Networks of Music Groups as Success Predictors',
  'http://www.slideshare.net/DmitryZinoviev/networks-of-music-groups-as-success-predictors'),
 ('Network Science Workshop',
  'http://www.slideshare.net/DmitryZinoviev/workshop-20212296'),
 ('Resilience in Transaction-Oriented Networks',
  'http://www.slideshare.net/DmitryZinoviev/resilience-in-transactional-networks'),
 ('Peer Ratings in Massive Online Social Networks',
  'http://www.slideshare.net/DmitryZinoviev/peer-ratings-in-massive-online-social-networks'),
 ('Semantic Networks of Interests in Online NSSI Communities',
  'http://www.slideshare.net/DmitryZinoviev/presentation-31680572'),
 ('Towards an Ideal Store',
  'http://www.slideshare.net/Dmitry

CSV 파일 다루기

statistics 모듈을 사용해서 나이 변수의 평균과 표준편차 구하기

데이터 가져오기

In [None]:
import csv

with open("/content/Demographic_Statistics_By_Zip_Code.csv") as infile:
  data = list(csv.reader(infile))
data

In [10]:
# 첫번째 레코드에 COUNT PARTICIPANTS 데이터 인덱스 값 추출 data[0] 컬럼 값
countParticipantsIndex = data[0].index("COUNT PARTICIPANTS")
countParticipantsIndex


1

In [None]:
countParticipants = [int(row[countParticipantsIndex]) for row in data[1:]] # 리스트 내포
countParticipants

나이의 평균과 표준편차

In [5]:
import statistics #사실, statistics 모듈을 잘 안쓰고 사실은 pandas를 더 많이 쓴다.
print(statistics.mean(countParticipants), statistics.stdev(countParticipants))

17.661016949152543 43.27973735299687


In [12]:
pip install nltk #이미 코랩에 nltk 모듈이 설치되어있다. already



In [13]:
import nltk
nltk.__version__

'3.2.5'

nltk에서 영단어 온톨로지(wordnet) 사용하기

In [14]:
# https://frhyme.github.io/python-lib/nltk-wordnet/
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [15]:
wn = nltk.corpus.wordnet # 코퍼스 리더(reader)

# synset : 유의어 묶음으로 단어-품사-순번 으로 구성

wn.synsets("cat")

[Synset('cat.n.01'),
 Synset('guy.n.01'),
 Synset('cat.n.03'),
 Synset('kat.n.01'),
 Synset('cat-o'-nine-tails.n.01'),
 Synset('caterpillar.n.02'),
 Synset('big_cat.n.01'),
 Synset('computerized_tomography.n.01'),
 Synset('cat.v.01'),
 Synset('vomit.v.01')]

In [26]:
wn.synset('cat.n.01').hypernyms() # hypernyms 상의어
wn.synset('cat.n.01').hyponyms() # hyponyms : 하의어

[Synset('domestic_cat.n.01'), Synset('wildcat.n.03')]

WordNet 사용해서 synset 간 의미론적 유사도 계산하기 (0-1 사이 실수)<br>
0 이면 두 단어 서로 관계 없음
1 이면 완전한 유의어

In [17]:
# 고양이, 링스 : 시라소니 유사도 계산
x = wn.synset('cat.n.01') # 고양이
y = wn.synset('lynx.n.01') # 시라소니

# x,y 유사도 계산
x.path_similarity(y)

0.04

In [20]:
# 고양이, 링스 : 집고양이 유사도 계산
x = wn.synset('cat.n.01') # 고양이
y = wn.synset('domestic_cat.n.01') # 집고양이

# x,y 유사도 계산
x.path_similarity(y)

0.5

In [21]:
# 고양이, 고양이 : 고양이 유사도 계산
x = wn.synset('cat.n.01') # 고양이
y = wn.synset('cat.n.01') # 고양이

# x,y 유사도 계산
x.path_similarity(y)

1.0

In [22]:
# 고양이, 강아지 : 고양이 유사도 계산
x = wn.synset('cat.n.01') # 고양이
y = wn.synset('dog.n.01') # 강아지

# x,y 유사도 계산
x.path_similarity(y)

0.2

In [23]:
wn.synset('dog.n.01').hypernyms()

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]

In [24]:
wn.synset('dog.n.01').hyponyms()

[Synset('basenji.n.01'),
 Synset('corgi.n.01'),
 Synset('cur.n.01'),
 Synset('dalmatian.n.02'),
 Synset('great_pyrenees.n.01'),
 Synset('griffon.n.02'),
 Synset('hunting_dog.n.01'),
 Synset('lapdog.n.01'),
 Synset('leonberg.n.01'),
 Synset('mexican_hairless.n.01'),
 Synset('newfoundland.n.01'),
 Synset('pooch.n.01'),
 Synset('poodle.n.01'),
 Synset('pug.n.01'),
 Synset('puppy.n.01'),
 Synset('spitz.n.01'),
 Synset('toy_dog.n.01'),
 Synset('working_dog.n.01')]

In [30]:
# 토큰화 (텍스트 -> 단어로 쪼갠다.)

from nltk.tokenize import WordPunctTokenizer
word_punct = WordPunctTokenizer()

text = " }Help!   :)))   :[   ..... :D{ "

# Tokenizer는 모든 구두점(punctuation; 문장부호)을 기준으로 분리
# -> 이모티콘을 이용한 감성 분석 등 문장구조 깊이 분석 할 때 사용

word_punct.tokenize(text)

['}', 'Help', '!', ':)))', ':[', '.....', ':', 'D', '{']

In [32]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [33]:
# 단어 토크나이저 = 특수기호도 다 그냥 하나로 인식해서 짤라버린다. 단어추출하기좋다.
nltk.word_tokenize(text)

['}', 'Help', '!', ':', ')', ')', ')', ':', '[', '...', '..', ':', 'D', '{']

In [38]:
# 2. 단어의 대소문자를 통일한다 (전부 다 대문자 or 소문자)
# 3. 불용어 제거 (stopwords 리스트 참고 THE 같은 것들 제거)
# 4. 형태소 분석(stemming) 단어를 형태소로 변환

# 포터 형태소 분석기 (보수적)

pstemmer = nltk.PorterStemmer()
pstemmer.stem("wonderful")

'wonder'

In [39]:
# 랭커스터 형태소 분석기 (적극적) - 더 많은 동음이의어 형태소 생산
lstemmer = nltk.LancasterStemmer()
lstemmer.stem("wonderful")

'wond'

In [40]:
# 5. 원형 추출
lemmatizer = nltk.WordNetLemmatizer()
lemmatizer.lemmatize("wonderful")

'wonderful'

In [43]:
# 그 외 품사 태깅
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(["beautiful", "world"]) # JJ	adjective(형용사), NN	noun(명사)
# https://happygrammer.github.io/nlp/postag-set/ 참조

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('beautiful', 'JJ'), ('world', 'NN')]

index.html 파일에서 (불용어 제외) 가장 많이 등장한 단어 원형 찾아보기

In [73]:
# www.networksciencelab.com 사이트에서 하이퍼 링크 정보 모두 가져오기

# http://www.networkscienecelab.com/ 사이트에서 하이퍼링크 정보 모두 가져오기
from urllib.request import urlopen
from bs4 import BeautifulSoup

with urlopen("http://www.networksciencelab.com/") as doc:
  soup = BeautifulSoup(doc)

# 하이퍼 링크 추출
links = [ (link.string, link["href"]) for link in soup.find_all("a") if link.has_attr("href")]


In [70]:
#책 제목만 추출 (하이퍼링크는 필요없으니까, 책 제목만 추출 => 0번인덱스에는 책제목, 1번인덱스에는 하이퍼링크 니까 0번인덱스만 가져오자!)
html_text = [i[0] for i in links] #여기까지는 리스트 형태로 담았다.

html_text

[None,
 None,
 'DZPYDS',
 'DZCNAPY',
 'Networks of Music Groups as Success Predictors',
 'Network Science Workshop',
 'Resilience in Transaction-Oriented Networks',
 'Peer Ratings in Massive Online Social Networks',
 'Semantic Networks of Interests in Online NSSI Communities',
 'Towards an Ideal Store',
 'D.Zinoviev, "Analyzing Cultural Domains with Python,"',
 'D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities,"',
 'D.Zinoviev, "The Pain of Complexity,"',
 'D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks,"',
 'D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems,"',
 'D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks,"',
 'D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities,"',
 'D.Zi

In [71]:
# None 데이터 제거 -> 리스트에서 하나의 텍스트로 변환
html_text = "       ".join(html_text[2:]) #조인을 써버리니까, 텍스트형태로 띄어쓰기 준만큼 띄어서 연결되어 담긴다.

html_text

'DZPYDS       DZCNAPY       Networks of Music Groups as Success Predictors       Network Science Workshop       Resilience in Transaction-Oriented Networks       Peer Ratings in Massive Online Social Networks       Semantic Networks of Interests in Online NSSI Communities       Towards an Ideal Store       D.Zinoviev, "Analyzing Cultural Domains with Python,"       D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities,"       D.Zinoviev, "The Pain of Complexity,"       D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks,"       D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems,"       D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks,"       D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communiti

In [59]:
# 하나의 텍스트 -> index.html 파일로 저장
html_file = open("index.html", 'w')
html_file.write(html_text)
html_file.close()

In [81]:
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk import LancasterStemmer

# 형태소 분류기를 생성
ls = nltk.LancasterStemmer()

# 파일을 읽고 soup을 만든다.
with open("/content/index.html") as infile:
  soup = BeautifulSoup(infile)

In [82]:
soup # html < > 이런거 보기 싫으면?

<html><body><p>DZPYDS    DZCNAPY    Networks of Music Groups as Success Predictors    Network Science Workshop    Resilience in Transaction-Oriented Networks    Peer Ratings in Massive Online Social Networks    Semantic Networks of Interests in Online NSSI Communities    Towards an Ideal Store    D.Zinoviev, "Analyzing Cultural Domains with Python,"    D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities,"    D.Zinoviev, "The Pain of Complexity,"    D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks,"    D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems,"    D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks,"    D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities,"    D.Zinoviev and S.Lle

In [83]:
#text만 가지고 오고 싶을때
soup.text 

'DZPYDS    DZCNAPY    Networks of Music Groups as Success Predictors    Network Science Workshop    Resilience in Transaction-Oriented Networks    Peer Ratings in Massive Online Social Networks    Semantic Networks of Interests in Online NSSI Communities    Towards an Ideal Store    D.Zinoviev, "Analyzing Cultural Domains with Python,"    D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities,"    D.Zinoviev, "The Pain of Complexity,"    D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks,"    D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems,"    D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks,"    D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities,"    D.Zinoviev and S.Llewelyn, "Co-Evo

In [None]:
# 1) 텍스트 추출 -> 토큰화
words = nltk.word_tokenize(soup.text)
words

In [None]:
# 2) 단어를 소문자로 변환
words = [w.lower() for w in words]
words

In [89]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [92]:
# 3) 불용어 및 특수기호를 제거하고 단어의 형태소를 추출
# 3-1) 불용어 제거 stopwords.words("english")
# 3-2) 특수기호 제거 isalnum()
# 3-3) 형태소를 추출 ls.stem(w)
words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalnum()] #isalnum() : 문자열이 알파벳([a-zA-Z])과 숫자([0-9])로만 구성
words

['dzpyds',
 'dzcnapy',
 'network',
 'mus',
 'group',
 'success',
 'predict',
 'network',
 'sci',
 'workshop',
 'resy',
 'network',
 'peer',
 'rat',
 'mass',
 'onlin',
 'soc',
 'network',
 'sem',
 'network',
 'interest',
 'onlin',
 'nssi',
 'commun',
 'toward',
 'id',
 'stor',
 'ana',
 'cult',
 'domain',
 'python',
 'zinoviev',
 'stefanescu',
 'firem',
 'swenson',
 'sem',
 'network',
 'interest',
 'onlin',
 'commun',
 'pain',
 'complex',
 'build',
 'produc',
 'network',
 'mitig',
 'delay',
 'man',
 'cost',
 'system',
 'sim',
 'resy',
 'network',
 'sem',
 'network',
 'interest',
 'onlin',
 'nssi',
 'commun',
 'friend',
 'publ',
 'onlin',
 'blog',
 'soc',
 'network',
 'inform',
 'diff',
 'soc',
 'network',
 'gam',
 'theoret',
 'approach',
 'broadcast',
 'inform',
 'diff',
 'soc',
 'network',
 'gam',
 'theoret',
 'approach',
 'model',
 'inform',
 'dissemin',
 'gam',
 'theoret',
 'approach',
 'model',
 'inform',
 'dissemin',
 'soc',
 'network',
 'toward',
 'understand',
 'friend',
 'onlin',

In [94]:
# 4) 가장 빈번하게 등장하는 단어 5개 추출
freqs = Counter(words)
print(freqs.most_common(5))
# 이 사이트에서 가장 핫한 키워드는? network 이다!

[('network', 16), ('soc', 8), ('onlin', 7), ('inform', 4), ('gam', 4)]
