BeautifulSoup 사용하여 HTML 파싱

In [49]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [None]:
# Build a soup directly from an in-memory markup string.
soup1 = BeautifulSoup('<HTML><HEAD><<headers>></HEAD><<body>></HTML>')


# Build a soup from a local file (myDoc.html). The context manager closes the
# file handle once BeautifulSoup has consumed it.
with open('myDoc.html') as local_doc:
    soup2 = BeautifulSoup(local_doc)


# Build a soup from a web document (the basis of crawling). The context
# manager closes the HTTP response once it has been parsed.
with urlopen('http://www.networksciencelab.com/') as response:
    soup3 = BeautifulSoup(response)


In [None]:
# Display the soup parsed from the inline markup string.
soup1

<html><head></head><body><p>&lt;<headers>&gt;&lt;&gt;</headers></p></body></html>

In [None]:
# Display the soup parsed from the local file myDoc.html.
soup2

<html><head></head><body><p>&lt;<headers>&gt;&lt;&gt;</headers></p></body></html>

In [None]:
# Display the soup parsed from the downloaded web page.
soup3

<html>
<head>
<title>My Little Network Science Lab</title>
<link href="style.css" rel="stylesheet" type="text/css"/>
<meta charset="utf-8"/>
</head>
<body>
<h1>My Little Network Science Lab</h1>
<h2>By Dmitry Zinoviev</h2>
<p>
</p><table class="hdr"><tr><td><h3 class="nomargin">Books</h3></td></tr></table>
<p>
<a href="https://pragprog.com/book/dzpyds/data-science-essentials-in-python"><img align="left" border="1" src="https://imagery.pragprog.com/products/490/dzpyds_xlargecover.jpg?1468006361"/></a>
<a href="https://pragprog.com/book/dzcnapy/complex-network-analysis-in-python"><img align="left" border="1" src="https://imagery.pragprog.com/products/541/dzcnapy_xlargecover.jpg?1508250011"/></a>


I am excited to announce my books, "Data Science Essentials in Python. Collect →  Organize →  Explore →  Predict →  Value" (a.k.a. DZPYDS) and "Complex Network Analysis in Python. Recognize → Construct → Visualize → Analyze → Interpret" (a.k.a. DZCNAPY), published by the Pragmatic Bookshelf.
</

find(), find_all() 특정 속성 (href = 하이퍼링크 정보) 값 찾기

In [None]:
# Download http://www.networksciencelab.com/ and parse it; the response is
# closed when the with-block exits.
with urlopen('http://www.networksciencelab.com/') as urlo:
    soup4 = BeautifulSoup(urlo)

# Collect a [link text, target URL] pair for every <a> tag that has an href.
links = [
    [anchor.string, anchor['href']]
    for anchor in soup4.find_all('a', href=True)
]

In [None]:
# Display the extracted [text, href] pairs.
links

[[None, 'https://pragprog.com/book/dzpyds/data-science-essentials-in-python'],
 [None,
  'https://pragprog.com/book/dzcnapy/complex-network-analysis-in-python'],
 ['DZPYDS', 'https://www.amazon.com/gp/product/1680501844'],
 ['DZCNAPY', 'https://www.amazon.com/gp/product/1680502697'],
 ['Networks of Music Groups as Success Predictors',
  'http://www.slideshare.net/DmitryZinoviev/networks-of-music-groups-as-success-predictors'],
 ['Network Science Workshop',
  'http://www.slideshare.net/DmitryZinoviev/workshop-20212296'],
 ['Resilience in Transaction-Oriented Networks',
  'http://www.slideshare.net/DmitryZinoviev/resilience-in-transactional-networks'],
 ['Peer Ratings in Massive Online Social Networks',
  'http://www.slideshare.net/DmitryZinoviev/peer-ratings-in-massive-online-social-networks'],
 ['Semantic Networks of Interests in Online NSSI Communities',
  'http://www.slideshare.net/DmitryZinoviev/presentation-31680572'],
 ['Towards an Ideal Store',
  'http://www.slideshare.net/Dmitry

In [None]:
# Fetch the Naver news front page and extract its hyperlinks.
# Calling urlopen() with the bare URL raised HTTPError in this notebook.
# NOTE(review): presumably the server rejects urllib's default
# "Python-urllib/x.y" User-Agent — confirm. A browser-like header is sent.
from urllib.request import Request

naver_request = Request('https://news.naver.com/',
                        headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(naver_request) as urlo:
    soup4 = BeautifulSoup(urlo)

# Collect a [link text, target URL] pair for every <a> tag that has an href.
links = [[link.string, link['href']] for link in soup4.find_all('a') if link.has_attr('href')]

HTTPError: ignored

CSV 파일 다루기

statistics 모듈을 사용해서 COUNT PARTICIPANTS 변수의 평균과 표준편차 구하기

In [1]:
# 데이터 가져오기
import csv

In [2]:
# Load the whole CSV into memory as a list of rows (each row is a list of
# strings). newline='' lets the csv module handle line endings itself, so
# quoted fields containing newlines are parsed correctly.
with open('/content/Demographic_Statistics_By_Zip_Code.csv', newline='') as file:
    data = list(csv.reader(file))

In [7]:
# data[0] is the header row; look up the position of the
# 'COUNT PARTICIPANTS' column by name instead of hard-coding it.
cntpantsindex = data[0].index('COUNT PARTICIPANTS')
cntpantsindex

1

In [None]:
# Extract the COUNT PARTICIPANTS column, skipping the header row.
# csv.reader yields every field as a string, so each value is converted to
# int explicitly before any numeric work.
cntpants = [
    int(record[cntpantsindex])
    for record in data[1:]
]
cntpants

In [None]:
## Counter-examples (how NOT to extract the column)
# These use their own names so the integer list in `cntpants` is not
# overwritten with strings, which would break statistics.mean/stdev later.
all_rows = data[1:]  # without selecting a column, every column of each row is kept
cntpants_by_position = [row[1] for row in data[1:]]  # a hard-coded index returns the wrong column if the layout changes
print(cntpants_by_position)

COUNT PARTICIPANTS의 평균과 표준편차

In [13]:
import statistics

In [14]:
# Mean and sample standard deviation of the COUNT PARTICIPANTS values.
mean_count = statistics.mean(cntpants)
stdev_count = statistics.stdev(cntpants)
print(mean_count, stdev_count)

17.661016949152543 43.27973735299687


자연어 처리하기

In [18]:
pip install nltk    # install nltk



In [21]:
import nltk
nltk.__version__    # check the installed version after importing

'3.2.5'

nltk에서 영단어 온톨로지(wordnet) 사용하기

In [22]:
# Reference: https://frhyme.github.io/python-lib/nltk-wordnet/

# Download the WordNet corpus data used by nltk.corpus.wordnet.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [24]:
wn = nltk.corpus.wordnet           # corpus reader

# synset: a set of synonyms, named as word.part-of-speech.sense-number
wn.synsets('cat')

[Synset('cat.n.01'),
 Synset('guy.n.01'),
 Synset('cat.n.03'),
 Synset('kat.n.01'),
 Synset('cat-o'-nine-tails.n.01'),
 Synset('caterpillar.n.02'),
 Synset('big_cat.n.01'),
 Synset('computerized_tomography.n.01'),
 Synset('cat.v.01'),
 Synset('vomit.v.01')]

In [25]:
# Only the last expression of a notebook cell is displayed, so the hypernyms
# result is computed here but not shown.
wn.synset('cat.n.01').hypernyms()   # hypernyms: broader (parent) synsets
wn.synset('cat.n.01').hyponyms()    # hyponyms: narrower (child) synsets

[Synset('domestic_cat.n.01'), Synset('wildcat.n.03')]

WordNet 사용해서 synset 간의 의미론적 유사도 계산하기 (0~1 사이의 실수)

0이면 두 단어 서로 관계 없음

1이면 두 단어 완전한 유의어

In [28]:
# Compute the similarity between "cat" and "lynx".
x = wn.synset('cat.n.01')  # cat
# NOTE(review): confirm that sense 01 of 'lynx' is the animal and not another
# sense — check wn.synset('lynx.n.01').definition().
y = wn.synset('lynx.n.01')  # lynx

# Path similarity between x and y.
x.path_similarity(y)

0.04

In [27]:
# Sanity check: similarity between the two hyponyms of cat.n.01.
tx = wn.synset('domestic_cat.n.01')
ty = wn.synset('wildcat.n.03')

tx.path_similarity(ty)

0.3333333333333333

자연어 처리 정규화

In [40]:
# 1. Tokenization (split the text into words)
from nltk.tokenize import WordPunctTokenizer
word_punct = WordPunctTokenizer()

# Sample text mixing words, punctuation runs and emoticons.
text = 'Hello!!,::%!@gu#:)hou%!^&*  *&*&*  :):):):):: :+-[]'

In [41]:
# This tokenizer splits on every punctuation mark.
# -> useful when sentence structure must be analysed in depth, e.g.
#    emoticon-based sentiment analysis

word_punct.tokenize(text)

['Hello',
 '!!,::%!@',
 'gu',
 '#:)',
 'hou',
 '%!^&*',
 '*&*&*',
 ':):):):)::',
 ':+-[]']

In [None]:
# Same tokenization, using a throwaway tokenizer instance.
WordPunctTokenizer().tokenize(text)

In [None]:
# Word tokenizer; the 'punkt' package is downloaded first.
nltk.download('punkt')

nltk.word_tokenize(text)

In [44]:
# 2. Normalize letter case (all upper case or all lower case).
# 3. Remove stop words (see the stopwords list; drops words such as "the").
# 4. Stemming: reduce each word to its stem.

# Porter stemmer (conservative)

pst = nltk.PorterStemmer()
pst.stem('wonderful')

'wonder'

In [45]:
# Lancaster stemmer (aggressive) - produces more colliding (homonymous) stems
ls = nltk.LancasterStemmer()
ls.stem('wonderful')

'wond'

In [46]:
# 5. Lemmatization (reduce a word to its dictionary form)
le = nltk.WordNetLemmatizer()
# No part of speech is passed here.
# NOTE(review): presumably the lemmatizer then treats the word as a noun — confirm.
le.lemmatize('wonderful')

'wonderful'

In [47]:
# Additionally: part-of-speech tagging
nltk.download('averaged_perceptron_tagger')

nltk.pos_tag(['beautiful', 'word'])  # JJ = adjective, NN = noun

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('beautiful', 'JJ'), ('word', 'NN')]

index.html 파일에서(불용어 제외) 가장 많이 등장한 단어 원형 찾아보기

In [50]:
## Collect every hyperlink from www.networksciencelab.com


# Download and parse the page; the response is closed on leaving the block.
with urlopen('http://www.networksciencelab.com/') as urlo:
    soup4 = BeautifulSoup(urlo)

# Build [link text, target URL] pairs for the <a> tags that define an href.
links = [
    [tag.string, tag['href']]
    for tag in soup4('a')
    if 'href' in tag.attrs
]


In [66]:
# Keep only the link text (the first element of each [text, href] pair).
html_text = [pair[0] for pair in links]

In [67]:
# Inspect the extracted link texts (image-only links show up as None).
print(html_text)

[None, None, 'DZPYDS', 'DZCNAPY', 'Networks of Music Groups as Success Predictors', 'Network Science Workshop', 'Resilience in Transaction-Oriented Networks', 'Peer Ratings in Massive Online Social Networks', 'Semantic Networks of Interests in Online NSSI Communities', 'Towards an Ideal Store', 'D.Zinoviev, "Analyzing Cultural Domains with Python,"', 'D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities,"', 'D.Zinoviev, "The Pain of Complexity,"', 'D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks,"', 'D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems,"', 'D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks,"', 'D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities,"', 'D.Zinoviev and S.Llew

In [65]:
# Drop the None entries and join the remaining link texts into a single
# space-separated string. Filtering on the value, not on a fixed slice, keeps
# this correct if the number or position of None entries changes.
html_text = ' '.join(label for label in html_text if label is not None)
print(html_text)

DZPYDS DZCNAPY Networks of Music Groups as Success Predictors Network Science Workshop Resilience in Transaction-Oriented Networks Peer Ratings in Massive Online Social Networks Semantic Networks of Interests in Online NSSI Communities Towards an Ideal Store D.Zinoviev, "Analyzing Cultural Domains with Python," D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities," D.Zinoviev, "The Pain of Complexity," D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks," D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities," D.Zinoviev and S.Llewelyn, "Co-Evolution of Friendship and Publishing in Online 

In [57]:
# Save the cleaned-up text to index.html. The context manager closes (and
# flushes) the file even if the write raises.
with open('index.html', 'w') as html_file:
    html_file.write(html_text)

index.html 파일에서(불용어 제외) 가장 많이 등장한 단어 원형 찾아보기

In [None]:
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk import LancasterStemmer
# Download the stop-word lists read through nltk.corpus.stopwords.
nltk.download('stopwords')

In [86]:
# Create the stemmer.
# NOTE(review): the name is `Is` (capital i), easily confused with the
# lowercase `ls` stemmer defined in an earlier cell.
Is = nltk.LancasterStemmer()

# Read the file and build a soup from it.
# BeautifulSoup is used because HTML is not plain text: the content is wrapped
# in tags such as <head> and <body>.
with open('/content/index.html') as file:
  soup = BeautifulSoup(file)

In [96]:
# soup1 = BeautifulSoup(open('/content/index.html'))
# print(soup1)

<html><body><p>DZPYDS DZCNAPY Networks of Music Groups as Success Predictors Network Science Workshop Resilience in Transaction-Oriented Networks Peer Ratings in Massive Online Social Networks Semantic Networks of Interests in Online NSSI Communities Towards an Ideal Store D.Zinoviev, "Analyzing Cultural Domains with Python," D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities," D.Zinoviev, "The Pain of Complexity," D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks," D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities," D.Zinoviev and S.Llewelyn, "Co-Evolution of Friendship and Publis

In [87]:
# Display the soup parsed from index.html.
soup

<html><body><p>DZPYDS DZCNAPY Networks of Music Groups as Success Predictors Network Science Workshop Resilience in Transaction-Oriented Networks Peer Ratings in Massive Online Social Networks Semantic Networks of Interests in Online NSSI Communities Towards an Ideal Store D.Zinoviev, "Analyzing Cultural Domains with Python," D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities," D.Zinoviev, "The Pain of Complexity," D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks," D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities," D.Zinoviev and S.Llewelyn, "Co-Evolution of Friendship and Publis

In [88]:
soup.text    # the text content with the html/body tags stripped

'DZPYDS DZCNAPY Networks of Music Groups as Success Predictors Network Science Workshop Resilience in Transaction-Oriented Networks Peer Ratings in Massive Online Social Networks Semantic Networks of Interests in Online NSSI Communities Towards an Ideal Store D.Zinoviev, "Analyzing Cultural Domains with Python," D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities," D.Zinoviev, "The Pain of Complexity," D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks," D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities," D.Zinoviev and S.Llewelyn, "Co-Evolution of Friendship and Publishing in Online

In [None]:
# 1. Extract the text and split it into tokens.
words = nltk.word_tokenize(soup.text)
words

In [None]:
# 2. Convert every token to lower case.
words = list(map(str.lower, words))
words

In [93]:
# 3. Remove stop words and punctuation tokens, then stem what is left.
# 3.1 stop words: stopwords.words('english')
# 3.2 punctuation: str.isalnum() is True only for tokens made up solely of
#     letters and digits

# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension re-created the list and scanned it linearly for every token.
english_stopwords = set(stopwords.words('english'))

words = [Is.stem(w) for w in words if w not in english_stopwords and w.isalnum()]

In [94]:
# 4. Count the stems and report the ten most frequent ones.
freqs = Counter(words)
top_ten = freqs.most_common(10)
print(top_ten)

[('network', 16), ('soc', 8), ('onlin', 7), ('inform', 4), ('gam', 4), ('sem', 3), ('interest', 3), ('commun', 3), ('theoret', 3), ('approach', 3)]
