## 필요 라이브러리 설치

!pip install BeautifulSoup4

!pip install pandas

## 기본 HTML 파싱

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Download the MuscleWiki exercise directory and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the
# following cells silently work on an error page.
res = requests.get('https://musclewiki.com/Directory', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

# Every <tr> element in the page; the extraction cell iterates over these rows.
lst = soup.find_all('tr')

## 필요 데이터 추출

In [32]:
# Walk the table rows and collect one record per exercise into five parallel
# lists. A row whose text contains 'Exercises' is treated as a muscle-group
# header; every other row is recorded as an exercise of the latest header.
muscle = []
exercise = []
equipment = []
difficulty = []
link = []
name = ""

for item in lst:
    row_text = item.get_text()
    # Strip each line before filtering so whitespace-only lines never count as
    # a cell (they would otherwise shift the txt[-1] / txt[-2] positions).
    txt = [cell.strip() for cell in row_text.split('\n')]
    txt = [cell for cell in txt if cell]

    if 'Exercises' in row_text:
        # Drop the last 10 characters of the header text, i.e. the length of
        # ' Exercises' (presumably the header reads '<muscle> Exercises').
        name = txt[0][:-10]
        continue

    anchor = item.find('a')
    if len(txt) < 2 or anchor is None or not anchor.get('href'):
        # Incomplete row (no text cells or no hyperlink): skip it as a whole
        # so the five lists stay the same length.
        continue

    muscle.append(name)
    exercise.append(txt[0])
    equipment.append(txt[-2])
    difficulty.append(txt[-1])
    link.append('https://musclewiki.com' + anchor['href'])

## 데이터프레임 생성

In [38]:
# Assemble the scraped lists into a table: each list starts out as one row
# labelled with its field name, and the transpose turns those rows into columns.
field_names = ['muscle', 'exercise', 'equipment', 'difficulty', 'link']
field_values = [muscle, exercise, equipment, difficulty, link]
df = pd.DataFrame(field_values, index=field_names).T

In [84]:
# Display the distinct equipment labels collected from the directory rows.
distinct_equipment = set(equipment)
distinct_equipment

{'Band',
 'Barbell',
 'Bodyweight',
 'Cables',
 'Dumbbells',
 'Kettlebells',
 'Machine',
 'Stretches'}

## 데이터프레임에 이미지 및 운동 방법 추가

In [75]:
# Enrich every exercise row with demo media (image1/image2, video1/video2) and
# numbered instructions ('step') scraped from the exercise's own page.
NO_DATA = 'no data'
MEDIA_SITE = 'https://www.musclewiki.com'
ENRICHED_COLUMNS = ('image1', 'image2', 'video1', 'video2', 'step')
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever


def _extract_media(page):
    """Return the image/video column values for one parsed exercise page.

    Args:
        page: BeautifulSoup document of the exercise page.

    Returns:
        dict with keys 'image1', 'image2', 'video1', 'video2'. Each value is a
        URL string, or 'no data' when the page does not provide that item.
    """
    media = {'image1': NO_DATA, 'image2': NO_DATA,
             'video1': NO_DATA, 'video2': NO_DATA}
    grid = page.find(class_='exercise-images-grid')
    if grid is None:
        # find() returns None when the page has no such element.
        return media
    # Only the first two anchors are stored; any further ones are ignored
    # rather than causing the second one to be dropped.
    for slot, anchor in enumerate(grid.find_all('a')[:2], start=1):
        img = anchor.find('img')
        if img is not None and img.get('src'):
            media['image' + str(slot)] = MEDIA_SITE + img['src']
        if anchor.get('href'):
            media['video' + str(slot)] = anchor['href']
    return media


def _extract_steps(page):
    """Return the page's instructions as '1. ...' lines joined by newlines.

    Args:
        page: BeautifulSoup document of the exercise page.

    Returns:
        The numbered steps as a single string, or 'no data' when the page has
        no 'steps-list' element or that element contains no text.
    """
    steps_node = page.find(class_='steps-list')
    if steps_node is None:
        return NO_DATA
    # Strip before filtering so whitespace-only lines do not become numbered
    # but empty steps.
    lines = [line.strip() for line in steps_node.get_text().split('\n')]
    lines = [line for line in lines if line]
    numbered = [str(n) + '. ' + line for n, line in enumerate(lines, start=1)]
    return '\n'.join(numbered) or NO_DATA


for i in df.index:
    # A separate name keeps the module-level list `link` (used to build df)
    # from being overwritten by a single URL string.
    page_url = df.loc[i, 'link']
    record = dict.fromkeys(ENRICHED_COLUMNS, NO_DATA)
    try:
        res2 = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable page should not abort the whole crawl; the row keeps
        # its 'no data' defaults and the failure is reported.
        print('request failed:', page_url, '-', err)
    else:
        # Judge "page missing" by the HTTP status instead of searching the
        # page text for '404'. A page that answers 200 but lacks the expected
        # elements still ends up as 'no data' via the helpers above.
        if res2.ok:
            soup2 = BeautifulSoup(res2.content, 'html.parser')
            record.update(_extract_media(soup2))
            record['step'] = _extract_steps(soup2)

    for column, value in record.items():
        df.loc[i, column] = value

## 데이터 한국어로 번역
- googletrans 라이브러리 사용

!pip install googletrans==4.0.0-rc1

In [81]:
import googletrans
import time

# Single Translator instance, reused for every translate() call in the loop below.
translator = googletrans.Translator()

In [82]:
import os

# Translate each row's English 'step' text to Korean in place, then write the
# whole table to CSV.
OUTPUT_PATH = './data/exercise_data.csv'

for i in df.index:
    step = df.loc[i, 'step']
    # Nothing to translate for placeholder rows or non-text cells (e.g. NaN),
    # so do not spend the delay or a request on them.
    if not isinstance(step, str) or step == 'no data':
        continue

    # Pause before each request (presumably to avoid hammering the service).
    time.sleep(0.5)
    try:
        trans = translator.translate(step, dest='ko', src='en')
    except Exception as err:
        # The exception types googletrans may raise are not visible here, so
        # catch broadly: report the row and keep its English text rather than
        # losing all progress before the CSV is written.
        print('translation failed for row', i, '-', err)
        continue
    df.loc[i, 'step'] = trans.text

# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
# 'utf-8-sig' prepends a byte-order mark to the UTF-8 output.
df.to_csv(OUTPUT_PATH, encoding='utf-8-sig', index=False)