## 필요 라이브러리 설치

!pip install BeautifulSoup4

!pip install pandas

!pip install numpy

## 기본 html 파싱

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
# Download the MuscleWiki exercise directory page and parse it.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the directory.
res = requests.get('https://musclewiki.com/Directory', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

# Every <tr> element found in the parsed page.
lst = soup.find_all('tr')

## 필요 데이터 추출

In [3]:
# Parallel lists: muscle[k] is a group name, and exercise[k], equipment[k],
# difficulty[k] and link[k] are the per-exercise values collected for that group.
muscle = []
exercise = []
equipment = []
difficulty = []
link = []

for item in lst:
    row_text = item.get_text()
    # Non-blank lines of the row, already stripped (whitespace-only lines are
    # dropped so they cannot be mistaken for a cell value).
    txt = [part.strip() for part in row_text.split('\n') if part.strip()]
    if not txt:
        # Row without any text: nothing to extract.
        continue

    if 'Exercises' in row_text:
        # Header row: open a new group. The slice drops the last 10 characters
        # of the title -- presumably a trailing " Exercises"; verify against
        # the page markup.
        muscle.append(txt[0][:-10])
        exercise.append([])
        equipment.append([])
        difficulty.append([])
        link.append([])
        continue

    # Data row: it needs an open group, at least two text cells and a link.
    # Rows that do not qualify are skipped instead of aborting the whole run
    # with an IndexError/TypeError.
    anchor = item.find('a')
    if not muscle or len(txt) < 2 or anchor is None or not anchor.get('href'):
        continue

    exercise[-1].append(txt[0])       # first text line
    equipment[-1].append(txt[-2])     # second-to-last text line
    difficulty[-1].append(txt[-1])    # last text line
    link[-1].append('https://musclewiki.com' + anchor['href'])

## 데이터프레임 생성

In [4]:
# One DataFrame per muscle group. The four per-group lists are stacked as rows
# labelled with the field names, then transposed so that each exercise becomes
# a row and each field a column.
column_names = ['exercise', 'equipment', 'difficulty', 'link']

df_lst = [
    pd.DataFrame(
        [exercise[k], equipment[k], difficulty[k], link[k]],
        index=column_names,
    ).T
    for k in range(len(muscle))
]

## 데이터프레임에 이미지 및 운동 방법 추가

In [5]:
# For every exercise row, download its detail page and store the image URL,
# the video URL and a numbered instruction text in new columns
# ('img_link', 'video_link', 'step').
for df in df_lst:
    for i in df.index:
        # Named page_url (not `link`) so the module-level `link` list built
        # during extraction is not overwritten by a single string.
        page_url = df.loc[i, 'link']
        tmp_res = requests.get(page_url, timeout=30)
        tmp_soup = BeautifulSoup(tmp_res.content, 'html.parser')

        image = tmp_soup.find(class_='exercise-images-grid')
        img_tag = image.find('img') if image is not None else None

        # A page whose text contains '404', or that has no image element at
        # all, gets the error marker instead of crashing the whole crawl.
        if '404' in tmp_soup.get_text() or img_tag is None:
            df.loc[i, 'img_link'] = '이미지 주소 오류'
            continue

        df.loc[i, 'img_link'] = 'https://www.musclewiki.com' + img_tag['src']

        # The first <a> inside the image grid is used as the video link.
        video_tag = image.find('a')
        if video_tag is not None:
            df.loc[i, 'video_link'] = video_tag['href']

        step_parse = tmp_soup.find(class_='steps-list')
        if step_parse is not None:
            # One instruction per non-empty line, numbered from 1, each
            # terminated by a newline.
            step_lst = [line for line in step_parse.get_text().split('\n') if line]
            df.loc[i, 'step'] = ''.join(
                f'{number}. {text}\n' for number, text in enumerate(step_lst, start=1)
            )

## 난이도로 구분
- 기구별 구분은 데이터를 불러올 때 SQL문으로 조건 검색하여 처리한다.

In [6]:
# Split every group's table in two: rows whose difficulty is exactly
# 'Beginner', and all remaining rows. beginner[k] and master[k] line up with
# df_lst[k].
beginner, master = [], []

# The loop variable keeps the name `df` because the export cell that follows
# reads it after this loop finishes.
for df in df_lst:
    is_beginner = df['difficulty'] == 'Beginner'
    beginner.append(df.loc[is_beginner])
    master.append(df.loc[~is_beginner])

In [7]:
import os

# Write two CSV files per muscle group into data/:
#   eng_Beginner_<muscle>.csv  and  eng_Master_<muscle>.csv
# to_csv does not create missing directories, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)

# Iterate over the muscle groups themselves. The previous range(len(df)) used
# the row count of whichever DataFrame the earlier loop left in `df`, which
# skipped groups or raised IndexError.
for i, group_name in enumerate(muscle):
    b_filename = "eng_Beginner_" + group_name
    m_filename = "eng_Master_" + group_name

    # beginner[i] / master[i] were already filtered by difficulty when they
    # were built, so no second filter is needed here.
    b_df = beginner[i]
    m_df = master[i]

    b_df.to_csv('data/' + b_filename + '.csv', index=False)
    m_df.to_csv('data/' + m_filename + '.csv', index=False)

## 데이터 한글로 변환
- googletrans 라이브러리 사용