### Import Files

In [1]:
import requests
import lxml.html
import sqlite3 as sq3
from pandas.io import sql
import os
import re
import string
import pandas as pd
from tabulate import tabulate
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
import pyperclip as pc

## get_urls()
- gets the list of urls to detail pages

In [2]:
HTML_Africa_ME = 'https://www.emerics.org:446/issue.es?currentPage={}&pageCnt=50&search_area=2&mid=a10200000000&systemcode=05&search_region=&search_option=ALL&search_year=&search_month=&search_keyword='
HTML_Russia_Asia = 'https://www.emerics.org:446/issue.es?currentPage={}&pageCnt=50&search_area=2&mid=a10200000000&systemcode=04&search_region=&'\
                    'search_option=ALL&search_year=&search_month=&search_keyword='
HTML_Latin_America = 'https://www.emerics.org:446/issue.es?currentPage={}&pageCnt=50&search_area=2&mid=a10200000000&systemcode=06&search_region=&'\
                      'search_option=ALL&search_year=&search_month=&search_keyword='
HTML_CE_Europe = 'https://www.emerics.org:446/issue.es?currentPage={}&pageCnt=50&search_area=2&mid=a10200000000&systemcode=07&search_region=&search_option=ALL&search_year=&search_month=&search_keyword='

In [3]:
def get_urls(html_link):
    page = 1
    url_list = []
    max_page = 1
    
    #find the maximum number of pages
    driver.get(html_link.format(page))
    get_total_articles = driver.find_element('xpath','//span[@class="floatL count"]').text
    page1 = get_total_articles.split(' ')[1]
    page2 = int(page1.split('개')[0])
    if (page2%50 == 0):
        max_page = int(page2/50)
    else:
        max_page = int(page2/50) + 1
    
    while(max_page >= page):
        driver.get(html_link.format(page))
        time.sleep(0.5)
        
        li = driver.find_elements('xpath','//*[@id="content_detail"]/div[3]/ul/li')
        
        for each in li:
            a = each.find_element('xpath','div[2]/p[1]/a')
            url = a.get_attribute('href')
            url_list.append(url)

        if (len(url_list) >=0 & len(url_list)%50 == 0):
            page += 1

    return url_list

## Save URL list into a file

In [4]:
def saveURLs(url_list, file_name):
    # open file in write mode
    with open(file_name, 'w') as fp:
        for url in url_list:
            # write each item on a new line
            fp.write("%s\n" % url)
        print('URLs saved!')

In [5]:
def readURLs(file_name):
    # empty list to read list from a file
    urls = []
    # open file and read the content in a list
    with open(file_name, 'r') as fp:
        for line in fp:
            # remove linebreak from a current name
            # linebreak is the last character of each line
            x = line[:-1]

            # add current item to the list
            urls.append(x)
    return(urls)

## remove_punc()
- removes punctuations from string

In [6]:
def remove_punc(data):

    punc = '[!"#$%&\'()*+,-./:;<=>?[\]^_`{|}~“”·「」△《》◦•◦ㆍ‘’○❍□☐■※✔️▷①②③④【】🎡👑🔒💣🛡️🔻🌌🔥🚢🔑👀…▶ㅇ『∙』·-“”▲I]'
    new_string = re.sub(punc, '', data) # 특수문자 제거
    new_string2 = re.sub('\n', ' ', new_string) # newline 제거
    new_string3 = re.sub('\\s+', ' ', new_string2) # multiple spaces 제거
    return new_string3

## db_save()
- saves data as db file

In [7]:
def db_save(ARTICLE_LIST, db_name, table_name):
    with sq3.connect(os.path.join('.',db_name)) as con: # sqlite DB 파일이 존재하지 않는 경우 파일생성
        try:
            ARTICLE_LIST.to_sql(name = table_name, con = con, index = False, if_exists='replace') 
            #if_exists : {'fail', 'replace', 'append'} default : fail
        except Exception as e:
            print(str(e))
        print(len(ARTICLE_LIST), '건 저장완료..')

## db_select()
- read data from db file

In [8]:
def db_select(db_name, table_name):
    with sq3.connect(db_name) as con: 
        try:
            query = 'SELECT * FROM {}'.format(table_name)
            df = pd.read_sql(query, con = con)
        except Exception as e:
            print(str(e)) 
        return df

In [9]:
def db_delete(db_name, table_name):
    with sq3.connect(db_name) as con: 
        try:
            cur = con.cursor()
            sql = 'DELETE FROM {}'.format(table_name)
            cur.execute(sql)
        except Exception as e:
            print(str(e))

## get_info()
- returns detailed information on the article pages as list

In [10]:
def get_info(url):
    details = []
    articles = []
    articleString = ""
    
    driver.get(url)
    title = driver.find_element('xpath','//*[@id="content_detail"]/div[1]/div/div/h2').text
    country = driver.find_element('xpath','//*[@id="content_detail"]/div[1]/div/div/p[2]/span[1]').text
    date = driver.find_element('xpath','//*[@id="content_detail"]/div[1]/div/div/p[2]/span[5]').text
    article = driver.find_element('xpath','//*[@class="view-content"]').text
    #p = article.find_elements('xpath','text')
    
    #for each in p:
    #    articles.append(each)
    #articleString = ' '.join(articles)
    
    driver.implicitly_wait(5)
    
    #words = remove_punc(articleString)
    words = remove_punc(article)
    details.append(title)
    details.append(country)
    details.append(date)
    details.append(words)
    
    return details

## db_save_as_csv()
- saves dataframe as csv

In [11]:
def db_save_as_csv(data):
    data.to_csv("신흥지역정보_종합지식포탈.csv", index=False, encoding='utf-8')

### ---------------------------------------------------------------------------------------------------------------------
# Main
This is the main part which uses functions to read articles and saves the data as csv file
### ---------------------------------------------------------------------------------------------------------------------

In [12]:
options = webdriver.ChromeOptions() 
#options.add_argument("--start-maximized");  # Chrome 브라우저 최대화 설정
#options.add_argument('--headless')  # headless 모드
options.add_argument('--disable-gpu')  # 그래픽 가속 해제 (오류 방지)
#options.add_argument('--mute-audio')  # 음소거모드 적용

#driver = webdriver.Chrome('service = Service(ChromeDriverManager().install())', options = options)
driver = webdriver.Chrome('C:/Users/user/.wdm/drivers/chromedriver/win32/105.0.5195/chromedriver.exe', chrome_options = options) #드라이버경로 지정

## -----------------------------------------------------------------------
# urls for Africa
urls = get_urls(HTML_Africa_ME)
URL_FILE = 'url_list_africa.txt'
TABLE_NAME = 'emerics_Africa'
## -----------------------------------------------------------------------
# urls for Russia & Asia
#urls = get_urls(HTML_Russia_Asia)
#URL_FILE = 'url_list_russia_asia.txt'
#TABLE_NAME = 'emerics_Russia_Asia'
## -----------------------------------------------------------------------
# urls for Latin America
#urls = get_urls(HTML_Latin_America)
#URL_FILE = 'url_list_latin_america.txt'
#TABLE_NAME = 'emerics_Latin_America'
## -----------------------------------------------------------------------
# urls for Central East Europe
#urls = get_urls(HTML_CE_Europe)
#URL_FILE = 'url_list_ce_europe.txt'
#TABLE_NAME = 'emerics_CE_Europe'

saveURLs(urls, URL_FILE)          # save urls to txt file
url_list = readURLs(URL_FILE)     # read urls from txt file

df_list = []

for url in url_list:
    details = get_info(url)
    
    df = pd.DataFrame([{
        "제목": details[0],
        "국가": details[1],
        "날짜": details[2],
        "본문": details[3]
    }])
    
    df_list.append(df)
    driver.implicitly_wait(4)
    time.sleep(1)

ARTICLE_LIST = pd.concat(df_list)

db_save(ARTICLE_LIST, '신흥지역정보_종합지식포탈.db', TABLE_NAME)
db_save_as_csv(ARTICLE_LIST)

driver.close()
driver.quit()

  driver = webdriver.Chrome('C:/Users/user/.wdm/drivers/chromedriver/win32/105.0.5195/chromedriver.exe', chrome_options = options) #드라이버경로 지정
  driver = webdriver.Chrome('C:/Users/user/.wdm/drivers/chromedriver/win32/105.0.5195/chromedriver.exe', chrome_options = options) #드라이버경로 지정


KeyboardInterrupt: 

In [None]:
df1 = db_select('신흥지역정보_종합지식포탈.db', TABLE_NAME)
df1

In [None]:
df1.isnull().any()