# http://search.ojj.kr

In [2]:
import os
import json
import urllib
import multiprocessing
import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime
from functools import reduce
from itertools import chain

# First and last year to crawl (both inclusive).
START = 2012
END = 2017

# Number of search-result pages to request for each year.
# NOTE(review): 2014 has only 1 page while every other year has 7 or more --
# confirm this is not a typo.
search_page_num = {
    2017: 24,
    2016: 8,
    2015: 12,
    2014: 1,
    2013: 11,
    2012: 7,
}

def get_link(year, pn):
    """Build the search-result URL for page ``pn`` of the given ``year``.

    The ``k`` query parameter is the fixed, percent-encoded UTF-8 search
    keyword; only ``y`` (year) and ``pn`` (page number) vary.
    """
    return f'http://search.ojj.kr/?m=search&k=%ED%81%AC%EB%A6%AC%EC%8A%A4%EB%A7%88%EC%8A%A4&y={year}&pn={pn}'



# Every URL to fetch: for each year from START through END, one link per
# result page, with page numbers running from 1 to search_page_num[year].
crawl_lists = [
    get_link(year, page)
    for year in range(START, END + 1)
    for page in range(1, search_page_num[year] + 1)
]

# Column names for the keyword table, in the same order as the
# (rank, keyword, date) tuples returned by parse_row.
COLUMNS = ['rank', 'keyword', 'date']


def get_dom_from_url(u):
    """Fetch the page at URL ``u`` and return it as a BeautifulSoup tree.

    The response body is decoded as UTF-8 and parsed with the ``html.parser``
    backend. Network and decoding errors raised by ``urlopen`` / ``decode``
    propagate to the caller.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on some other library having
    # imported it as a side effect.
    import urllib.request

    with urllib.request.urlopen(u) as response:
        doc = response.read().decode('utf-8')
    return BeautifulSoup(doc, "html.parser")

def get_search_kw_rows(dom):
    """Return the ``<tr>`` rows of the keyword table in a parsed page.

    The table is looked up as the first ``<table>`` nested inside the page's
    second ``<table>``. Its first row is dropped (presumably a header row --
    verify against the page markup).
    """
    outer_table = dom.find_all('table')[1]
    return outer_table.find('table').find_all('tr')[1:]

def parse_row(row):
    """Extract ``(rank, keyword, date)`` from one result-table row.

    The row's ``<td>`` cells are read as follows: the text of cell 0 is
    converted to an ``int`` rank, the link text in cell 2 is the keyword, and
    the link text in cell 3 is parsed as a ``%Y-%m-%d %H:%M:%S`` timestamp.

    Rows that do not have this shape (for example the pagination row at the
    bottom of the table) are printed and reported as ``(None, None, None)``.
    """
    try:
        cells = row.find_all('td')
        rank = int(cells[0].text)
        keyword = cells[2].find('a').text
        date = datetime.strptime(cells[3].find('a').text, '%Y-%m-%d %H:%M:%S')
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only the failures a malformed row can produce are handled here:
        # too few cells (IndexError), a cell without a link (AttributeError),
        # or text that is not a valid integer / timestamp (ValueError,
        # TypeError). Anything else, including KeyboardInterrupt, propagates.
        print(row)
        return None, None, None
    return rank, keyword, date

def process_url(u):
    """Download the page at URL ``u`` and return its keyword-table rows."""
    return get_search_kw_rows(get_dom_from_url(u))

In [3]:
# Download and parse every listed page before extracting any rows.
doms = [get_dom_from_url(link) for link in crawl_lists]

# Collect one (rank, keyword, date) tuple per table row across all pages;
# rows that parse_row cannot read come back as (None, None, None).
keywords = []
for idx, dom in enumerate(doms):
    print(idx)  # progress: index of the page being processed
    keywords.extend(parse_row(row) for row in get_search_kw_rows(dom))

0
<tr><td align="center" colspan="4"><font size="2"><font color="RED"><b>1</b></font> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=2">2</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=3">3</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=4">4</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=5">5</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=6">6</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=7">7</a> 
</font></td></tr>
1
<tr><td align="center" colspan="4"><font size="2"><a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=1">1</a> 
<font color="RED"><b>2</b></font> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=3">3</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=4">4</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2012&amp;pn=5">5</a> 
<a class="gray_sma

<tr><td align="center" colspan="4"><font size="2"><a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=1">1</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=2">2</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=3">3</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=4">4</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=5">5</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=6">6</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=7">7</a> 
<font color="RED"><b>8</b></font> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=9">9</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=10">10</a> 
<a href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=11">▶</a>... 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2013&amp;pn=11">11</a>
</font></td></tr>
15
<tr><td align="center" colspan="4"><f

<tr><td align="center" colspan="4"><font size="2"><a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=1">1</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=2">2</a> 
<font color="RED"><b>3</b></font> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=4">4</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=5">5</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=6">6</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=7">7</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=8">8</a> 
</font></td></tr>
34
<tr><td align="center" colspan="4"><font size="2"><a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=1">1</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=2">2</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2016&amp;pn=3">3</a> 
<font color="RED"><b>4</b></font> 
<a class="gray_smal

<tr><td align="center" colspan="4"><font size="2"><a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=1">1</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=2">2</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=3">3</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=4">4</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=5">5</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=6">6</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=7">7</a> 
<font color="RED"><b>8</b></font> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=9">9</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=10">10</a> 
<a href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=11">▶</a>... 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=24">24</a>
</font></td></tr>
47
<tr><td align="center" colspan="4"><f

</font></td></tr>
57
<tr><td align="center" colspan="4"><font size="2"><a href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=1">1</a>...
<a href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=1">◀</a>
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=11">11</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=12">12</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=13">13</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=14">14</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=15">15</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=16">16</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=17">17</a> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=18">18</a> 
<font color="RED"><b>19</b></font> 
<a class="gray_small" href="/?m=search&amp;k=크리스마스&amp;y=2017&amp;pn=20">20</a> 
<a href="/?m=search&amp;k=크리스마스&amp;y=2017&

In [4]:
# Tabulate the collected tuples, one row per tuple, using the COLUMNS names.
keyword_df = pd.DataFrame(data=keywords, columns=COLUMNS)

In [5]:
# Discard every row that contains a missing value.
keyword_df = keyword_df.dropna(axis='index')

In [8]:
# Write the cleaned table to disk. The target directory is created first
# because to_csv does not create missing parent directories.
csv_path = './gendata/keyword/list.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
keyword_df.to_csv(csv_path, index_label=False)

# blog