# Project Objective
한국에서 출판된 프로그래밍 언어 책의 정보를 수집·분석하여, 국내 출판 도서 기준으로 가장 인기 있는 언어를 확인<br>

[Task 1] <br> 
10개 언어에 대한 출판 책 정보 수집
* Python(파이썬), C, Java, C++, C#, Visual Basic, JavaScript, SQL, PHP, R<br>
* 수집 데이터: 책 이름, 저자, 출판사, 출판일, ISBN + (페이지수,가격)

[Task 2] <br> 
수집데이터를 ISBN 정보로 unique 검사하여 중복데이터 처리<br>

[Task 3] <br>
각 언어별 출판물의 양 시각화 및 판매 순위 정리<br>

[Task 4] <br>
수집된 정보를 기반으로 우리나라의 컴퓨터 언어 관련 출판사 순위를 확인<br>

[Task 5] <br>
수집된 정보에서 출판일 기준, 최근 2년간(20년, 21년) 데이터와 그 전 3년(17년, 18년, 19년) 데이터 비교<br>
특이사항 정리 (특정 언어 변화 감지시 강조)<br>

In [25]:
%pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.8.1.tar.gz (220 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.7/220.7 KB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wordcloud
  Building wheel for wordcloud (setup.py) ... [?25ldone
[?25h  Created wheel for wordcloud: filename=wordcloud-1.8.1-cp39-cp39-macosx_10_9_x86_64.whl size=154045 sha256=8a8401c4f1621fe21b92502bde8849e7217e5ab2731bc912bb26ff3608501559
  Stored in directory: /Users/mac/Library/Caches/pip/wheels/f9/7a/dd/06ef8b5dfe5483f6204133c08eeb16c287cc2c05e290ae2fc0
Successfully built wordcloud
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.1
Note: you may need to restart the kernel to use updated packages.


In [34]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import time
import matplotlib.pyplot as plt
import set_matplotlib_kor
from wordcloud import WordCloud

KOR setup completed in your MAC!!


In [2]:
# Search keywords, grouped by language family.
# Order matters: save_search() keeps only the first occurrence of each ISBN and
# labels it with the keyword that found it, so the more specific keywords
# ('C++', 'C#', 'JavaScript') are listed before the shorter ones ('C', 'Java').
keywords_p = ['파이선', '파이썬', 'python']
keywords_c = ['C++', 'C#', 'C']
keywords_java = ['JavaScript', '자바스크립트', 'Java', '자바']
keywords_vb = ['VisualBasic', '비주얼베이직']
keywords_else = ['SQL', 'R', 'PHP']

# Flat list of every keyword, in the order the searches are issued.
search_keywords = [
    *keywords_p,
    *keywords_c,
    *keywords_java,
    *keywords_vb,
    *keywords_else,
]

In [3]:
# Set of ISBN strings already collected; used for duplicate detection.
# Read by unique_isbn() and added to by save_search(), so it accumulates
# across every search issued in this session.
u_isbn = set()

In [4]:
# Naver API credentials, sent as the X-Naver-Client-Id / X-Naver-Client-Secret
# request headers by the collection loop.
# NOTE(security): the literal fallbacks below are committed to source and should
# be treated as leaked -- rotate them and supply the real values through the
# NAVER_CLIENT_ID / NAVER_CLIENT_SECRET environment variables instead.
import os

client_id = os.environ.get('NAVER_CLIENT_ID', 'Dub7X0a1WdXUiR4wr7g3')
client_secret = os.environ.get('NAVER_CLIENT_SECRET', 'y26kuyyXPQ')

In [5]:
# search parameters
# NOTE(review): the collection loop in this notebook hard-codes
# range(1,1000,100) and a display value of 100 rather than reading
# display / start_i / end_i, and idx / query are not referenced in the
# visible cells -- confirm whether they are still needed.
display = 100
start_i = 1
end_i = 1000
idx = 0
query = 'R'
# First year of each search window: 1980, 1986, ..., 2016.
# The collection loop searches from <year>0101 through <year+5>1231.
years = range(1980,2021,6)

In [6]:
# df base
# Column names for the final DataFrame; they match the keys of the dicts
# built in save_search().
cols = ["lang","title","author","publisher","pubdate","isbn","price","link"]
# Accumulator for the collection loop: each element appended is the list of
# book dicts returned by one save_search() call (i.e. a list of lists).
rows = []

## Functions

In [7]:
#generate search url
def gen_search_url(search_text, start_num, disp_num, start_y, end_y):
    """Build the Naver book advanced-search (XML) request URL.

    Args:
        search_text: Keyword sent as the ``d_titl`` parameter; it is
            percent-encoded, so characters such as ``+`` and ``#`` are safe.
        start_num: Value of the ``start`` parameter (result offset).
        disp_num: Value of the ``display`` parameter (results per page).
        start_y: Value of ``d_dafr`` (range start). The caller passes
            ``YYYYMMDD`` strings; ints are accepted as well.
        end_y: Value of ``d_dato`` (range end), same format as ``start_y``.

    Returns:
        The full request URL as a string.
    """
    base = "https://openapi.naver.com/v1/search/book_adv.xml"
    # d_catg=280 is a fixed category filter
    # (presumably the computer/IT category -- confirm against the Naver docs).
    param_query = "?d_titl=" + urllib.parse.quote(search_text) + "&d_catg=280"
    # str() keeps the date handling consistent with start/display, so numeric
    # dates no longer raise TypeError on concatenation.
    param_date = "&d_dafr=" + str(start_y) + "&d_dato=" + str(end_y)
    param_start = "&start=" + str(start_num)
    param_disp = "&display=" + str(disp_num)

    return base + param_query + param_date + param_start + param_disp

In [8]:
# delete tags ("<b>,</b>") in title (search results)
def delete_tag(input_str):
    """Return *input_str* with every literal ``<b>`` and ``</b>`` removed.

    The opening marker is stripped first, then the closing one.
    """
    cleaned = input_str
    for marker in ("<b>", "</b>"):
        cleaned = cleaned.replace(marker, "")
    return cleaned

In [9]:
# check unique isbn
def unique_isbn(isbn):
    """Return True when *isbn* is not yet in the module-level ``u_isbn`` set.

    The function only reads ``u_isbn``; recording the ISBN is left to the caller.
    """
    return isbn not in u_isbn


In [10]:
# select search language
def search_lang(search):
    """Map a search keyword to the language label stored in the ``lang`` column.

    Keywords from ``keywords_p`` map to 'Python' and those from ``keywords_vb``
    to 'VisualBasic'. Keywords from ``keywords_java`` are split by length into
    'JavaScript' and 'Java'. Any other keyword is returned unchanged.
    """
    if search in set(keywords_p):
        return 'Python'
    if search in set(keywords_vb):
        return 'VisualBasic'
    if search in set(keywords_java):
        # 'JavaScript' and '자바스크립트' are longer than 5 characters;
        # 'Java' and '자바' are not.
        return 'JavaScript' if len(search) > 5 else 'Java'
    return search
    

In [11]:
# save search items 
def save_search(items,key):
    """Convert search-result items into row dicts, skipping known ISBNs.

    Args:
        items: Iterable of parsed result elements; each must support
            ``.find(name).text`` for title, author, publisher, pubdate, isbn,
            price and link.
        key: Language label stored in the ``lang`` field of every row.

    Returns:
        A list of dicts (one per newly seen ISBN) keyed by lang, title, author,
        publisher, pubdate, isbn, price and link.

    Side effects:
        Adds every accepted ISBN to the module-level ``u_isbn`` set.
    """
    collected = []
    for item in items:
        # All fields are extracted before the duplicate check, so an item that
        # lacks any of these tags raises AttributeError even when its ISBN is
        # already known.
        # NOTE(review): when the caller parses with 'html.parser', <link> may be
        # treated as a void HTML element, so its .text may come back empty --
        # verify against real responses.
        record = {
            "lang": key,
            "title": delete_tag(item.find("title").text),
            "author": item.find("author").text,
            "publisher": item.find("publisher").text,
            "pubdate": item.find("pubdate").text,
            "isbn": item.find("isbn").text,
            "price": item.find("price").text,
            "link": item.find("link").text,
        }

        # Keep only the first occurrence of each ISBN.
        if not unique_isbn(record["isbn"]):
            continue
        u_isbn.add(record["isbn"])
        collected.append(record)

    return collected

# Collect Data

In [12]:

# Query the API for every keyword and date window, page by page, and append
# each page's newly seen books (a list of dicts) to `rows`.
for search in search_keywords:
    key = search_lang(search)

    # Search in 6-year windows: the Naver API limits the search start
    # position (max 1,000), so the results are split by publication date.
    for year in years:
        start_y = str(year) + "0101"
        end_y = str(year + 5) + "1231"

        for n in range(1, 1000, 100):
            request = urllib.request.Request(gen_search_url(search, n, 100, start_y, end_y))
            request.add_header('X-Naver-Client-Id', client_id)
            request.add_header('X-Naver-Client-Secret', client_secret)

            # The context manager closes the HTTP response on every path.
            # urlopen() itself raises for HTTP error statuses; that still
            # propagates unchanged.
            with urllib.request.urlopen(request) as response:
                rescode = response.getcode()
                result = response.read().decode('utf-8') if rescode == 200 else None

            # Throttle: one second between consecutive requests.
            time.sleep(1)

            if result is None:
                # Skip this page instead of re-parsing the previous page's
                # body (or hitting an undefined name on the first request).
                print("error")
                continue

            soup = BeautifulSoup(result, 'html.parser')
            items = soup.find_all('item')
            if not items:
                # An empty page means this window is exhausted; do not spend
                # requests on the remaining offsets.
                break

            rows.append(save_search(items, key))



In [13]:
# Flatten the per-page lists collected in `rows` into one list of book dicts.
rows_all = [book for page in rows for book in page]

In [14]:
# Build the table: one row per collected book dict, columns ordered as in `cols`.
books_df = pd.DataFrame(rows_all, columns=cols)

In [15]:
# Print the column dtypes and non-null counts of the collected data.
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7009 entries, 0 to 7008
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lang       7009 non-null   object
 1   title      7009 non-null   object
 2   author     7009 non-null   object
 3   publisher  7009 non-null   object
 4   pubdate    7009 non-null   object
 5   isbn       7009 non-null   object
 6   price      7009 non-null   object
 7   link       7009 non-null   object
dtypes: object(8)
memory usage: 438.2+ KB


In [16]:
# Sanity check: save_search() skips ISBNs it has already seen, so every value
# in the isbn column is expected to be unique.
books_df['isbn'].is_unique

True

In [17]:
# Number of collected books per language label.
# NOTE(review): each ISBN is counted once, under the keyword that returned it
# first, so a language searched later can be under-counted for titles that
# match several keywords.
books_df['lang'].value_counts()

C              1714
C++            1025
Java           1020
Python          874
VisualBasic     651
JavaScript      456
SQL             441
R               372
C#              298
PHP             158
Name: lang, dtype: int64

In [18]:
# Save the collected table as UTF-8 CSV in the working directory, without the index column.
books_df.to_csv('./books.csv',index=False,encoding='utf-8')

# Data Visualisation