## Collect Book Data
- Export database
- Load raw data
- Convert ISBN-13 to ISBN-10
- Retrieve book descriptions
- Clean text
- Tokenize Japanese text
- Save

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from janome.tokenizer import Tokenizer

## Load database export (raw data)

In [88]:
# Load the tab-separated, bz2-compressed database export.
# ISBNs are identifiers, not numbers: force `isbn_raw` to str so that
# numeric-looking values are never inferred as integers (which would drop
# leading zeros and break the later `len(isbn)` check in convert_isbn).
df = pd.read_csv(
    './data/tosho_raw.csv.bz2',
    sep='\t',
    compression='bz2',
    dtype={'isbn_raw': str},
)

In [69]:
# Preview the first ten rows of the raw export.
df.head(n=10)

Unnamed: 0,id,category,title,isbn_raw
0,1000041,A,魚,9784092172043
1,1000042,A,両生類　はちゅう類,9784092172067
2,1000044,A,動物の図鑑,409217005X
3,1000045,A,航空,4092170122
4,1000046,A,船,4092170173
5,1000047,A,船,4092170173
6,1000048,A,世界　人と国とくらし,4092170203
7,1000049,A,水の生物,9784092172074
8,1000050,A,宇宙,9784092172098
9,1000051,A,海外帰国者のためのスクールガイド,9784808086237


## Convert ISBN-13 to ISBN-10 (the form used in the Amazon product URL)

In [62]:
def modulus11weight10to2(c9):
    """Return the modulus-11 check character for a string of digits.

    The character at position ``i`` (0-based) is weighted by ``10 - i``, so
    a 9-character input uses the weights 10 down to 2.

    Returns:
        '0' when the computed value is 11, 'X' when it is 10, otherwise the
        value as a one-character string. Returns ``False`` as soon as a
        character cannot be converted with ``int()``.
    """
    weighted_total = 0
    for position, char in enumerate(c9):
        try:
            digit = int(char)
        except ValueError:
            # Non-numeric character: signal failure to the caller.
            return False
        weighted_total += (10 - position) * digit

    check_value = 11 - (weighted_total % 11)

    # 11 and 10 have dedicated characters; everything else maps to itself.
    special_cases = {11: '0', 10: 'X'}
    return special_cases.get(check_value, str(check_value))
    
def encode13to10(isbn13):
    """Build an ISBN-10 string from a 13-character ISBN.

    Drops the first three characters and the final (check) character, then
    appends the check character computed by ``modulus11weight10to2``.

    NOTE(review): the leading three characters are discarded without being
    inspected, and a ``False`` result from ``modulus11weight10to2``
    (non-digit input) makes the concatenation raise ``TypeError`` -- confirm
    whether callers need either case handled.
    """
    body = isbn13[3:-1]
    return body + modulus11weight10to2(body)

def convert_isbn(isbn):
    """Return ``isbn`` as ISBN-10.

    Values whose length is exactly 13 are converted with ``encode13to10``;
    anything else is returned unchanged.
    """
    if len(isbn) != 13:
        return isbn
    return encode13to10(isbn)

In [151]:
# Normalise every raw ISBN (13-character values become ISBN-10).
df['isbn_processed'] = df['isbn_raw'].apply(convert_isbn)

In [152]:
# Preview the first ten rows, now including the isbn_processed column.
df.head(n=10)

Unnamed: 0,id,category,title,isbn_raw,isbn_processed
0,1000041,A,魚,9784092172043,4092172044
1,1000042,A,両生類　はちゅう類,9784092172067,4092172060
2,1000044,A,動物の図鑑,409217005X,409217005X
3,1000045,A,航空,4092170122,4092170122
4,1000046,A,船,4092170173,4092170173
5,1000047,A,船,4092170173,4092170173
6,1000048,A,世界　人と国とくらし,4092170203,4092170203
7,1000049,A,水の生物,9784092172074,4092172079
8,1000050,A,宇宙,9784092172098,4092172095
9,1000051,A,海外帰国者のためのスクールガイド,9784808086237,4808086239


## Retrieve book descriptions

In [122]:
def get_description(isbn, description, timeout=10):
    """Return a book description, fetching it from amazon.co.jp if missing.

    Args:
        isbn: identifier appended to ``https://www.amazon.co.jp/dp/``.
        description: description already known for this book; any value
            other than the empty string is returned unchanged and no
            request is made.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The existing description, the text of the page element with
        ``id="productDescription"``, or ``''`` when the page could not be
        fetched/parsed or contains no such element.
    """
    # if there is already a description, skip
    if description != '':
        return description

    try:
        r = requests.get(
            'https://www.amazon.co.jp/dp/' + isbn,
            headers={"User-Agent": "Defined"},
            # Without a timeout a single stalled connection blocks the
            # whole DataFrame.apply run indefinitely.
            timeout=timeout,
        )
        soup = BeautifulSoup(r.text, 'lxml')
        node = soup.find(id="productDescription")
        if node is None:
            # Page has no description block.
            return ''
        description = node.get_text()
        print('success')
    except Exception:
        # Best-effort scrape: any per-book failure yields an empty
        # description. `Exception` (not a bare except) so Ctrl-C /
        # SystemExit still stop a long-running loop.
        return ''

    return description

In [227]:
# Fill in missing descriptions row by row. Columns are accessed by label:
# integer keys on a label-indexed row Series (x[0], x[1]) rely on positional
# fallback, which pandas has deprecated.
df['description'] = df[['isbn_processed', 'description']].apply(
    lambda row: get_description(row['isbn_processed'], row['description']),
    axis=1,
)

success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success


## Clean descriptions

In [178]:
def clean_text(text):
    """Reduce a scraped description to Japanese text, digits and hyphens.

    Steps, in order:
      1. remove every whitespace character (newlines included);
      2. delete a fixed set of heading/boilerplate phrases;
      3. replace every character that is not an ASCII digit, '-',
         hiragana/katakana (ぁ-ヶ), the long-vowel mark 'ー' or a CJK
         ideograph with a single space;
      4. strip leading and trailing whitespace.

    Args:
        text: raw description string.

    Returns:
        The cleaned string (possibly empty).
    """
    # `\s` already matches '\n', so one pass removes all whitespace.
    text = re.sub(r'\s+', '', text)

    # Fixed phrases removed verbatim (presumably section headings of the
    # scraped product page -- verify against the page source).
    boilerplate = (
        '内容紹介',
        '出版社からのコメント',
        '商品の説明をすべて表示する',
        '内容（「MARC」データベースより）',
        '内容（「BOOK」データベースより）',
    )
    for phrase in boilerplate:
        text = text.replace(phrase, '')

    # Kanji are matched with the whole CJK Unified Ideographs block
    # (U+4E00-U+9FFF). The previous range 亜-黑 (U+4E9C-U+9ED1) left out
    # common kanji such as 一, 三, 上, 下, 中, 世, 事 and 黒, which were
    # therefore blanked out.
    non_japanese = re.compile(r"[^0-9\-ぁ-ヶ\u4e00-\u9fffー]")
    text = non_japanese.sub(' ', text)

    return text.strip()

In [229]:
# Clean every scraped description.
df['description_clean'] = df['description'].apply(clean_text)

## Tokenize Japanese text

In [167]:
# Module-level Janome tokenizer instance used by wakati_reading.
j_tokenizer = Tokenizer()


def wakati_reading(text):
    """Return ``text`` segmented into space-separated tokens.

    Single quotes are removed and the text is lower-cased before
    tokenizing. Tokens whose first comma-separated ``part_of_speech`` field
    is '助動詞' are left out. The surfaces of the remaining tokens are joined
    with single spaces, the result is stripped, and runs of two or more
    whitespace characters are collapsed to one space.
    """
    skipped_pos = [u'助動詞']

    normalized = text.replace("'", "").lower()

    # Word segmentation: keep the surface form of each accepted token.
    surfaces = [
        tok.surface
        for tok in j_tokenizer.tokenize(normalized)
        if tok.part_of_speech.split(',')[0] not in skipped_pos
    ]

    joined = " ".join(surfaces).strip()
    return re.sub(r'\s{2,}', ' ', joined)

In [230]:
# Tokenize every cleaned description into space-separated words.
df['description_token'] = df['description_clean'].apply(wakati_reading)

## Save

In [231]:
# Drop rows that are identical in every column (in place), then write the
# result as a tab-separated, bz2-compressed UTF-8 file without the index.
df.drop_duplicates(inplace=True)
df.to_csv(
    './data/tosho_processed_clean.csv.bz2',
    sep='\t',
    index=False,
    encoding='utf-8',
    compression='bz2',
)