In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import time
import matplotlib as plt
%matplotlib inline
import seaborn as sns

In [3]:
# 前処理をするための関数を用意

# 全ての部屋をLDKの形に直す
def identify_floor_plan(floor_plan):
    if floor_plan.find('ワンルーム') > -1:
        floor_plan = '1K'
    if floor_plan.find('+') > -1:
        floor_plan = floor_plan[:floor_plan.find("+")]
        
    return floor_plan
        
#　沿線情報の修正と分割
def get_line_station(line_station):
    
    # J RをJRに変換
    if line_station.find('JR') > -1:
        line_station = line_station.replace('J R', 'JR')
    
    #バスと徒歩の時間の取得
    if line_station.find('バス') > -1:
        bus_time_text = re.search(r'バス(\d+)分', line_station)
        bus_time = bus_time_text.group(1)
        
        if line_station.find('停歩') > -1:
            walk_time_text = re.search(r'停歩(\d+)分', line_station)
            walk_time = walk_time.group(1)
        else:
            walk_time_text = re.search(r'歩(\d+)分', line_station)
            walk_time = walk_time_text.group(1)

    else:
        bus_time = 0
        walk_time_text = re.search(r'徒歩(\d+)分', line_station)
        walk_time = walk_time_text.group(1)
        
    #沿線と駅名の取得
    line = line_station[ : line_station.find('「')]
    
    station = line_station[line_station.find('「') + 1 : line_station.find('」')]
    
    return line, station, bus_time, walk_time


def get_page_count(hit_count):
    # ヒット件数の数値のみ取得
    hit_count = hit_count.strip()
    hit_count = hit_count.replace(',', '')
    hit_count = hit_count.replace('件', '')
    
    # ページの計算
    page_count = divmod(int(hit_count) , 30)
    if page_count[1] == 0:
        page_count = page_count[0]
    else:
        page_count = page_count[0] + 1
    
    return page_count

In [4]:
# 23区都心部のURL(千代田区、中央区、港区、新宿区、文京区、渋谷区)
# 23区東部(台東区、墨田区、江東区、荒川区、足立区、葛飾区、江戸川区)
# 23区南部(品川区、目黒区、大田区、世田谷区)
# 23区西部(中野区、杉並区、練馬区)
# 23区北部(豊島区、北区、板橋区)

url_list = ['https://suumo.jp/jj/bukken/ichiran/JJ010FJ001/?ar=030&bs=011&ta=13&jspIdFlg=patternShikugun&sc=13101&sc=13102&sc=13103&sc=13104&sc=13105&sc=13113&kb=1&kt=9999999&mb=0&mt=9999999&ekTjCd=&ekTjNm=&tj=0&cnb=0&cn=9999999&srch_navi=1&page={}',
            'https://suumo.jp/jj/bukken/ichiran/JJ010FJ001/?ar=030&bs=011&ta=13&jspIdFlg=patternShikugun&sc=13106&sc=13107&sc=13108&sc=13118&sc=13121&sc=13122&sc=13123&kb=1&kt=9999999&mb=0&mt=9999999&ekTjCd=&ekTjNm=&tj=0&cnb=0&cn=9999999&srch_navi=1&page={}',
            'https://suumo.jp/jj/bukken/ichiran/JJ010FJ001/?ar=030&bs=011&ta=13&jspIdFlg=patternShikugun&sc=13109&sc=13110&sc=13111&sc=13112&kb=1&kt=9999999&mb=0&mt=9999999&ekTjCd=&ekTjNm=&tj=0&cnb=0&cn=9999999&srch_navi=1&page={}',
            'https://suumo.jp/jj/bukken/ichiran/JJ010FJ001/?ar=030&bs=011&ta=13&jspIdFlg=patternShikugun&sc=13114&sc=13115&sc=13120&kb=1&kt=9999999&mb=0&mt=9999999&ekTjCd=&ekTjNm=&tj=0&cnb=0&cn=9999999&srch_navi=1&page={}',
            'https://suumo.jp/jj/bukken/ichiran/JJ010FJ001/?ar=030&bs=011&ta=13&jspIdFlg=patternShikugun&sc=13116&sc=13117&sc=13119&kb=1&kt=9999999&mb=0&mt=9999999&ekTjCd=&ekTjNm=&tj=0&cnb=0&cn=9999999&srch_navi=1&page={}']

# 5つのエリアごとに収納するリストを作成
urls = [''] * 5

for i, url_base in enumerate(url_list):
    url = url_base.format(1)    # 1ページ目の取得
    r = requests.get(url)
    soup = BeautifulSoup(r.text)

    # 検索ヒット件数の取得
    hit_count = soup.find('div', class_='pagination_set-hit').text
    r.close()

    # ページ数を取得
    
    # 各urlのページ数計算
    
    page_count = get_page_count(hit_count)
    
    """
    別方法
    
    body = soup.find('body')
    pages = body.find('ol', {'class': 'pagination-parts'}) # ページのナビタグを取得
    pages_text = str(pages)                                    # 文字列に変換
    pages_split0 = pages_text.split('</a></li>\n</ol>')
    pages_split1 = pages_split0[0]
    pages_split2 = pages_split1[-3:]                           #後ろから3文字(ページ数)を取得
    pages_split = int(pages_split2)                            #整数に変換

    """

    #URLを入れるリスト
    urls[i] = []
    #1ページ目を格納
    urls[i].append(url)

    #2ページ目以降を格納
    for j in range(page_count - 1):
        page_num = j + 2
        url_page = url_base.format(page_num)
        urls[i].append(url_page)
        
        

In [None]:
#取得する特徴量
cols = ['price', 'name', 'address', 'ward', 'line_station', 'line', 'station','bus_time',
        'walk_time', 'area', 'balcony', 'floor_plan', 'age', 'renovation', 'reform']

# 5つのテーブルデータを作成するためのリスト作成
df = [''] * 5

for i in range(5):
    
    df[i] = pd.DataFrame(index=[], columns=cols)
    data = {}
    
    for url in urls[i]:
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        r.close
        contents = soup.find_all('div', 'property_unit-content')

        for content in contents:
            details = content.find_all('dl')

            #物件名
            data['name'] = details[0].find('dd').text

            #販売価格
            data['price'] = details[1].find('span').text

            #所在地：住所と区を取り出す
            address = details[2].find('dd').text
            data['address'] = address
            data['ward'] = address.replace('東京都', '').split('区')[0] + '区'

            #沿線・駅：沿線、駅名、徒歩時間、バス時間を取り出す
            line_station = details[3].find('dd').text
            data['line_station'] = line_station

            line, station, bus_time, walk_time = get_line_station(line_station)

            data['line'] = line
            data['station'] = station
            data['bus_time'] = bus_time
            data['walk_time'] = walk_time

            #専有面積
            data['area'] = details[4].find('dd').text

            #間取り
            floor_plan = details[5].find('dd').text
            floor_plan = identify_floor_plan(floor_plan)
            data['floor_plan'] = floor_plan

            #バルコニー
            data['balcony'] = details[6].find('dd').text

            #築年数
            data['age'] = details[7].find('dd').text

            #リノベーション・リフォーム情報の取得
            header = content.find('h2').text

            #リノベーション：ヘッダーに「リノベ」の文字列があるときにFlagとして持つ
            if header.find('リノベ') > -1:
                data['renovation'] = 1
            else:
                data['renovation'] = 0

            #リフォーム：ヘッダーに「リフォーム」の文字列があるときにFlagとして持つ
            if header.find('リフォーム') > -1:
                data['reform'] = 1
            else:
                data['reform'] = 0

            if len(data) >=1:
                df[i] = df[i].append(data, ignore_index=True)
                data = {}

        #スクレイピング時のマナーとして、プログラムを停止する
        time.sleep(1)

    # CSVファイルの出力
        file_name = './property_data_' + str(i) + '.csv'
        df[i].to_csv(f'./{file_name}', index=False)

  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_index=True)
  df[i] = df[i].append(data, ignore_inde

In [6]:
#保存したcsvファイルを読み込み
df0 = pd.read_csv('./property_data_0.csv')
df1 = pd.read_csv('./property_data_1.csv')
df2 = pd.read_csv('./property_data_2.csv')
df3 = pd.read_csv('./property_data_3.csv')
df4 = pd.read_csv('./property_data_4.csv')

#分割していたファイルを結合
df = pd.concat([df0, df1, df2, df3, df4], axis = 0, ignore_index=True)

In [7]:
# データの前処理

# 価格の統一を行う関数を作成
def convert_price(price):
    # 万円 権利権を含むの処理
    if price.find("※") > -1:
        price = price[:price.find('※')]
    
    # ○○万円 ~ ○○万円の処理
    if price.find('〜') > -1:
        price = price[price.find('〜') + 1 : ]
        
        if price.find('億') > -1:
            # 1億円ジャストのような場合の処理
            if price.find('万') == -1:
                price = int(price[:price.find('億')]) * 10000
            # 1億 ○○万円のとき
            else:
                oku = int(price[:price.find('億')]) * 10000
                price = oku + int(price[price.find('億') + 1 : -2])
        else:
            price = int(price[:price.find('万')])
    else:
        if price.find('億') > -1:
            # 1億円ジャストのような場合の処理
            if price.find('万') == -1:
                price = int(price[:price.find('億')]) * 10000
            # 1億 ○○万円のとき
            else:
                oku = int(price[:price.find('億')]) * 10000
                price = oku + int(price[price.find('億') + 1 : -2])
        else:
            price = int(price[:price.find('万')])
        
    return price

In [8]:
# 前処理

# 価格の表示を数値のみで表現
for i, price in enumerate(df['price']):
    df.loc[i,'price'] = convert_price(price)

In [9]:
df.shape[0] - df.count()

price            0
name             0
address          0
ward             0
line_station     0
line             0
station          0
bus_time         0
walk_time       86
area             0
balcony          0
floor_plan       0
age              0
renovation       0
reform           0
dtype: int64

In [18]:
df.shape

(23157, 15)

In [15]:
def get_line_station(line_station):
    
    #バスと徒歩の時間の取得
    if line_station.find('バス') > -1:
        bus_time = line_station[line_station.find('バス') + 2 : line_station.find('分')]
        
        if line_station.find('停歩') > -1:
            walk_time = line_station[line_station.find('停歩') + 2: line_station.find('分')]
        else:
            walk_time = line_station[line_station.find('歩') + 1 : line_station.find('分')]

    else:
        bus_time = 0
        walk_time = line_station[line_station.find('徒歩') + 2 : line_station.find('分')]
    
    return bus_time, walk_time

'10'