In [1]:
# https://www.kaggle.com/airbnb/seattle
# Dataset downloaded from the site above.

In [2]:
import datetime
import graphviz
import itertools
import os
import platform
import socket
import sys
import time
import warnings
import category_encoders as ce
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pdp
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning -- consider narrowing the filter.
warnings.filterwarnings("ignore")
# Raise the pandas display limits to 100 columns / 300 rows.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 300)
# Put the project-local ../utils/ directory first on the import path.
sys.path.insert(0, '../utils/')
# Star-import of the local helpers; presumably where `start` and `timer`
# used in the cells below come from -- TODO confirm and import them by name.
from utils import *

In [3]:
# Print the run header (date, file tag, PID, host, platform) for this notebook.
# `start` is not defined here; presumably it comes from `from utils import *`.
start("airbnb_seattle_cal")

DATE: 2019-04-04 17:37:31.955838
FILE: airbnb_seattle_cal
PID: 16072
HOST: Y-project-11
ENV: Windows-10-10.0.17134-SP0


In [4]:
# Load the Airbnb Seattle calendar CSV into `df`.
file_name = "../data/input/airbnb_seattle/calendar.csv"

# Column dtypes applied at read time: the three key columns are loaded as
# categoricals, while price is kept as raw text (object) at this stage.
dtypes = dict(
    listing_id="category",
    date="category",
    available="category",
    price="object",
)

# `timer` logs the start and the elapsed time of the read.
with timer("read"):
    df = pd.read_csv(
        file_name,
        sep=',',
        header=0,
        index_col=None,
        dtype=dtypes,
    )

[read] start
[read] done in 0 s


In [5]:
# Preprocessing
# Convert the date column (read as category) to datetime64.
df["date"] = pd.to_datetime(df["date"])

# Clean the price strings: drop the "$" sign and the "," thousands separators
# on the rows that actually have a price, then cast the column to float32.
# The mask keeps its original name `musk` (sic) in case other cells use it.
musk = df["price"].notnull()
# Assign through a single .loc call. The chained form
# `df["price"][musk] = ...` writes via an intermediate Series, which triggers
# SettingWithCopyWarning and does not update `df` under Copy-on-Write.
# regex=False makes "$" a literal character regardless of the pandas
# version's default (as a regex it is an end-of-string anchor).
df.loc[musk, "price"] = (
    df.loc[musk, "price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)
df["price"] = df["price"].astype("float32")

In [6]:
# Row and column counts of the calendar frame.
df.shape

(1393570, 4)

In [7]:
# Show the first three rows of the calendar frame.
df.head(3)

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,85.0
1,241032,2016-01-05,t,85.0
2,241032,2016-01-06,f,


In [8]:
# Check the column dtypes after the date and price conversions.
df.dtypes

listing_id          category
date          datetime64[ns]
available           category
price                float32
dtype: object

In [9]:
# Render a pandas-profiling report (pdp = pandas_profiling) for the whole frame.
pdp.ProfileReport(df)

0,1
Number of variables,4
Number of observations,1393570
Total Missing (%),8.2%
Total size in memory,20.1 MiB
Average record size in memory,15.1 B

0,1
Numeric,1
Categorical,2
Date,1
Text (Unique),0
Rejected,0

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
t,934542
f,459028

Value,Count,Frequency (%),Unnamed: 3
t,934542,67.1%,
f,459028,32.9%,

0,1
Distinct count,365
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2016-01-04 00:00:00
Maximum,2017-01-02 00:00:00

0,1
Distinct count,3818
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0

0,1
9925458,365
6766708,365
6958436,365
Other values (3815),1392475

Value,Count,Frequency (%),Unnamed: 3
9925458,365,0.0%,
6766708,365,0.0%,
6958436,365,0.0%,
6913706,365,0.0%,
6883913,365,0.0%,
6868378,365,0.0%,
685600,365,0.0%,
6855839,365,0.0%,
6854552,365,0.0%,
6852288,365,0.0%,

0,1
Distinct count,670
Unique (%),0.1%
Missing (%),32.9%
Missing (n),459028
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,137.94
Minimum,10.0
Maximum,1650.0
Zeros (%),0.0%

0,1
Minimum,10.0
5-th percentile,45.0
Q1,75.0
Median,109.0
Q3,160.0
95-th percentile,333.0
Maximum,1650.0
Range,1640.0
Interquartile range,85.0

0,1
Standard deviation,105.04
Coef of variation,0.76147
Kurtosis,17.992874
Mean,137.94
MAD,69.058
Skewness,3.208408
Sum,128915260.0
Variance,11033
Memory size,5.3 MiB

Value,Count,Frequency (%),Unnamed: 3
150.0,36646,2.6%,
100.0,31755,2.3%,
75.0,29820,2.1%,
125.0,27538,2.0%,
65.0,26415,1.9%,
90.0,24942,1.8%,
95.0,24327,1.7%,
99.0,23629,1.7%,
85.0,23455,1.7%,
80.0,19817,1.4%,

Value,Count,Frequency (%),Unnamed: 3
10.0,1,0.0%,
12.0,1,0.0%,
20.0,365,0.0%,
22.0,21,0.0%,
23.0,4,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1199.0,71,0.0%,
1240.0,2,0.0%,
1250.0,15,0.0%,
1450.0,28,0.0%,
1650.0,91,0.0%,

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,85.0
1,241032,2016-01-05,t,85.0
2,241032,2016-01-06,f,
3,241032,2016-01-07,f,
4,241032,2016-01-08,f,


- airbnb_seattle_calendar データ
- 期間：2016年1月4日～2017年1月2日（365日間）
- 対象施設：3818件
- 施設ごと・日付ごとの予約可否（available 列：t / f）
- 予約可能（available = t）の場合のみ価格（price 列）が入る