# 【練習問題】民泊サービスの宿泊価格予測

https://signate.jp/competitions/266

In [None]:
# データダウンロード
!signate download -c 266 -p ./data

In [1]:
# 定数定義

DATA_DIRECTORY = "./data/"
"""データディレクトリ"""

'データディレクトリ'

In [2]:
# インポート定義

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
from dateutil import tz
import gc
import os
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

In [3]:
def load_train_data() -> pd.DataFrame:
  """学習データ読み込み

  Returns:
      pd.DataFrame: 学習データを返す
  """
  file_path = os.path.join(DATA_DIRECTORY, "train.csv")

  return load_data(file_path)

In [5]:
def load_test_data() -> pd.DataFrame:
  """テストデータ読み込み

  Returns:
      pd.DataFrame: テストデータを返す
  """
  file_path = os.path.join(DATA_DIRECTORY, "test.csv")

  return load_data(file_path)

In [6]:
def load_data(file_path: str) -> pd.DataFrame:
  """データ読み込み

  Args:
      file_path (str): データファイルパス

  Returns:
      pd.DataFrame: 読み込んだデータを返す
  """

  df = pd.read_csv(file_path, header=0)
  df = reduce_mem_usage(df)

  return df

In [7]:
def reduce_mem_usage(df: pd.DataFrame, use_float16: bool = False) -> pd.DataFrame:
    """ロードしたデータを型変換し、容量を削減する

    Args:
        df (pd.DataFrame): ロードデータ
        use_float16 (bool, optional): float16使用フラグ デフォルト値：False

    Returns:
        pd.DataFrame: 容量削減後のデータを返す
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
test_df = load_train_data()

Memory usage of dataframe is 12.30 MB
Memory usage after optimization is: 12.68 MB
Decreased by -3.1%
