In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2  # 変換前のメモリ使用量（MB単位）
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            # 整数型の場合
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                # 必要に応じて他の整数型（np.int16, np.int32, np.int64）への変換処理を追加可能
            # 浮動小数点型の場合
            elif str(col_type)[:5] == 'float':
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                else:
                    df[col] = df[col].astype(np.float32)
        else:
            # カテゴリカル変数（object型）はcategory型に変換
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2  # 変換後のメモリ使用量
    print('Mem. usage decreased from {:5.2f} MB to {:5.2f} MB ({:.1f}% reduction)'.format(
          start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
# CSVファイルからデータを読み込む
df = pd.read_csv('train.csv')

# 読み込んだデータのメモリ使用量を最適化
df = reduce_mem_usage(df)

# データの基本情報を確認（データ型や欠損値の有無など）
print(df.info())

# 例：'date'というカラムが存在する場合、日付型に変換する
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])

# 例：数値型カラムの欠損値を各列の平均値で補完
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# 例：カテゴリカル変数の欠損値を最頻値で補完（必要に応じて）
cat_cols = df.select_dtypes(include=['category']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# 前処理後のデータ概要を再度確認
print(df.head())

Mem. usage decreased from  0.90 MB to  0.35 MB (61.0% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Id             1460 non-null   int64   
 1   MSSubClass     1460 non-null   int64   
 2   MSZoning       1460 non-null   category
 3   LotFrontage    1201 non-null   float16 
 4   LotArea        1460 non-null   int64   
 5   Street         1460 non-null   category
 6   Alley          91 non-null     category
 7   LotShape       1460 non-null   category
 8   LandContour    1460 non-null   category
 9   Utilities      1460 non-null   category
 10  LotConfig      1460 non-null   category
 11  LandSlope      1460 non-null   category
 12  Neighborhood   1460 non-null   category
 13  Condition1     1460 non-null   category
 14  Condition2     1460 non-null   category
 15  BldgType       1460 non-null   category
 16  HouseStyle   

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
