In [5]:
import pandas as pd
from datetime import datetime

# Build a small housing dataset: one list of values per column, five listings.
data = {
    "square_feet": [2000, 1600, 2200, 1800, 2400],
    "bedrooms": [3, 2, 4, 3, 4],
    "bathrooms": [2, 1, 3, 2, 3],
    "garage_area": [400, 250, 500, 350, 450],
    "basement_area": [600, 500, 800, 550, 700],
    "year_built": [1995, 1980, 2005, 1990, 2010],
    "house_type": [
        "Single Family",
        "Apartment",
        "Single Family",
        "Townhouse",
        "Single Family",
    ],
    "neighborhood_quality": [8, 6, 9, 7, 8],
    "sale_price": [250000, 150000, 350000, 200000, 400000],
}

# Build the DataFrame, append the derived features, then one-hot encode the
# categorical column -- all in a single chained expression.
#
# The keyword arguments of ``assign`` are evaluated in order, so
# ``area_per_bedroom`` can read the ``total_area`` column created just above it.
df = (
    pd.DataFrame(data)
    .assign(
        # Total area: main living area + basement + garage.
        total_area=lambda frame: (
            frame["square_feet"] + frame["basement_area"] + frame["garage_area"]
        ),
        # Ratio of bedroom count to bathroom count.
        bedroom_bathroom_ratio=lambda frame: frame["bedrooms"] / frame["bathrooms"],
        # Age of the house relative to the calendar year at run time.
        age=lambda frame: datetime.now().year - frame["year_built"],
        # Total area divided by the number of bedrooms.
        area_per_bedroom=lambda frame: frame["total_area"] / frame["bedrooms"],
        # Sale price per square foot of main living area.
        price_per_square_feet=lambda frame: frame["sale_price"] / frame["square_feet"],
    )
    # One-hot encode ``house_type`` into ``house_type_<value>`` indicator columns.
    .pipe(pd.get_dummies, columns=["house_type"], prefix="house_type")
)

# Display the processed dataset (notebook cell output).
df


Unnamed: 0,square_feet,bedrooms,bathrooms,garage_area,basement_area,year_built,neighborhood_quality,sale_price,total_area,bedroom_bathroom_ratio,age,area_per_bedroom,price_per_square_feet,house_type_Apartment,house_type_Single Family,house_type_Townhouse
0,2000,3,2,400,600,1995,8,250000,3000,1.5,28,1000.0,125.0,False,True,False
1,1600,2,1,250,500,1980,6,150000,2350,2.0,43,1175.0,93.75,True,False,False
2,2200,4,3,500,800,2005,9,350000,3500,1.333333,18,875.0,159.090909,False,True,False
3,1800,3,2,350,550,1990,7,200000,2700,1.5,33,900.0,111.111111,False,False,True
4,2400,4,3,450,700,2010,8,400000,3550,1.333333,13,887.5,166.666667,False,True,False
