In [None]:
import pandas as pd
import chardet

# # 检测文件编码
# with open('./DataSet/Tokyo_20131_20234.csv', 'rb') as f:
#     result = chardet.detect(f.read())
#     encoding = result['encoding']
#     print(f"Detected encoding: {encoding}")

# 使用检测到的编码读取文件
df = pd.read_csv('../DataSet/Tokyo house price.csv', encoding="cp932")

In [None]:
df.head()

In [None]:
# 列名列表
columns = ['種類', '価格情報区分', '地域', '市区町村コード', '都道府県名', '市区町村名', '地区名', '最寄駅：名称','最寄駅：距離（分）', '取引価格（総額）', '坪単価', '面積（㎡）', '取引価格（㎡単価）', '土地の形状', '間口','延床面積（㎡）', '建築年', '建物の構造', '用途', '今後の利用目的', '前面道路：方位', '前面道路：種類','前面道路：幅員（ｍ）', '都市計画', '建ぺい率（％）', '容積率（％）', '取引時期', '取引の事情等']

# 打印每一列的唯一值和唯一值的数量
for column in columns:
    unique_values = df[column].unique()
    unique_count = df[column].nunique()
    print(f"{column}: 有 {unique_count} 个唯一值。")
    print(f"这些唯一值是: {unique_values}")
    print()

In [None]:
print(df.info)

In [None]:
df.head()

In [None]:
# 筛选出'取引時期'列中包含2018年至2023年的数据
df['取引時期'] = df['取引時期'].astype(str)
filtered_df = df[df['取引時期'].str.contains(r'202[0-3]|201[8-9]', regex=True)]
filtered_df = filtered_df .query('種類 == "宅地(土地と建物)"')
filtered_df = filtered_df .query('種類 == "宅地(土地と建物)"')
filtered_df = filtered_df .query('今後の利用目的 == "住宅"')

In [None]:
selected_columns = ['最寄駅：距離（分）', '取引価格（総額）', '面積（㎡）','建築年', '建物の構造','地区名',"建ぺい率（％）","容積率（％）"]
selected_df = df[selected_columns]

In [None]:
selected_df.head()

In [None]:
selected_df = selected_df.dropna()
#astype(int)

In [None]:
# 提取建筑年份的年份部分并转换为整数
selected_df['建築年'] = selected_df['建築年'].str.extract('(\d+)')
selected_df = selected_df.dropna()
selected_df['建築年'] = selected_df['建築年'].astype(int)
# selected_df['建築年'] = selected_df['建築年'].astype(int)
# selected_df['建筑年限'] = 2024 - selected_df['建築年']

In [None]:
selected_df.info()


In [None]:
selected_df.to_csv('exported_data3.csv', index=False, encoding='utf-8')

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# 使用OneHotEncoder进行独热编码
one_hot_encoder = OneHotEncoder(sparse=False)
encoded_features = one_hot_encoder.fit_transform(selected_df[['地区名']])

# 获取独热编码后的特征名
feature_names = one_hot_encoder.get_feature_names_out(['地区名'])

# 将编码后的特征转换为DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=feature_names)

# 合并原始数据和独热编码后的数据
df_onehot = pd.concat([selected_df, encoded_df], axis=1)

In [None]:
#使用encoder
from sklearn.preprocessing import LabelEncoder
# 确保列为字符串类型
for column in [ '建物の構造','地区名']:
    selected_df[column] = selected_df[column].astype(str)

# 创建LabelEncoder对象
label_encoder = LabelEncoder()

# 对每个object列进行标签编码
for column in [ '建物の構造','地区名']:
    selected_df[column] = label_encoder.fit_transform(selected_df[column])

print(selected_df)

In [None]:
selected_df['最寄駅：距離（分）'].unique()

In [None]:
# 定义映射关系
distance_mapping = {
    '30分～60分': 30,
    '1H～1H30': 60,
    '1H30～2H': 90,
    '2H～': 120,
    '2,000㎡以上': 120
}

# 将文本值映射成数字
selected_df['最寄駅：距離（分）'] = selected_df['最寄駅：距離（分）'].map(lambda x: distance_mapping[x] if x in distance_mapping else int(x))

In [None]:
# 定义映射关系
distance_mapping = {
    '2,000㎡以上': 2000
}

# 将文本值映射成数字
selected_df['面積（㎡）'] = selected_df['面積（㎡）'].map(lambda x: distance_mapping[x] if x in distance_mapping else int(x))

In [None]:
selected_df['面積（㎡）'].unique()

In [None]:
# 定义一个函数来去除离群值
# 四分位距（IQR）方法是一种常用的检测离群值的方法。它利用数据的第一四分位数（Q1）和第三四分位数（Q3）来定义一个范围，任何超出该范围的值都被视为离群值。
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df_filtered

# 去除 '取引価格（総額）' 的离群值
data_cleaned = remove_outliers(selected_df, '取引価格（総額）')

In [None]:
import numpy as np
# 定义一个函数来去除离群值
# Z-Score 方法是通过计算每个数据点与均值的标准差来检测离群值的。通常，Z-Score 大于 3 或小于 -3 的数据点被视为离群值。
def remove_outliers_zscore(df, column):
    mean = df[column].mean()
    std = df[column].std()
    z_scores = (df[column] - mean) / std
    df_filtered = df[np.abs(z_scores) < 3]
    return df_filtered

# 去除 '取引価格（総額）' 的离群值
data_cleaned = remove_outliers_zscore(data_cleaned, '取引価格（総額）')

In [None]:
# 定义一个函数来去除离群值
# Z-Score 方法是通过计算每个数据点与均值的标准差来检测离群值的。通常，Z-Score 大于 3 或小于 -3 的数据点被视为离群值。
def remove_outliers_zscore(df, column):
    mean = df[column].mean()
    std = df[column].std()
    z_scores = (df[column] - mean) / std
    df_filtered = df[np.abs(z_scores) < 3]
    return df_filtered

# 去除 '取引価格（総額）' 的离群值
data_cleaned = remove_outliers_zscore(data_cleaned, '面積（㎡）')

In [None]:
# 定义一个函数来去除离群值
# 四分位距（IQR）方法是一种常用的检测离群值的方法。它利用数据的第一四分位数（Q1）和第三四分位数（Q3）来定义一个范围，任何超出该范围的值都被视为离群值。
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df_filtered

# 去除 '取引価格（総額）' 的离群值
data_cleaned = remove_outliers(selected_df, '面積（㎡）')

In [None]:
# 定义一个函数来去除离群值
# 四分位距（IQR）方法是一种常用的检测离群值的方法。它利用数据的第一四分位数（Q1）和第三四分位数（Q3）来定义一个范围，任何超出该范围的值都被视为离群值。
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df_filtered

# 去除 '取引価格（総額）' 的离群值
data_cleaned = remove_outliers(selected_df, '建築年')

In [None]:
data_cleaned = data_cleaned[data_cleaned['取引価格（総額）'] <= 100000000]

In [None]:
data_cleaned = data_cleaned[data_cleaned['建築年'] > 1968]

In [None]:
data_cleaned = data_cleaned[data_cleaned['面積（㎡）'] < 200]

In [None]:
data_cleaned = data_cleaned[data_cleaned['最寄駅：距離（分）'] < 30]

In [None]:
data_cleaned.to_csv('exported_data5.csv', index=False, encoding='utf-8')

In [6]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.optimizers import Adam
from datetime import datetime
import os
from tensorflow.keras import backend as K

data_cleaned = pd.read_csv("./exported_data5.csv")
X = data_cleaned.drop('取引価格（総額）', axis=1)
y = data_cleaned['取引価格（総額）']

# 数据标准化处理
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# 将数据划分为训练集和验证集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# 创建 Sequential 模型
model = Sequential()


# 添加输入层和隐藏层
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))  # 添加 dropout 防止过拟合
# 添加输入层和隐藏层
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))  # 添加 dropout 防止过拟合

# 添加更多隐藏层
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# 添加输出层，输出层不需要激活函数（默认是线性输出）
model.add(Dense(1, activation='linear'))  # 输出层只有一个神经元，用于预测

optimizer = Adam(learning_rate=0.0001)

# 自定义 R² 评估函数
def r2_score(y_true, y_pred):
    ss_res = K.sum(K.square(y_true - y_pred)) 
    ss_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return 1 - ss_res / (ss_tot + K.epsilon())

# 编译模型，使用 MSE 损失函数
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[r2_score])

# 生成当前时间的字符串
current_time = datetime.now().strftime("%Y%m%d-%H%M")

# 创建日志目录名称
log_dir = os.path.join('logs', current_time)

# TensorBoard 回调函数
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 训练模型batch_size=32,
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), callbacks=[tensorboard_callback, early_stopping_callback])

# 模型评估
mse = model.evaluate(X_test, y_test, verbose=0)
print(f'Validation MSE: {mse:.4f}')

# 预测验证集
y_pred = model.predict(X_test)

# 反标准化预测结果
y_pred_original = scaler_y.inverse_transform(y_pred.flatten().reshape(-1, 1)).flatten()
y_val_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# 计算评价指标
mse = mean_squared_error(y_val_original, y_pred_original)
mae = mean_absolute_error(y_val_original, y_pred_original)
rmse = np.sqrt(mse)
r2 = r2_score(y_val_original, y_pred_original)
mape = np.mean(np.abs((y_val_original - y_pred_original) / y_val_original)) * 100

print(f'MSE: {mse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'R2: {r2:.4f}')
print(f'MAPE: {mape:.2f}%')

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3428/3428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 9ms/step - loss: 1.0568 - r2_score: -39.1756 - val_loss: 0.7647 - val_r2_score: -34.6842
Epoch 2/50
[1m3428/3428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - loss: 0.8072 - r2_score: -39.4750 - val_loss: 0.7177 - val_r2_score: -35.6996
Epoch 3/50
[1m1646/3428[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m15s[0m 9ms/step - loss: 0.7611 - r2_score: -40.3189

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from datetime import datetime
from tensorflow.keras.optimizers import Adam
import os
data_cleaned = pd.read_csv("./exported_data5.csv")

# 将价格转换为类别标签
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data_cleaned['取引価格（総額）'])

# 数据标准化处理
X = data_cleaned.drop('取引価格（総額）', axis=1)
y = y_encoded

scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# 将数据划分为训练集和验证集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# # 创建 Sequential 模型
# model = Sequential()
# 构建模型
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y)), activation='softmax')# 添加输出层，使用 softmax 激活函数
])

# # 添加输入层和隐藏层
# model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
# model.add(Dropout(0.5))  # 添加 dropout 防止过拟合

# # 添加更多隐藏层
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.5))


optimizer = Adam(learning_rate=0.001)

# 编译模型，使用交叉熵损失函数
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# 生成当前时间的字符串
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")

# 创建日志目录名称
log_dir = os.path.join('logs', current_time)

# TensorBoard 回调函数
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 训练模型
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[tensorboard_callback, early_stopping_callback])

# 模型评估
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}')

# 预测验证集
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')