# 租金预测

这个项目用于研究并预测某地的房屋租金。

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
%matplotlib inline

In [None]:
# Configure matplotlib for the Chinese labels used in the plots below:
# select the SimHei font and disable the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

## 数据分析

In [None]:
df_train = pd.read_csv('./Datasets/train.csv')
df_train.head()

根据训练集中房屋的位置和租金信息，可以大致了解不同地区租金的范围。

In [None]:
# Scatter every training listing by its coordinates, coloured by rental value.
plt.figure(figsize=(12, 8))
plt.scatter(
    x=df_train['Latitude'],
    y=df_train['Longitude'],
    c=df_train['Rental'],
    cmap='viridis',
    s=30,
    alpha=0.6,
)
plt.title('租金分布图')
plt.xlabel('纬度')
plt.ylabel('经度')
plt.colorbar(label='租金')
plt.show()

可以发现不同地区的租金范围大致相同，并没有出现某个地区的租金远低于或远高于其它地区的情况。

### 商场分布

In [None]:
shopping_malls = pd.read_csv('./Datasets/shopping-malls.csv')
shopping_malls.head()

在图上标记出所有商场的位置。

In [None]:
# Overlay the mall locations (red crosses) on the training listings (blue dots).
plt.figure(figsize=(12, 8))
plt.scatter(
    x=df_train['Latitude'],
    y=df_train['Longitude'],
    c='blue',
    s=30,
    alpha=0.6,
    label='房屋',
)
plt.scatter(
    x=shopping_malls['Latitude'],
    y=shopping_malls['Longitude'],
    c='red',
    marker='x',
    s=100,
    label='商场',
)
plt.title('商场分布图')
plt.xlabel('纬度')
plt.ylabel('经度')
plt.legend()
plt.show()

大部分的地区周边都有2个以上的商场，大量商场主要集中在西边的地区。

### 小学分布

In [None]:
primary_schools = pd.read_csv('./Datasets/primary-schools.csv')
primary_schools.head()

在图上标记出所有小学的位置。

In [None]:
# Overlay the primary-school locations (green triangles) on the training
# listings (blue dots).
plt.figure(figsize=(12, 8))
plt.scatter(
    x=df_train['Latitude'],
    y=df_train['Longitude'],
    c='blue',
    s=30,
    alpha=0.6,
    label='房屋',
)
plt.scatter(
    x=primary_schools['Latitude'],
    y=primary_schools['Longitude'],
    c='green',
    marker='^',
    s=100,
    label='小学',
)
plt.title('小学分布图')
plt.xlabel('纬度')
plt.ylabel('经度')
plt.legend()
plt.show()

小学的分布相比商场更加均匀，各个地区都有至少5个小学。

## 数据清洗

In [None]:
df_train.head()

由于训练集中所有的`Furnished`值都相同，它们对分析无法提供任何帮助，因此可以删除。

In [None]:
# Remove the 'Furnished' column from the training frame in place.
del df_train['Furnished']
df_train.head()

`Type`这列中的数据格式不统一，例如`3 room`和`3-room`其实是同样的房型。

In [None]:
df_train['Type'].unique()

In [None]:
# Rewrite the space-separated spellings to the hyphenated form so each room
# type has a single label; other values are left untouched.
df_train['Type'] = df_train['Type'].replace(
    to_replace=['1 room', '2 room', '3 room', '4 room', '5 room'],
    value=['1-room', '2-room', '3-room', '4-room', '5-room'],
)
df_train['Type'].unique()

一般而言，卧室数量越多，房屋的租金也会越贵。因此`1-room`、`2-room`、`3-room`、`4-room`、`5-room`可以方便地转换为数值。

In [None]:
# Encode each hyphenated room-type label as its room count (1-5).
# Labels that are not in the lookup become NaN.
df_train['Type'] = df_train['Type'].map(
    dict(zip(['1-room', '2-room', '3-room', '4-room', '5-room'], range(1, 6)))
)
df_train[['Type']].head()

In [None]:
df_train.head()

将租金生效日期`Date`拆分为`Year`和`Month`会更方便模型进行分析。

In [None]:
# Derive numeric Year/Month columns from 'Date' on a fresh copy of the frame,
# then discard the raw date column.
df_train = df_train.assign(
    Year=pd.DatetimeIndex(df_train['Date']).year,
    Month=pd.DatetimeIndex(df_train['Date']).month,
).drop(columns=['Date'])

df_train[['Year', 'Month']].head()

### 数据清洗函数

为了方便后续操作，将以上数据预处理的操作整合为一个函数。

In [None]:
def preprocessing(df):
    """Return a cleaned copy of a raw rental frame; the input is not modified.

    Steps, mirroring the cell-by-cell cleaning above:
      * remove the 'Furnished' column;
      * encode 'Type' as a room count (1-5), accepting both the 'N room' and
        'N-room' spellings; any other label becomes NaN;
      * replace 'Date' with numeric 'Year' and 'Month' columns.

    Raises KeyError if 'Furnished', 'Type' or 'Date' is missing.
    """
    # Both spellings of every room type map straight to the room count.
    room_counts = {
        '1 room': 1, '1-room': 1,
        '2 room': 2, '2-room': 2,
        '3 room': 3, '3-room': 3,
        '4 room': 4, '4-room': 4,
        '5 room': 5, '5-room': 5,
    }

    cleaned = df.copy()
    del cleaned['Furnished']

    cleaned['Type'] = cleaned['Type'].map(room_counts)

    # Parse the dates once while removing the raw column.
    dates = pd.DatetimeIndex(cleaned.pop('Date'))
    cleaned['Year'] = dates.year
    cleaned['Month'] = dates.month

    return cleaned

## 合并辅助信息

将商场和小学的位置信息添加到训练集中有助于提高模型预测的准确性。

KNN算法可以用于计算房屋周边商场和小学的数量和距离。

这里将考察房屋方圆2km内的商场和小学。在训练集中将会添加：

1. `num_of_shopping_malls`：表示周边商场数量
2. `distance_to_nearest_shopping_mall`：到最近商场的距离
3. `num_of_primary_schools`：表示周边小学数量
4. `distance_to_nearest_primary_school`：到最近小学的距离

根据经纬度信息和实际距离，可以大致计算出比例尺。

In [None]:
def km_scale(km):
    """Convert a distance in kilometres to the latitude/longitude units used here.

    The factor 0.00552 coordinate units per kilometre is an empirical estimate
    for this dataset.

    NOTE(review): if the coordinates are plain degrees, one kilometre is about
    0.009 degrees of latitude, so this factor would correspond to a smaller
    real-world radius than requested. Confirm how 0.00552 was derived.
    """
    return km * 0.00552


# Search radius, in kilometres, for counting nearby amenities.
km_radius = 2

### 商场

In [None]:
# Count the malls that fall inside the search radius of every training listing.
knn = NearestNeighbors(radius=km_scale(km_radius)).fit(shopping_malls[['Latitude', 'Longitude']])
indices_train = knn.radius_neighbors(X=df_train[['Latitude', 'Longitude']], return_distance=False)
df_train['num_of_shopping_malls'] = list(map(len, indices_train))

In [None]:
# Distance from every training listing to its single closest mall,
# expressed in the same units as the coordinates.
knn = NearestNeighbors(n_neighbors=1).fit(shopping_malls[['Latitude', 'Longitude']])
distances_train, _ = knn.kneighbors(X=df_train[['Latitude', 'Longitude']])
df_train['distance_to_nearest_shopping_mall'] = distances_train.flatten()

In [None]:
df_train[['num_of_shopping_malls', 'distance_to_nearest_shopping_mall']].head()

### 小学

In [None]:
# Count the primary schools inside the search radius of every training listing.
knn = NearestNeighbors(radius=km_scale(km_radius)).fit(primary_schools[['Latitude', 'Longitude']])
indices_train = knn.radius_neighbors(X=df_train[['Latitude', 'Longitude']], return_distance=False)
df_train['num_of_primary_schools'] = list(map(len, indices_train))

In [None]:
# Distance from every training listing to its single closest primary school,
# expressed in the same units as the coordinates.
knn = NearestNeighbors(n_neighbors=1).fit(primary_schools[['Latitude', 'Longitude']])
distances_train, _ = knn.kneighbors(X=df_train[['Latitude', 'Longitude']])
df_train['distance_to_nearest_primary_school'] = distances_train.flatten()

In [None]:
df_train[['num_of_primary_schools', 'distance_to_nearest_primary_school']].head()

至此，我们得到了一个经过预处理的数据集。

In [None]:
df_train.head()

### 合并辅助信息函数

为了方便后续操作，将以上合并商场和小学信息的操作整合为函数。

In [None]:
def integrate_shopping_mall(df):
    """Add shopping-mall proximity features to ``df``.

    Mall coordinates are read from ``./Datasets/shopping-malls.csv``. Two
    columns are written:

      * ``num_of_shopping_malls`` - number of malls within
        ``km_scale(km_radius)`` of the listing;
      * ``distance_to_nearest_shopping_mall`` - distance to the closest mall,
        in the same units as the Latitude/Longitude columns (not kilometres).

    ``df`` must provide 'Latitude' and 'Longitude' columns. It is modified in
    place and also returned, so the result can be assigned or chained.
    """
    mall_coords = pd.read_csv('./Datasets/shopping-malls.csv')[['Latitude', 'Longitude']]
    listing_coords = df[['Latitude', 'Longitude']]

    # One fitted index answers both queries: radius_neighbors() uses `radius`,
    # kneighbors() uses `n_neighbors`.
    knn = NearestNeighbors(n_neighbors=1, radius=km_scale(km_radius))
    knn.fit(mall_coords)

    malls_in_radius = knn.radius_neighbors(listing_coords, return_distance=False)
    df['num_of_shopping_malls'] = [len(found) for found in malls_in_radius]

    nearest_distances, _ = knn.kneighbors(listing_coords)
    df['distance_to_nearest_shopping_mall'] = nearest_distances.flatten()

    return df

In [None]:
def integrate_primary_school(df):
    """Add primary-school proximity features to ``df``.

    School coordinates are read from ``./Datasets/primary-schools.csv``. Two
    columns are written:

      * ``num_of_primary_schools`` - number of schools within
        ``km_scale(km_radius)`` of the listing;
      * ``distance_to_nearest_primary_school`` - distance to the closest
        school, in the same units as the Latitude/Longitude columns (not
        kilometres).

    ``df`` must provide 'Latitude' and 'Longitude' columns. It is modified in
    place and also returned, so the result can be assigned or chained.
    """
    school_coords = pd.read_csv('./Datasets/primary-schools.csv')[['Latitude', 'Longitude']]
    listing_coords = df[['Latitude', 'Longitude']]

    # One fitted index answers both queries: radius_neighbors() uses `radius`,
    # kneighbors() uses `n_neighbors`.
    knn = NearestNeighbors(n_neighbors=1, radius=km_scale(km_radius))
    knn.fit(school_coords)

    schools_in_radius = knn.radius_neighbors(listing_coords, return_distance=False)
    df['num_of_primary_schools'] = [len(found) for found in schools_in_radius]

    nearest_distances, _ = knn.kneighbors(listing_coords)
    df['distance_to_nearest_primary_school'] = nearest_distances.flatten()

    return df

## 预测模型

获取**输入特征**和**目标变量**。

In [None]:
# Separate the target column ('Rental') from the feature matrix.
y_train = df_train['Rental']
x_train = df_train.drop(columns=['Rental'])

读取测试集，并进行预处理。

In [None]:
# Load the test set and run it through the cleaning and amenity-feature
# steps, in the same order as before.
df_test = (
    pd.read_csv('./Datasets/test.csv')
    .pipe(preprocessing)
    .pipe(integrate_shopping_mall)
    .pipe(integrate_primary_school)
)

df_test.head()

### 线性回归Linear Regression

In [None]:
test = df_test.copy()

In [None]:
# Fit a linear regression on the training features.
model = LinearRegression()
model.fit(x_train, y_train)

# Predict from exactly the columns the model was fitted on, in the same order,
# so extra or reordered columns in the test frame cannot break the prediction.
test['Predicted'] = model.predict(test[x_train.columns])
test.head()

导出预测结果：

In [None]:
# Build the two-column submission (row number as Id, then the prediction)
# and write it without the index.
df_result = test[['Predicted']].copy()
df_result.insert(0, 'Id', range(len(test)))

df_result.to_csv('./submission_linear_regression.csv', index=False)

### 决策树Decision Tree

In [None]:
test = df_test.copy()

In [None]:
# Fit a decision tree; the fixed random_state keeps runs reproducible.
model = DecisionTreeRegressor(random_state=0)
model.fit(x_train, y_train)

# Predict from exactly the columns the model was fitted on, in the same order,
# so extra or reordered columns in the test frame cannot break the prediction.
test['Predicted'] = model.predict(test[x_train.columns])
test.head()

导出预测结果：

In [None]:
# Build the two-column submission (row number as Id, then the prediction)
# and write it without the index.
df_result = test[['Predicted']].copy()
df_result.insert(0, 'Id', range(len(test)))

df_result.to_csv('./submission_decision_tree.csv', index=False)

### 随机森林Random Forest

In [None]:
test = df_test.copy()

In [None]:
# Fit a 100-tree random forest; the fixed random_state keeps runs reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(x_train, y_train)

# Predict from exactly the columns the model was fitted on, in the same order,
# so extra or reordered columns in the test frame cannot break the prediction.
test['Predicted'] = model.predict(test[x_train.columns])
test.head()

导出预测结果：

In [None]:
# Build the two-column submission (row number as Id, then the prediction)
# and write it without the index.
df_result = test[['Predicted']].copy()
df_result.insert(0, 'Id', range(len(test)))

df_result.to_csv('./submission_random_forest.csv', index=False)

### 梯度提升Gradient Boosting

In [None]:
test = df_test.copy()

In [None]:
# Fit a gradient-boosting regressor (100 depth-3 trees, learning rate 0.1);
# the fixed random_state keeps runs reproducible.
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)
model.fit(x_train, y_train)

# Predict from exactly the columns the model was fitted on, in the same order,
# so extra or reordered columns in the test frame cannot break the prediction.
test['Predicted'] = model.predict(test[x_train.columns])
test.head()

导出预测结果：

In [None]:
# Build the two-column submission (row number as Id, then the prediction)
# and write it without the index.
df_result = test[['Predicted']].copy()
df_result.insert(0, 'Id', range(len(test)))

df_result.to_csv('./submission_gradient_boosting.csv', index=False)