# RentListingInquries

## 基本信息
bathrooms: 浴室的数量number of bathrooms
bedrooms: 卧室数量number of bedrooms
building_id: ID号
created: 
description: 描述
display_address: 
features: 关于这间公寓的功能列表a list of features about this apartment
latitude: 纬度
listing_id
longitude: 经度
manager_id
photos: 照片链接列表a list of photo links. You are welcome to download the pictures yourselves from renthop's site, but they are the same as imgs.zip. 
price: 以美元计算in USD
street_address: 街道地址
interest_level: 这是目标变量。它有3个类别：'高'，'中等'，'低'this is the target variable. It has 3 categories: 'high', 'medium', 'low'

## 导入工具包

In [1]:
# 数据导入与处理
import numpy as np
import pandas as pd
# 数据可视化
import matplotlib.pyplot as plt
import seaborn as sns


特征工程处理过程见"./FE_RentListingInqueries.ipynb"

## 导入数据

In [2]:
# 课程网站提供的特征工程编码后的数据（RentListingInquries_FE_train.csv）
# 或稀疏编码的形式（RentListingInquries_FE_train.bin）
dpath = '../data/RentListingInquries/'
train = pd.read_csv(dpath+'RentListingInquries_FE_train.csv')
train.head()

Unnamed: 0,bathrooms,bedrooms,price,price_bathrooms,price_bedrooms,room_diff,room_num,Year,Month,Day,...,walk,walls,war,washer,water,wheelchair,wifi,windows,work,interest_level
0,1.5,3,3000,1200.0,750.0,-1.5,4.5,2016,6,24,...,0,0,0,0,0,0,0,0,0,1
1,1.0,2,5465,2732.5,1821.666667,-1.0,3.0,2016,6,12,...,0,0,0,0,0,0,0,0,0,2
2,1.0,1,2850,1425.0,1425.0,0.0,2.0,2016,4,17,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1,3275,1637.5,1637.5,0.0,2.0,2016,4,18,...,0,0,0,0,0,0,0,0,0,2
4,1.0,4,3350,1675.0,670.0,-3.0,5.0,2016,4,28,...,0,0,1,0,0,0,0,0,0,2


In [3]:
# 测试数据
test = pd.read_csv(dpath+'RentListingInquries_FE_test.csv')
test.head()

Unnamed: 0,bathrooms,bedrooms,price,price_bathrooms,price_bedrooms,room_diff,room_num,Year,Month,Day,...,virtual,walk,walls,war,washer,water,wheelchair,wifi,windows,work
0,1.0,1,2950,1475.0,1475.0,0.0,2.0,2016,6,11,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2850,1425.0,950.0,-1.0,3.0,2016,6,24,...,0,0,0,1,0,0,0,0,0,0
2,1.0,1,3758,1879.0,1879.0,0.0,2.0,2016,6,3,...,0,0,0,0,0,0,0,0,0,0
3,1.0,2,3300,1650.0,1100.0,-1.0,3.0,2016,6,11,...,0,0,0,0,0,0,1,0,0,0
4,2.0,2,4900,1633.333333,1633.333333,0.0,4.0,2016,4,12,...,0,0,0,1,0,0,0,0,0,0


In [4]:
train.isnull().sum().sort_values()

bathrooms         0
month             0
multi             0
natural           0
new               0
newly             0
ok                0
outdoor           0
oversized         0
park              0
parking           0
patio             0
modern            0
pet               0
photos            0
playroom          0
pool              0
post              0
pre               0
prewar            0
private           0
publicoutdoor     0
queen             0
ramp              0
reduced           0
pets              0
midrise           0
microwave         0
massive           0
high              0
                 ..
work              0
doorman           0
dishwasher        0
business          0
cable             0
cats              0
ceiling           0
ceilings          0
center            0
central           0
chef              0
children          0
childrens         0
clean             0
cleaning          0
close             0
closet            0
closets           0
club              0


In [5]:
test.isnull().sum().sort_values()

bathrooms                     0
modern                        0
month                         0
multi                         0
natural                       0
new                           0
newly                         0
ok                            0
outdoor                       0
oversized                     0
park                          0
parking                       0
midrise                       0
patio                         0
pets                          0
photos                        0
playroom                      0
pool                          0
post                          0
pre                           0
prewar                        0
private                       0
publicoutdoor                 0
queen                         0
ramp                          0
pet                           0
microwave                     0
massive                       0
marble                        0
hi                            0
                          ...  
dishwash

test中 'display_address_pred_0', 'display_address_pred_1', 'display_address_pred_2' 全部都是nan, 舍弃此特征

In [7]:
train.drop(['display_address_pred_0', 'display_address_pred_1', 'display_address_pred_2'], axis=1, inplace=True)
test.drop(['display_address_pred_0', 'display_address_pred_1', 'display_address_pred_2'], axis=1, inplace=True)

In [4]:
"building_id" in test.columns

False

In [9]:
print('train.shape='+str(train.shape))
print('test.shape='+str(test.shape))

train.shape=(49352, 225)
test.shape=(74659, 224)


In [10]:
train.iloc[0,:].T

bathrooms             1.50000
bedrooms              3.00000
price              3000.00000
price_bathrooms    1200.00000
price_bedrooms      750.00000
room_diff            -1.50000
room_num              4.50000
Year               2016.00000
Month                 6.00000
Day                  24.00000
Wday                  4.00000
Yday                176.00000
hour                  7.00000
top_10_manager        1.00000
top_25_manager        1.00000
top_5_manager         1.00000
top_50_manager        1.00000
top_1_manager         0.00000
top_2_manager         0.00000
top_15_manager        1.00000
top_20_manager        1.00000
top_30_manager        1.00000
cenroid               5.00000
distance              0.04026
1br                   0.00000
24                    0.00000
2br                   0.00000
3br                   0.00000
ac                    0.00000
access                0.00000
                      ...    
sublet                0.00000
subway                0.00000
super     

## 分割数据

In [11]:
# 数据量太大,分割数据
train = train.sample(frac=0.2)
train.shape

(9870, 225)

In [12]:
# 区分测试集与训练集
X = train.drop('interest_level', axis=1)
y = train['interest_level']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y
    , test_size=0.2
    , random_state=33
    , stratify=y)

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7896 entries, 33177 to 26881
Columns: 224 entries, bathrooms to work
dtypes: float64(6), int64(218)
memory usage: 13.6 MB


# 模型选择

## DecisionTreeClassifier

In [14]:
# 分类评分标准
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import accuracy_score

In [15]:
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier()

In [16]:
model_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [23]:
y_pd = model_tree.predict(X_test)
model_tree.score(X_test, y_pd)

1.0

print(roc_auc_score(y_test,y_pdpbed))

缺省的决策树正确率为1,nice

In [18]:
#查看特征重要性
df = pd.DataFrame({"columns":list(X_train.columns), "importance":list(model_tree.feature_importances_.T)})
df.sort_values(by=['importance'],ascending=False)

Unnamed: 0,columns,importance
4,price_bedrooms,0.123865
23,distance,0.101251
3,price_bathrooms,0.079985
11,Yday,0.078794
2,price,0.075549
9,Day,0.066488
12,hour,0.062185
10,Wday,0.041940
22,cenroid,0.029743
93,fee,0.023302


有好多的特征都起到作用了,估计树的深度较深

调节参数试试

In [19]:
from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()

max_depth = range(1,10,1)
min_samples_leaf = range(1,10,2)

tuned_parameters = dict(max_depth=max_depth, min_samples_leaf=min_samples_leaf)

In [20]:
from sklearn.model_selection import GridSearchCV
DD = GridSearchCV(model_DD, tuned_parameters,cv=10)
DD.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(1, 10), 'min_samples_leaf': range(1, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
print("Best: %f using %s" % (DD.best_score_, DD.best_params_))

Best: 0.695922 using {'max_depth': 4, 'min_samples_leaf': 9}


In [22]:
y_pd = DD.predict(X_test)
model_tree.score(X_test, y_pd)

0.69300911854103342

调参之后正确率反而下降

## RandomForest

In [24]:
from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()

In [25]:
model_RR.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
y_prod = model_RR.predict(X_test)
model_RR.score(X_test, y_prod)

1.0

随机森林的得分也是1,nice

试试调参, min_samples_leaf and n_estimators

In [27]:
from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()

tuned_parameters = {'min_samples_leaf': range(1,10,2), 'n_estimators' : range(1,10,2) }
#tuned_parameters = {'min_samples_leaf': range(9,19,2), 'n_estimators' : range(6,10,1) }

In [28]:
from sklearn.model_selection import GridSearchCV
RR = GridSearchCV(model_RR, tuned_parameters,cv=10)

In [29]:
RR.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': range(1, 10, 2), 'n_estimators': range(1, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
RR.grid_scores_



[mean: 0.61854, std: 0.01181, params: {'min_samples_leaf': 1, 'n_estimators': 1},
 mean: 0.63766, std: 0.01365, params: {'min_samples_leaf': 1, 'n_estimators': 3},
 mean: 0.66869, std: 0.01728, params: {'min_samples_leaf': 1, 'n_estimators': 5},
 mean: 0.68693, std: 0.01063, params: {'min_samples_leaf': 1, 'n_estimators': 7},
 mean: 0.69035, std: 0.01196, params: {'min_samples_leaf': 1, 'n_estimators': 9},
 mean: 0.63311, std: 0.01818, params: {'min_samples_leaf': 3, 'n_estimators': 1},
 mean: 0.69086, std: 0.01064, params: {'min_samples_leaf': 3, 'n_estimators': 3},
 mean: 0.69415, std: 0.00895, params: {'min_samples_leaf': 3, 'n_estimators': 5},
 mean: 0.69972, std: 0.01115, params: {'min_samples_leaf': 3, 'n_estimators': 7},
 mean: 0.70352, std: 0.01126, params: {'min_samples_leaf': 3, 'n_estimators': 9},
 mean: 0.65084, std: 0.01423, params: {'min_samples_leaf': 5, 'n_estimators': 1},
 mean: 0.69111, std: 0.00737, params: {'min_samples_leaf': 5, 'n_estimators': 3},
 mean: 0.70238, 

In [31]:
print(RR.best_score_)
print(RR.best_params_)

0.70352077001
{'min_samples_leaf': 3, 'n_estimators': 9}


In [32]:
y_pred = RR.predict(X_test)
RR.score(X_test, y_pred)

1.0

调参后得分也是1, min_samples_leaf=3, n_estimators=9

# 预测保存

In [33]:
# 选用调参后的随机森林模型生成数据 
y_ptrdict = RR.predict(test)

In [44]:
submission = test.loc[:,['Year','Month', 'Day', 'Wday', 'Yday', 'hour']]
#submission

In [42]:
submission['interest_level']= y_ptrdict
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74659 entries, 0 to 74658
Data columns (total 7 columns):
Year              74659 non-null int64
Month             74659 non-null int64
Day               74659 non-null int64
Wday              74659 non-null int64
Yday              74659 non-null int64
hour              74659 non-null int64
interest_level    74659 non-null int64
dtypes: int64(7)
memory usage: 4.0 MB


In [43]:
submission.to_csv('submission.csv')