In [1]:
from pathlib import Path

import requests
from tqdm import tqdm

import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import altair as alt

In [2]:
# Columns holding dates; they are parsed into datetimes while the CSV is read.
date_columns = ['host_since',
                'last_scraped', 'calendar_last_scraped',
                'first_review', 'last_review']

# low_memory=False: parse the file in one pass rather than in chunks.
listings_detail_df = pd.read_csv(
    '../data/listings.csv',
    low_memory=False,
    parse_dates=date_columns,
)

listings_detail_df.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,11551,https://www.airbnb.com/rooms/11551,20191105115249,2019-11-06,Arty and Bright London Apartment in Zone 2,Unlike most rental apartments out there my fla...,"Amenities Bedding: 1 Double bed, 1 living room...",Unlike most rental apartments out there my fla...,family,Not even 10 minutes by metro from Victoria Sta...,...,t,f,strict_14_with_grace_period,f,t,2,2,0,0,1.58
1,38151,https://www.airbnb.com/rooms/38151,20191105115249,2019-11-06,Double room/ lounge,,"Comfortable, large double room /lounge area av...","Comfortable, large double room /lounge area av...",none,,...,f,f,flexible,f,f,1,0,1,0,
2,13913,https://www.airbnb.com/rooms/13913,20191105115249,2019-11-06,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,"Hello Everyone, I'm offering my lovely double ...",My bright double bedroom with a large window h...,business,Finsbury Park is a friendly melting pot commun...,...,f,f,moderate,f,f,2,1,1,0,0.17


In [3]:
# Print every column name of the raw listings table as a plain Python list.
print(list(listings_detail_df.columns))

['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', '

In [4]:
# Columns kept for the recommender: identifiers, listing attributes, price,
# neighbourhood and the six review sub-scores.
use_columns = [
    'id', 'number_of_reviews', 'bedrooms', 'room_type', 'bed_type', 'price',
    'neighbourhood_cleansed',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
]

# Narrow the frame to those columns only, then preview the first rows.
listings_detail_df = listings_detail_df.loc[:, use_columns]
listings_detail_df.head(5)

Unnamed: 0,id,number_of_reviews,bedrooms,room_type,bed_type,price,neighbourhood_cleansed,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,11551,185,1.0,Entire home/apt,Real Bed,$88.00,Lambeth,9.0,9.0,10.0,10.0,9.0,9.0
1,38151,0,1.0,Private room,Real Bed,$65.00,Croydon,,,,,,
2,13913,19,1.0,Private room,Real Bed,$65.00,Islington,10.0,10.0,10.0,10.0,9.0,9.0
3,38407,137,1.0,Entire home/apt,Real Bed,$79.00,Tower Hamlets,9.0,10.0,10.0,10.0,9.0,9.0
4,90700,339,1.0,Entire home/apt,Real Bed,$105.00,Kensington and Chelsea,9.0,9.0,10.0,10.0,10.0,9.0


根据“从平台角度分析数据”部分的分析，我们删除 number_of_reviews 为 0（即没有任何评论）的数据

In [5]:
# Keep only rows whose number_of_reviews is non-zero and renumber them from 0.
has_reviews = listings_detail_df['number_of_reviews'] != 0
listings_detail_df = listings_detail_df.loc[has_reviews].reset_index(drop=True)

处理price特征，将其变为数值类型

In [6]:
# Convert the textual price (e.g. "$1,250.00") into a float column.
#
# Only the currency sign and thousands separators are removed, so a value that
# happens to lack the leading "$" keeps its first digit (blindly slicing off
# the first character would corrupt it).
# The dtype guard makes the cell safe to re-run: once `price` is numeric the
# `.str` accessor would raise, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(listings_detail_df['price']):
    price_text = listings_detail_df['price'].str.replace(r'[$,]', '', regex=True)
    # assign() returns a new frame instead of writing into a filtered slice,
    # and keeps `price` at its existing column position.
    listings_detail_df = listings_detail_df.assign(price=pd.to_numeric(price_text))

listings_detail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65062 entries, 0 to 65061
Data columns (total 13 columns):
id                             65062 non-null int64
number_of_reviews              65062 non-null int64
bedrooms                       65025 non-null float64
room_type                      65062 non-null object
bed_type                       65062 non-null object
price                          65062 non-null float64
neighbourhood_cleansed         65062 non-null object
review_scores_accuracy         63391 non-null float64
review_scores_cleanliness      63405 non-null float64
review_scores_checkin          63343 non-null float64
review_scores_communication    63399 non-null float64
review_scores_location         63345 non-null float64
review_scores_value            63344 non-null float64
dtypes: float64(8), int64(2), object(3)
memory usage: 6.5+ MB


有缺失的特征用均值填充

In [7]:
# Fill every column that has missing values with that column's mean.
#
# `df[col].fillna(..., inplace=True)` operates on a temporary Series: it warns
# on pandas 2.x and silently does nothing under Copy-on-Write, so the
# fill is done on the whole frame and the result is re-assigned instead.
columns_with_gaps = listings_detail_df.columns[listings_detail_df.isnull().any()]

# One mean per affected numeric column; fillna matches the Series index
# against column names, so each column gets its own mean.
fill_values = listings_detail_df[columns_with_gaps].mean(numeric_only=True)
listings_detail_df = listings_detail_df.fillna(fill_values)

listings_detail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65062 entries, 0 to 65061
Data columns (total 13 columns):
id                             65062 non-null int64
number_of_reviews              65062 non-null int64
bedrooms                       65062 non-null float64
room_type                      65062 non-null object
bed_type                       65062 non-null object
price                          65062 non-null float64
neighbourhood_cleansed         65062 non-null object
review_scores_accuracy         65062 non-null float64
review_scores_cleanliness      65062 non-null float64
review_scores_checkin          65062 non-null float64
review_scores_communication    65062 non-null float64
review_scores_location         65062 non-null float64
review_scores_value            65062 non-null float64
dtypes: float64(8), int64(2), object(3)
memory usage: 6.5+ MB


创建总评分特征

In [8]:
# Overall score: arithmetic mean of the six review sub-scores.
score_parts = ['review_scores_accuracy', 'review_scores_cleanliness',
               'review_scores_checkin', 'review_scores_communication',
               'review_scores_value', 'review_scores_location']

# Add the columns left to right, then divide by how many there are.
score_total = sum(listings_detail_df[part] for part in score_parts)
listings_detail_df['score'] = score_total / len(score_parts)
listings_detail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65062 entries, 0 to 65061
Data columns (total 14 columns):
id                             65062 non-null int64
number_of_reviews              65062 non-null int64
bedrooms                       65062 non-null float64
room_type                      65062 non-null object
bed_type                       65062 non-null object
price                          65062 non-null float64
neighbourhood_cleansed         65062 non-null object
review_scores_accuracy         65062 non-null float64
review_scores_cleanliness      65062 non-null float64
review_scores_checkin          65062 non-null float64
review_scores_communication    65062 non-null float64
review_scores_location         65062 non-null float64
review_scores_value            65062 non-null float64
score                          65062 non-null float64
dtypes: float64(9), int64(2), object(3)
memory usage: 6.9+ MB


注释：

- review_scores_accuracy 
   - 房源页面对房源的描述是否准确？
- review_scores_cleanliness
   - 客人是否觉得房间干净整洁？
- review_scores_checkin
   - 办理手续的顺利程度如何？
- review_scores_communication
   - 入住前和入住期间的沟通情况如何？
- review_scores_location
   - 客人对小区的感觉如何？
- review_scores_value
   - 客人是否觉得该房源提供的价格物有所值？
- score
   - 总评分

查看类别特征的类别数

In [9]:
# Neighbourhood categories: number of listings per value (NaN counted too)
listings_detail_df.neighbourhood_cleansed.value_counts(dropna=False)

Westminster               7380
Tower Hamlets             6451
Hackney                   4845
Camden                    4808
Kensington and Chelsea    4400
Islington                 3989
Southwark                 3921
Lambeth                   3899
Hammersmith and Fulham    3284
Wandsworth                3242
Brent                     1961
Lewisham                  1767
Haringey                  1645
Newham                    1464
Ealing                    1323
Greenwich                 1235
Barnet                    1135
Waltham Forest            1074
Merton                     976
Richmond upon Thames       906
Croydon                    802
Hounslow                   745
Redbridge                  499
Bromley                    495
Hillingdon                 472
Enfield                    465
Kingston upon Thames       414
Harrow                     342
City of London             324
Barking and Dagenham       249
Sutton                     218
Havering                   173
Bexley  

In [10]:
# Room types: number of listings per value (NaN counted too)
listings_detail_df.room_type.value_counts(dropna=False)

Entire home/apt    36891
Private room       26897
Hotel room           869
Shared room          405
Name: room_type, dtype: int64

In [11]:
# Bed types: number of listings per value (NaN counted too)
listings_detail_df.bed_type.value_counts(dropna=False)

Real Bed         64514
Pull-out Sofa      338
Futon              139
Couch               49
Airbed              22
Name: bed_type, dtype: int64

**为用户推荐房源**

1、选择小区位置、房间类型、床的类型

In [14]:
# Ask, in this order, for the desired neighbourhood, room type and bed type.
neighbourhood, room, bed = [
    input(prompt)
    for prompt in ('请选择您期望的小区位置：',
                   '请选择您期望的房间类型：',
                   '请选择您期望的床的类别：')
]

请选择您期望的小区位置：Hackney
请选择您期望的房间类型：Private room
请选择您期望的床的类别：Real Bed


2、选择房间数、租房偏向

In [15]:
# Desired number of bedrooms; the typed text must parse as an integer.
room_num_text = input('请选择您期望的房间数：')
room_num = int(room_num_text)

# Ranking preference: rating-oriented / value-for-money / price-oriented.
like = input('请选择您期望的喜好偏向（评分型/性价比型/价格型）：')

请选择您期望的房间数：2
请选择您期望的喜好偏向（评分型/性价比型/价格型）：性价比型


3、选择以下需求排名

In [16]:
# Cleanliness, check-in ease, communication, neighbourhood experience:
# which matters most to you? Enter them one by one, most important first.

# Collect the four answers in the order they are typed (prompts numbered 1-4).
sort = [input('请选择您的第%d个偏好：' % rank) for rank in range(1, 5)]

请选择您的第1个偏好：房子干净程度
请选择您的第2个偏好：小区生活体验程度
请选择您的第3个偏好：沟通顺利程度
请选择您的第4个偏好：手续简便程度


In [17]:
# Display the preference ranking exactly as the user entered it.
sort

['房子干净程度', '小区生活体验程度', '沟通顺利程度', '手续简便程度']

In [25]:
# Translate each preference label typed by the user into the review-score
# column used for ranking.
preference_to_column = {
    '房子干净程度': 'review_scores_cleanliness',
    '小区生活体验程度': 'review_scores_location',
    '沟通顺利程度': 'review_scores_communication',
    '手续简便程度': 'review_scores_checkin',
}
already_mapped = set(preference_to_column.values())

# Ignore stray whitespace around what the user typed.
cleaned_labels = [label.strip() for label in sort]

# Reject anything that is neither a known label nor an already-translated
# column name; silently treating typos as "check-in" would distort the ranking.
unknown_labels = [label for label in cleaned_labels
                  if label not in preference_to_column and label not in already_mapped]
if unknown_labels:
    raise ValueError(
        'Unrecognised preference(s): %r; expected one of %r'
        % (unknown_labels, list(preference_to_column))
    )

# Column names pass through unchanged, so re-running this cell is harmless
# (previously a second run collapsed every entry to 'review_scores_checkin').
# Slice assignment keeps the same list object, as the original loop did.
sort[:] = [preference_to_column.get(label, label) for label in cleaned_labels]


print(sort)

['review_scores_cleanliness', 'review_scores_location', 'review_scores_communication', 'review_scores_checkin']


In [18]:
# Filter by the first set of answers: neighbourhood, room type and bed type.
matches_first_choices = (
    (listings_detail_df['neighbourhood_cleansed'] == neighbourhood)
    & (listings_detail_df['room_type'] == room)
    & (listings_detail_df['bed_type'] == bed)
)

# Keep only the rows that satisfy all three conditions at once.
listings_detail_df = listings_detail_df[matches_first_choices]

listings_detail_df.head(2)

Unnamed: 0,id,number_of_reviews,bedrooms,room_type,bed_type,price,neighbourhood_cleansed,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,score
57,100326,109,1.0,Private room,Real Bed,33.0,Hackney,10.0,10.0,10.0,10.0,9.0,10.0,9.833333
59,46992,202,1.0,Private room,Real Bed,54.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [20]:
# Filter by the second set of answers: keep listings with the requested
# number of bedrooms.
listings_detail_df = listings_detail_df.loc[listings_detail_df['bedrooms'] == room_num]

# Primary ranking column for the chosen preference; any answer other than the
# two listed here falls back to 'price'.
like_type = {
    '评分型': 'score',
    '性价比型': 'review_scores_value',
}.get(like, 'price')

listings_detail_df.head(2)

Unnamed: 0,id,number_of_reviews,bedrooms,room_type,bed_type,price,neighbourhood_cleansed,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,score
1422,873154,27,2.0,Private room,Real Bed,35.0,Hackney,9.0,9.0,8.0,8.0,9.0,8.0,8.5
3950,3455629,50,2.0,Private room,Real Bed,39.0,Hackney,10.0,9.0,10.0,10.0,10.0,10.0,9.833333


In [26]:
# Rank the candidates: primary key is the chosen preference column, followed
# by the user's four priorities as tie-breakers.
# Flags: False = descending, True = ascending.
ranking_keys = [like_type] + sort[:4]

# Review scores are "higher is better", so they sort descending. Price is
# "lower is better": a price-oriented ranking must list the cheapest first
# (sorting it descending recommended the most expensive listings).
ranking_ascending = [like_type == 'price'] + [False] * (len(ranking_keys) - 1)

# Re-assign instead of inplace=True: the frame is a filtered slice, and an
# in-place sort on it triggers SettingWithCopyWarning.
listings_detail_df = listings_detail_df.sort_values(ranking_keys, ascending=ranking_ascending)

In [28]:
# Show the ten best-ranked listings.
listings_detail_df.iloc[:10]

Unnamed: 0,id,number_of_reviews,bedrooms,room_type,bed_type,price,neighbourhood_cleansed,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,score
6249,5765612,51,2.0,Private room,Real Bed,90.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
7482,6879175,25,2.0,Private room,Real Bed,38.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
13098,11882208,5,2.0,Private room,Real Bed,29.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
31313,22095949,6,2.0,Private room,Real Bed,30.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
31743,22286887,50,2.0,Private room,Real Bed,150.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
35576,24401591,1,2.0,Private room,Real Bed,55.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
36746,25049766,6,2.0,Private room,Real Bed,30.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
38154,25914062,2,2.0,Private room,Real Bed,75.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
38365,26035691,49,2.0,Private room,Real Bed,30.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0
46021,30169167,1,2.0,Private room,Real Bed,70.0,Hackney,10.0,10.0,10.0,10.0,10.0,10.0,10.0


**可改进之处：**

1、缺失值可以改用分组均值填充（例如按小区位置或房间类型分组后取各组均值），应该会比全局均值更准确

2、性价比特征似乎用“价格/评分”来计算、然后再做标准化会更好一些