# 特征工程on数据集Rent Listing Inqueries 

import必要的工具包，用于文件读取／特征编码

In [1]:
import numpy as np
import pandas as pd

#用于计算feature字段的文本特征提取
from sklearn.feature_extraction.text import  CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

#CountVectorizer为稀疏特征，特征编码结果存为稀疏矩阵xgboost处理更高效
from scipy import sparse

#对类别型特征进行编码
from sklearn.preprocessing import LabelEncoder
from MeanEncoder import MeanEncoder

#对地理位置通过聚类进行离散化
from sklearn.cluster import KMeans
from nltk.metrics import distance as distance

数据文件路径和文件名

In [2]:
#input data
dpath = './data/'
train = pd.read_json(dpath +"RentListingInquries_train.json")
test = pd.read_json(dpath +"RentListingInquries_test.json")

#train.head()

In [3]:
#train.info()

## 标签interest_level

### 将类别型的标签interest_level编码为数字
从前面的分析和常识来看，listing_id对确定interest_level没有用，去掉

In [4]:
y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])

y_train = train['interest_level']
train.drop(['listing_id', 'interest_level'], axis=1,inplace = True)

test.drop(['listing_id'], axis=1,inplace = True)

## price, bathrooms, bedrooms
数值型特征，+／-／*／ ／
特征的单调变换对XGBoost不必要

In [5]:
def remove_noise(df):
#remove some noise
    df= df[df.price < 10000]

    df.loc[df["bathrooms"] == 112, "bathrooms"] = 1.5
    df.loc[df["bathrooms"] == 10, "bathrooms"] = 1
    df.loc[df["bathrooms"] == 20, "bathrooms"] = 2

In [6]:
#构造新特征
#price_bathrooms：单位bathroom的价格
#price_bedrooms：单位bedroom的价格
def create_price_room(df):
    df['price_bathrooms'] =  (df["price"])/ (df["bathrooms"] +1.0)
    df['price_bedrooms'] =  (df["price"])/ (df["bedrooms"] +1.0)

In [7]:
#构造新特征
#room_diff：bathroom房间数 - bedroom房间数
#room_num：bathroom房间数 + bedroom房间数
def create_room_diff_sum(df):
    df["room_diff"] = df["bathrooms"] - df["bedrooms"]
    df["room_num"] = df["bedrooms"] + df["bathrooms"]

## 创建日期created

In [8]:
def procdess_created_date(df):
    df['Date'] = pd.to_datetime(df['created'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['Wday'] = df['Date'].dt.dayofweek
    df['Yday'] = df['Date'].dt.dayofyear
    df['hour'] = df['Date'].dt.hour

    df.drop(['Date', 'created'], axis=1,inplace = True)

## description

In [9]:
#简单丢弃，也可以参照fature特征处理方式
def procdess_description(df):
    df.drop(['description'], axis=1,inplace = True)

## manager_id
将manager分为几个等级
top 1%， 2%， 5， 10， 15， 20， 25， 30， 50，

In [10]:
def procdess_manager_id(df):
    managers_count = df['manager_id'].value_counts()

    df['top_10_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 90)] else 0)
    df['top_25_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 75)] else 0)
    df['top_5_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 95)] else 0)
    df['top_50_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 50)] else 0)
    df['top_1_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 99)] else 0)
    df['top_2_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 98)] else 0)
    df['top_15_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 85)] else 0)
    df['top_20_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 80)] else 0)
    df['top_30_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, 70)] else 0)
    
    df.drop(['manager_id'], axis=1,inplace = True)

## building_id
类似manager_id处理
直接删除

In [11]:
def procdess_building_id(df):
    df.drop(['building_id'], axis=1,inplace = True)

## photos

In [12]:
def procdess_photos(df):
    #df['photos_count'] = df['photos'].apply(lambda x: len(x))
    df.drop(['photos'], axis=1, inplace=True)

## latitude, longtitude
聚类降维编码(#用训练数据训练，对训练数据和测试数据都做变换)
到中心的距离（论坛上讨论到曼哈顿中心的距离更好）

In [13]:
def procdess_location_train(df):   
    train_location = df.loc[:,[ 'latitude', 'longitude']]
    
     # Clustering
    kmeans_cluster = KMeans(n_clusters=20)
    res = kmeans_cluster.fit(train_location)
    res = kmeans_cluster.predict(train_location)

    df['cenroid'] = res

    # L1 distance
    center = [ train_location['latitude'].mean(), train_location['longitude'].mean()]
    df['distance'] = abs(df['latitude'] - center[0]) + abs(df['longitude'] - center[1])
    
    #原始特征也可以考虑保留，此处简单丢弃
    df.drop(['latitude', 'longitude'], axis=1, inplace=True)
    
    return kmeans_cluster,center

In [14]:
def procdess_location_test(df, kmeans_cluster, center):   
    test_location = df.loc[:,[ 'latitude', 'longitude']]
    
     # Clustering
    res = kmeans_cluster.predict(test_location)

    df['cenroid'] = res

    # L1 distance
    df['distance'] = abs(df['latitude'] - center[0]) + abs(df['longitude'] - center[1])
    df.drop(['latitude', 'longitude'], axis=1, inplace=True)

## display_address
定义高基数类别型特征编码函数
对这些特征进行均值编码（该特征值在每个类别的概率，即原来的一维特征变成了C-1维特征，C为标签类别数目）

In [15]:
def procdess_display_address_train_test(df_train, y_train, df_test):
    n_train_samples = len(df_train.index)    
    df_train_test = pd.concat((df_train, df_test), axis=0)

    lb = LabelEncoder()
    lb.fit(list(df_train_test['display_address'].values))
    df_train_test ['display_address'] = lb.transform(list(df_train_test['display_address'].values))
    
    #import pdb
    #pdb.set_trace()
    me = MeanEncoder(['display_address'], target_type='classification')
    df_train_test = me.fit_transform(df_train_test, y_train)

    df_train_test.drop(['display_address'], axis=1,inplace = True)
    
    df_train = df_train_test.iloc[:n_train_samples, :]
    df_test = df_train_test.iloc[n_train_samples:, :]
    
    return df_train, df_test

In [16]:
def procdess_display_address_test(df, lb, me):
    #要警惕test中出现train中没有的特征取值，最好train和test一起处理
    df['display_address'] = lb.transform(list(df['display_address'].values))
    df = me.transform(df)

    df.drop(['display_address'], axis=1,inplace = True)
    return df

## street_address

In [17]:
# 和display_address信息冗余，去掉
def procdess_street_address(df):
    df = df.drop(['street_address'], axis=1,inplace = True)

## features
描述特征文字长度
特征中单词的词频，相当于以数据集features中出现的词语为字典的one-hot编码（虽然是词频，但在这个任务中每个单词通常只出现一次）

In [18]:
def procdess_features_train_test(df_train, df_test):
    n_train_samples = len(df_train.index)
    
    df_train_test = pd.concat((df_train, df_test), axis=0)
    df_train_test['features2'] = df_train_test['features']
    df_train_test['features2'] = df_train_test['features2'].apply(lambda x: ' '.join(x))

    c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1), decode_error='ignore')
    c_vect_sparse = c_vect.fit_transform(df_train_test['features2'])
    c_vect_sparse_cols = c_vect.get_feature_names()

    df_train.drop(['features'], axis=1, inplace=True)
    df_test.drop(['features'], axis=1, inplace=True)
    
    #hstack作为特征处理的最后一部，先将其他所有特征都转换成数值型特征才能处理,稀疏表示
    df_train_sparse = sparse.hstack([df_train, c_vect_sparse[:n_train_samples,:]]).tocsr()
    df_test_sparse = sparse.hstack([df_test, c_vect_sparse[n_train_samples:,:]]).tocsr()
    
    #常规datafrmae
    tmp = pd.DataFrame(c_vect_sparse.toarray()[:n_train_samples,:],columns = c_vect_sparse_cols, index=df_train.index)
    df_train = pd.concat([df_train, tmp], axis=1)
    
    tmp = pd.DataFrame(c_vect_sparse.toarray()[n_train_samples:,:],columns = c_vect_sparse_cols, index=df_test.index)
    df_test = pd.concat([df_test, tmp], axis=1)
    
    #df_test = pd.concat([df_test, tmp[n_train_samples:,:]], axis=1)
  
    return df_train_sparse,df_test_sparse,df_train, df_test

In [19]:
def procdess_features_test(df, c_vect):
    df['features2'] = df['features']
    df['features2'] = df['features2'].apply(lambda x: ' '.join(x))

    c_vect_sparse = c_vect.transform(df['features2'])
    c_vect_sparse_cols = c_vect.get_feature_names()

    df.drop(['features', 'features2'], axis=1, inplace=True)
    
    #hstack作为特征处理的最后一部，先将其他所有特征都转换成数值型特征才能处理
    df_sparse = sparse.hstack([df, c_vect_sparse]).tocsr()
    
    tmp = pd.DataFrame(c_vect_sparse.toarray(),columns = c_vect_sparse_cols, index=df.index)
    df = pd.concat([df, tmp], axis=1)
    
    return df_sparse, df

## 对训练样本做特征工程

In [20]:
remove_noise(train)

create_price_room(train)
create_room_diff_sum(train)

procdess_created_date(train)

procdess_description(train)

procdess_manager_id(train)

procdess_building_id(train)
procdess_photos(train)

kmeans_cluster,center = procdess_location_train(train)
procdess_street_address(train)

#测试集中可能出现新的特征值，所以训练和测试集一起做
#lb, me, train = procdess_display_address_train(train, y_train)
#X_train_sparse,X_test_sparse,train,test = procdess_features_train_test(train,test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [21]:
remove_noise(test)

create_price_room(test)
create_room_diff_sum(test)

procdess_created_date(test)

procdess_description(test)

procdess_manager_id(test)

procdess_building_id(test)
procdess_photos(test)

procdess_location_test(test, kmeans_cluster, center)

procdess_street_address(test)

#测试数据出现了训练数据中没有出现的词语，报错，可以训练数据和测试数据一起训练CountVectorizer
#test = procdess_display_address_test(test, lb, me )
#X_test_sparse,test = procdess_features_test(test, c_vect)

## 对测试样本做特征工程

In [21]:
remove_noise(test)

create_price_room(test)
create_room_diff_sum(test)

procdess_created_date(test)

procdess_description(test)

procdess_manager_id(test)

procdess_building_id(test)
procdess_photos(test)

procdess_location_test(test, kmeans_cluster, center)

procdess_street_address(test)

#测试数据出现了训练数据中没有出现的词语，报错，可以训练数据和测试数据一起训练CountVectorizer
#test = procdess_display_address_test(test, lb, me )
#X_test_sparse,test = procdess_features_test(test, c_vect)

In [22]:
train,test = procdess_display_address_train_test(train, y_train,test)

is deprecated and will be removed in a future version
  col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})


In [23]:
X_train_sparse,X_test_sparse,train,test = procdess_features_train_test(train,test)

## 特征处理结果存为文件

In [24]:
#存为csv格式方便用excel查看(属性名字有重复，features得到的词语中也有bathrooms和bedrooms)
train = pd.concat([train, y_train], axis=1)
train.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)

In [25]:
test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)

In [26]:
#from  scipy.io import mmwrite
#train_sparse = sparse.hstack([X_train_sparse, sparse.csr_matrix(y_train).T]).tocsr()

#mmwrite(dpath + 'RentListingInquries_FE_train.txt',train_sparse)
#mmwrite(dpath + 'RentListingInquries_FE_test.txt',X_test_sparse)

#存为libsvm稀疏格式，直接调用XGBoost的话用稀疏格式更高效
#from sklearn.datasets import dump_svmlight_file
#dump_svmlight_file(X_train_sparse, y_train, dpath + 'RentListingInquries_FE_train.txt', zero_based=False) 

import xgboost as xgb

dtrain = xgb.DMatrix(X_train_sparse, label = y_train)
dtest = xgb.DMatrix(X_test_sparse)

dtrain.save_binary(dpath + 'RentListingInquries_FE_train.bin')
dtest.save_binary(dpath + 'RentListingInquries_FE_test.bin')