In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.stats import boxcox
from scipy import sparse
from sklearn.model_selection import StratifiedKFold
from itertools import product
import xgboost as xgb
from sklearn import preprocessing

import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
import re

## Script 설명

1. 파생 변수를 생성하여 새로운 Input Data Set을 만드는 코드
2. 학습하는 코드에서는 여기서 만든 Input Data Set을 사용하여 바로바로 학습함
3. 따로 전처리하는 부분과 학습하는 부분을 분리한 이유는 전처리하는 데도 시간이 10분정도 소요되고 전처리하여 나온 Input Data Set을 여러가지 다른 알고리즘에서 학습하기 위하여 따로 분리함

# Data Read

Input Data Set을 새롭게 만들어 놓는 이유는 Image나 Building ID, Price정보는 처리하는데 시간이 오래걸리기 때문에 결과가 나오고 나서 그 결과를 따로 Input Set으로 다시 만들어 시간을 절약함

원본(Original) Input Data Set

In [90]:
# Load the raw competition files.
# NOTE(review): the other "Data Read" cells below rebind the same names
# (original_train / original_test); only the cell executed last takes effect.
original_train = pd.read_json('input/train.json')
original_test = pd.read_json('input/test.json')

Original data Set에서 Image 크기와 채도 명도가 추가되어 있는 Input Data Set

In [70]:
# Alternative input: the variant files that already contain the image-derived columns.
# NOTE(review): rebinds original_train / original_test set by the other "Data Read" cells.
original_train = pd.read_json('input/train_with_image.json')
original_test = pd.read_json('input/test_with_image.json')

Image정보에 Building ID 0인 값을 채운 Input Data Set
Building ID 0인 값은 위도 경도로 집계하였을 때 Building ID가 0을 제외하고 Unique하면 0값을 Unique한 Building ID로 채움

In [4]:
# Alternative input: the variant files in which building_id '0' has been back-filled.
# NOTE(review): rebinds original_train / original_test set by the other "Data Read" cells.
original_train = pd.read_json('input/filled_building_id_train.json')
original_test = pd.read_json('input/filled_building_id_test.json')

위도 경도를 +-0.005로 잘라서 그 지역의 Price 시세를 구하고 High, Medium, Low의 수량으로 인기지역인지 판단한 정보가 추가되어 있는 Data Set

In [88]:
# Second pair of frames: the variant files carrying the neighbourhood price columns.
original_train2 = pd.read_json('input/train_add_price.json')
original_test2 = pd.read_json('input/test_add_price.json')

In [89]:
# Stack train and test rows of each input variant into one frame.
# NOTE: merge_train_test is defined in a later cell of this notebook, so that
# cell has to be executed before this one.
merge = merge_train_test(original_train,original_test)
merge2 = merge_train_test(original_train2,original_test2)

#### check raw input

In [3]:
# Sanity check: (rows, columns) of the training frame.
original_train.shape

(49352, 19)

In [5]:
# Sanity check: (rows, columns) of the test frame.
original_test.shape

(74659, 18)

# Data Preprocessing

**기본적인 Feature를 추가하는 함수**<br>
>1. Kernel을 보면서 여러가지 Feature들을 추가
>2. Data를 관찰하면서 새로운 Feature들을 추가

<br>**Base Kernel**<br>
>* https://www.kaggle.com/visnaga/two-sigma-connect-rental-listing-inquiries/xgboost-for-the-millionth-time-0-54724-lb
>* https://www.kaggle.com/rakhlin/another-python-version-of-it-is-lit-by-branden
>* https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-2-connect


In [8]:
def add_feature(df):
    """Build the base engineered feature set on a copy of ``df``.

    Adds date parts, photo count, price/room ratios, per-bedroom median
    price, address flags and rank flags, then chains the helper functions
    defined elsewhere in this notebook (category_combine,
    manager_id_diff_median_price, image_timestamp, add_city_distance,
    add_floor_plan, add_how_many_time_same_building,
    add_how_many_time_same_display, add_market_price).

    Hidden dependencies: category_combine reads and extends the global
    ``categorical`` list, convert_display_address reads the global
    ``remove_punct_map``, and image_timestamp / add_floor_plan read CSV files.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings frame; the columns used here include created, listing_id,
        photos, price, bedrooms, bathrooms, latitude, longitude, building_id,
        manager_id, display_address and street_address.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns; ``df`` itself is not modified.
    """
    data = df.copy()
    data['date'] = pd.to_datetime(data['created'])
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data['wday'] = data['date'].dt.dayofweek
    data['yday'] = data['date'].dt.dayofyear
    data['hour'] = data['date'].dt.hour
    # NOTE(review): the hour is divided by 25.0 (not 24.0) and every month is
    # counted as 30 days starting from month 4 -- confirm this is intentional.
    data["total_days"] =   (data["month"] -4.0)*30 + data["day"] +  data["hour"] /25.0
    data["diff_rank"]= data["total_days"]/data["listing_id"]
    
    # Number of photos
    data["photo_count"] = data["photos"].apply(len)

    # Additional price-derived features.
    # Rows with 0 bedrooms/bathrooms yield inf (or NaN for 0/0); those are
    # replaced by -1 further down.
    data["pricePerBed"] = data['price'] / data['bedrooms']
    data["pricePerBath"] = data['price'] / data['bathrooms']
    data["pricePerRoom"] = data['price'] / (data['bedrooms'] + data['bathrooms'])
    data["price_latitue"] = (data["price"])/ (data["latitude"]+1.0)
    data["price_longtitude"] = (data["price"])/ (data["longitude"]-1.0)
    #data["num_price_by_furniture"] = data["price"] / (data["bathrooms"] + data["bedrooms"])
    
    # Additional room-derived features
    data["bedPerBath"] = data['bedrooms'] / data['bathrooms']
    data["bedPerDiff"] = data['bedrooms'] - data['bathrooms']
    data["num_furniture"] =  data["bathrooms"] + data["bedrooms"]
    # Cap the room total: anything >= 9.5 becomes 10 (via a string round-trip).
    data["num_furniture"] = data["num_furniture"].apply(lambda x:  str(x) if float(x)<9.5 else '10')
    data['num_furniture'] = data['num_furniture'].astype('float')
    
    # https://www.kaggle.com/ogrellier/median-rental-prices-matter/notebook/notebook
    # Ratio of each price to the median price of listings with the same bedroom count.
    median_prices = data[['price','bedrooms']]
    medians_by_key = median_prices.groupby(by='bedrooms')['price'].median().reset_index()
    medians_by_key.rename(columns={'price': 'median_price_bed'}, inplace=True)
    data = data.merge(medians_by_key, on='bedrooms', how='left')
    data['price_ratio_bed_price_median'] = data['price'] / data['median_price_bed']
    
    # Flag listings whose building_id is the placeholder string '0'.
    data['zero_building_id'] = data['building_id'].apply(lambda x: 1 if x == '0' else 0)

    #bc_price, tmp = boxcox(data.price)
    #data['bc_price'] = bc_price
    
    # Applies to every column of the frame: NaN -> -1 and +inf -> -1.
    # NOTE(review): -inf is not replaced here.
    data = data.fillna(-1).replace(np.inf, -1)
    
    data = convert_display_address(data)
    data['empty_display_address'] = data['display_address'].apply(lambda x: 1 if x=='' else 0)
    data['empty_street_address'] = data['street_address'].apply(lambda x: 1 if x=='' else 0)
    
    data = add_rank_feature(data,'building_id')
    data = add_rank_feature(data,'manager_id')
    
    #data['longi_lati'] = data.apply(lambda row: str(row['longitude'])+ str("_") + str(row['latitude']),axis=1)
    
    data = category_combine(data)
    data = manager_id_diff_median_price(data)
    data = image_timestamp(data)
    data = add_city_distance(data)
    data = add_floor_plan(data)
    data = add_how_many_time_same_building(data)
    data = add_how_many_time_same_display(data)
    #data = calculate_manager_id_distance(data)
    data = add_market_price(data)
    
    return data

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.cluster import Birch
def cluster_latlon(n_clusters, df):
    """Cluster listings by coordinates with Birch and plot the result.

    Rows inside the bounding box (-74.05 < longitude < -73.75 and
    40.4 < latitude < 40.9) are clustered on (latitude, longitude); every
    other row receives the label -1.

    Parameters
    ----------
    n_clusters : int
        Number of clusters requested from Birch; also used in the name of
        the output column, ``"cluster_<n_clusters>"``.
    df : pandas.DataFrame
        Frame with ``latitude`` and ``longitude`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        In-box rows followed by out-of-box rows (so the row order differs
        from ``df``), with the cluster label column added.

    Side effects
    ------------
    Draws a scatter plot of the clustered rows with matplotlib.
    """
    data = df.copy()
    label_col = "cluster_" + str(n_clusters)

    # Split the data between "around NYC" and "other locations".
    in_box = ((data.longitude > -74.05) & (data.longitude < -73.75)
              & (data.latitude > 40.4) & (data.latitude < 40.9))
    # Explicit copies: assigning a column to a filtered slice triggers
    # SettingWithCopyWarning and may not write through.
    data_c = data[in_box].copy()
    data_e = data[~in_box].copy()

    # DataFrame.as_matrix() was removed from pandas; .values works everywhere.
    coords = data_c[['latitude', 'longitude']].values

    brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01, compute_labels=True)
    brc.fit(coords)
    data_c[label_col] = brc.predict(coords)
    data_e[label_col] = -1  # label -1 marks the rows outside the bounding box

    data = pd.concat([data_c, data_e])
    plt.scatter(data_c["longitude"], data_c["latitude"], c=data_c[label_col], s=10, linewidth=0.1)
    plt.title(str(n_clusters)+" Neighbourhoods from clustering")
    plt.show()
    return data

https://www.kaggle.com/adamsfei/only-brand-new-features

In [4]:
import math
def cart2rho(x, y):
    """Return the polar radius sqrt(x**2 + y**2) of the point (x, y)."""
    squared_radius = x**2 + y**2
    return np.sqrt(squared_radius)


def cart2phi(x, y):
    """Return the polar angle (radians) of the point (x, y), i.e. arctan2(y, x)."""
    return np.arctan2(y, x)


def rotation_x(row, alpha):
    """First rotated coordinate of a row.

    Treats ``row['latitude']`` as x and ``row['longitude']`` as y and
    returns ``x*cos(alpha) + y*sin(alpha)`` (``alpha`` in radians).
    """
    lat, lon = row['latitude'], row['longitude']
    return lat*math.cos(alpha) + lon*math.sin(alpha)


def rotation_y(row, alpha):
    """Second rotated coordinate of a row.

    Treats ``row['latitude']`` as x and ``row['longitude']`` as y and
    returns ``y*cos(alpha) - x*sin(alpha)`` (``alpha`` in radians).
    """
    lat, lon = row['latitude'], row['longitude']
    return lon*math.cos(alpha) - lat*math.sin(alpha)


def add_rotation(degrees, df):
    """Add the coordinates rotated by ``degrees`` as two new columns.

    Writes ``num_rot<degrees>_X = lat*cos(a) + lon*sin(a)`` and
    ``num_rot<degrees>_Y = lon*cos(a) - lat*sin(a)`` where ``a`` is
    ``degrees`` converted to radians.

    Parameters
    ----------
    degrees : int or float
        Rotation angle in degrees. 0 is accepted (the previous
        ``math.pi/(180/degrees)`` form raised ZeroDivisionError for it).
    df : pandas.DataFrame
        Frame with ``latitude`` and ``longitude`` columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    alpha = math.radians(degrees)
    cos_a, sin_a = math.cos(alpha), math.sin(alpha)

    # Vectorised column arithmetic instead of a Python-level apply per row.
    latitude, longitude = df['latitude'], df['longitude']
    df['num_' + namex] = latitude*cos_a + longitude*sin_a
    df['num_' + namey] = longitude*cos_a - latitude*sin_a

    return df

def operate_on_coordinates(tr_df, te_df):
    """Add polar and rotated coordinate features to both frames in place.

    For each frame, ``num_rho`` / ``num_phi`` are the polar radius and angle
    of (latitude - 40.78222222, longitude + 73.96527777), followed by the
    rotation columns produced by add_rotation for 15, 30, 45 and 60 degrees.

    Returns the two input frames (``tr_df``, ``te_df``).
    """
    lat_origin = 40.78222222
    lon_offset = 73.96527777

    for frame in (tr_df, te_df):
        # polar coordinate system
        frame["num_rho"] = frame.apply(
            lambda r: cart2rho(r["latitude"] - lat_origin, r["longitude"]+lon_offset), axis=1)
        frame["num_phi"] = frame.apply(
            lambda r: cart2phi(r["latitude"] - lat_origin, r["longitude"]+lon_offset), axis=1)
        # rotations
        for angle in (15, 30, 45, 60):
            frame = add_rotation(angle, frame)

    return tr_df, te_df

https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/discussion/31962

In [3]:
import re

def cap_share(x):
    """Share of upper-case characters in ``x``: count / (len(x) + 1).

    The +1 in the denominator keeps the empty string from dividing by zero.
    """
    upper_count = len([ch for ch in x if ch.isupper()])
    return upper_count/float(len(x)+1)

def operate_description_featre(train_df,test_df):
    """Derive simple text statistics from the ``description`` column.

    Works on copies of both frames and adds, in this order:
    ``num_cap_share`` (cap_share of the text), ``num_nr_of_lines`` (count of
    '<br /><br />'), ``num_redacted`` (1 if 'website_redacted' occurs),
    ``num_exclamation`` (count of '!'), ``num_email`` (1 if '@' occurs) and
    ``num_phone_nr`` (1 if a phone-number-like digit pattern occurs).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a string ``description`` column. Not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies with the new columns.
    """
    # Compiled once instead of once per frame; a raw string avoids the
    # invalid-escape-sequence warning for "\(" and "\d".
    phone_re = re.compile(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)

    def try_and_find_nr(description):
        return 0 if phone_re.match(description) is None else 1

    train = train_df.copy()
    test = test_df.copy()

    for df in (train, test):
        description = df['description']

        # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
        df['num_cap_share'] = description.apply(cap_share)

        # how long in lines the desc is?
        df['num_nr_of_lines'] = description.apply(lambda x: x.count('<br /><br />'))

        # is the description redacted by the website?
        # (boolean mask -> 0/1; replaces the chained `.ix[...] = 1` assignment,
        # which relied on the removed `.ix` indexer)
        df['num_redacted'] = description.str.contains('website_redacted').astype('int64')

        df['num_exclamation'] = description.apply(lambda x: x.count('!'))

        # can we contact someone via e-mail to ask for the details?
        df['num_email'] = description.str.contains('@').astype('int64')

        # and... can we call them?
        df['num_phone_nr'] = description.apply(try_and_find_nr)

    return train,test

In [2]:
def add_how_many_time_same_building(df):
    """Count listings sharing manager, building, bedrooms and bathrooms.

    Adds ``how_many_time_same_building``: the number of rows in ``df`` with
    the same (manager_id, building_id, bedrooms, bathrooms) combination.

    The count column is named directly via ``reset_index(name=...)``. The
    earlier ``rename(columns={0: ...})`` only worked when the size Series was
    unnamed; when it carries the name ``interest_level`` the rename is a
    no-op and the merge yields ``interest_level_x`` / ``interest_level_y``.

    Returns a new frame; ``df`` is not modified.
    """
    keys = ['manager_id','building_id','bedrooms','bathrooms']
    data = df.copy()
    counts = data.groupby(by=keys).size().reset_index(name='how_many_time_same_building')
    return data.merge(counts, on=keys, how='left')

In [1]:
def add_how_many_time_same_display(df):
    """Count listings sharing manager and display address.

    Adds ``how_many_time_same_display_address``: the number of rows in
    ``df`` with the same (manager_id, display_address) pair.

    The count column is named directly via ``reset_index(name=...)`` so the
    result does not depend on how pandas names the size Series (renaming
    column ``0`` silently did nothing when it was named ``interest_level``).

    Returns a new frame; ``df`` is not modified.
    """
    keys = ['manager_id','display_address']
    data = df.copy()
    counts = data.groupby(by=keys).size().reset_index(name='how_many_time_same_display_address')
    return data.merge(counts, on=keys, how='left')

floor plan(평면도)을 detecting하여 csv로 파일로 만들어 놓음<br>
listingid로 merge하여 사용함<br>
**floor plan을 detecting하는 방법**
>1. Image파일을 읽어서 사용한 색의 개수를 Count함
>2. 사용한 색의 개수의 표준편차를 구하여 표준편차가 큰 것은 floor plan
>3. 대각선의 색을 가져와 사용한 색의 개수가 적으면 floor plan

In [9]:
def add_floor_plan(df):
    """Left-join the contents of 'input/floor_pan.csv' onto ``df`` by listing_id.

    Rows without a match in the CSV get NaN in the joined columns.
    Returns a new frame; ``df`` is not modified.
    """
    floor_plan_table = pd.read_csv('input/floor_pan.csv')
    return df.copy().merge(floor_plan_table, on='listing_id', how='left')

New York 시 중심으로부터의 거리를 Feature로 추가<br>
https://www.kaggle.com/enrique1500/rental-listing-ny-map

In [10]:
# Reference point used by distance().
ny_lat = 40.785091
ny_lon = -73.968285

def distance(row):
    """Straight-line distance, in degrees, from (ny_lat, ny_lon) to the row's coordinates."""
    d_lat = row['latitude'] - ny_lat
    d_lon = row['longitude'] - ny_lon
    return np.sqrt(d_lat**2 + d_lon**2)

def add_city_distance(df):
    """Return a copy of ``df`` with ``city_distance`` = distance(row) for every row."""
    data = df.copy()
    data['city_distance'] = data.apply(distance, axis=1)
    return data

manager_id로 집계하여 여러가지 파생변수 생성

In [11]:
def calculate_manager_id_distance(df):
    """Distance of each listing from its manager's median location.

    Adds ``mid_distance``: the Euclidean distance (in degrees) between the
    row's (longitude, latitude) and the median (longitude, latitude) of all
    rows with the same manager_id. Returns a new frame; ``df`` is untouched.
    """
    data = df.copy()

    centers = data.groupby(by='manager_id')[['longitude','latitude']].median().reset_index()
    centers = centers.rename(columns={'longitude': 'mid_long_median','latitude':'mid_lati_median'})
    data = data.merge(centers, on='manager_id', how='left')

    squared_dx = (data['mid_long_median'] - data['longitude'])**2
    squared_dy = (data['mid_lati_median'] - data['latitude'])**2
    data['mid_distance'] = np.sqrt(squared_dx + squared_dy)

    # The per-manager medians are only intermediates.
    return data.drop(['mid_long_median', 'mid_lati_median'], axis=1)

**merge magic timestamp feature**<br>
이번 Kaggle에서는 Image 분석을 위하여 각 방의 Image를 제공하였는데 제공된 Image의 폴더 생성시간이 magic(leak) feature로 누출됨<br>
0.001에서도 점수가 갈리는 대회에서 0.015점 정도가 향상되어 막판에 대회가 매우 치열해짐<br>
https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/discussion/31870

In [12]:
def image_timestamp(df):
    """Left-join the contents of 'listing_image_time.csv' onto ``df`` by listing_id.

    Rows without a match in the CSV get NaN in the joined columns.
    Returns a new frame; ``df`` is not modified.
    """
    image_times = pd.read_csv('listing_image_time.csv')
    return df.copy().merge(image_times, on='listing_id', how='left')

manager_id로 만든 파생변수

In [14]:
def manager_id_diff_median_price(df):
    """Attach per-manager price medians and listing counts.

    Adds, in this order:
      * ``mid_price_median``          - median price per manager_id
      * ``mid_count``                 - number of rows per manager_id
      * ``mid_disp_price_median``     - median price per (manager_id, display_address)
      * ``mid_building_price_median`` - median price per (manager_id, building_id)

    Group and merge keys are passed as lists: a tuple is interpreted by
    current pandas as one single key rather than several columns. The count
    is built with ``groupby().size()`` because the column names produced by
    ``value_counts().reset_index()`` differ between pandas versions, which
    made the old rename unreliable.

    Returns a new frame; ``df`` is not modified.
    """
    data = df.copy()

    def _with_group_median(frame, keys, out_col):
        # Median price per key combination, joined back onto every row.
        medians = frame.groupby(by=keys)['price'].median().reset_index()
        medians = medians.rename(columns={'price': out_col})
        return frame.merge(medians, on=keys, how='left')

    data = _with_group_median(data, ['manager_id'], 'mid_price_median')

    manager_counts = data.groupby(by='manager_id').size().reset_index(name='mid_count')
    data = data.merge(manager_counts, on='manager_id', how='left')

    data = _with_group_median(data, ['manager_id', 'display_address'], 'mid_disp_price_median')
    data = _with_group_median(data, ['manager_id', 'building_id'], 'mid_building_price_median')

    #data['price_ratio_mid_price_median'] = data['price'] / data['mid_price_median']
    #data['price_ratio_mid_disp_price_median'] = data['price'] / data['mid_disp_price_median']
    #data['price_ratio_mid_building_price_median'] = data['price'] / data['mid_building_price_median']
    #data['diff_manager_price'] = data['manager_price_median'] - data['price']

    return data

문제를 제출한 renthop site에 방 타입별로 얼마나 비싼지가 나와있어서 feature로 추가함<br>
https://www.renthop.com/nyc/apartments-for-rent

In [15]:
def add_display_roomtype_price(df):
    """Compare each price with the median of same-address, same-room-type listings.

    Listings are grouped by display_address, fea_studio, fea_loft, bedrooms
    capped at 4 and bathrooms capped at 5. Adds:
      * ``disp_roomtype_price_median`` - median price of the group
      * ``how_many_expansive``         - price minus that median

    Keys are passed as a list because a tuple is interpreted by current
    pandas as one single key. Returns a new frame; ``df`` is not modified.
    """
    data = df.copy()
    keys = ['display_address','fea_studio','fea_loft','bedroom_cat','bathroom_cat']

    # Temporary capped room counts, used only as grouping keys.
    data['bedroom_cat'] = data['bedrooms'].clip(upper=4)
    data['bathroom_cat'] = data['bathrooms'].clip(upper=5)

    group_median = data.groupby(by=keys)['price'].median().reset_index()
    group_median = group_median.rename(columns={'price': 'disp_roomtype_price_median'})

    data = data.merge(group_median, on=keys, how='left')
    data['how_many_expansive'] = data['price'] - data['disp_roomtype_price_median']

    return data.drop(['bedroom_cat', 'bathroom_cat'], axis=1)

위도 경도 +-0.005 지역을 정하여 각 지역마다 가격의 시세를 정하였고 그 지역이 인기 있는 지역인지 feature를 추가함<br>
CV Score는 좋았지만 LB Score가 좋지 않아 결국 사용하지 않음(overfitting 됨)<br>
**LB Score는 Kaggle에서 Submission을 하였을 때 나오는 Score를 말함**

In [16]:
def add_market_price(df):
    """Attach neighbourhood price statistics to every listing.

    For each distinct (longitude, latitude) the neighbourhood is every row
    of ``df`` whose latitude and longitude both lie strictly within 0.005 of
    it. Adds:
      * ``longi_lati``          - "<longitude>_<latitude>" key
      * ``market_price``        - median price in the neighbourhood
      * ``popularity_place``    - (medium + 2*high) / (low + medium + high)
                                  listing counts, 0 when there are none
      * ``place_low_median`` / ``place_medium_median`` / ``place_high_median``
                                - median price per interest level (NaN if none)

    The statistics are collected as records and turned into a frame once,
    replacing the removed ``.ix`` indexer and ``DataFrame.set_value``.

    NOTE: the cost is (distinct coordinates) x (rows); on the full data set
    this is slow.

    Returns a new frame; ``df`` is not modified.
    """
    data = df.copy()
    data['longi_lati'] = data.apply(lambda row: str(row['longitude'])+ str("_") + str(row['latitude']),axis=1)
    unique_places = data.drop_duplicates(subset='longi_lati')

    stat_cols = ['market_price','popularity_place','place_low_median','place_medium_median','place_high_median']
    all_lat = data['latitude'].values
    all_lon = data['longitude'].values

    records = []
    for key, lati, longi in zip(unique_places['longi_lati'],
                                unique_places['latitude'],
                                unique_places['longitude']):
        nearby = ((all_lat > lati-0.005) & (all_lat < lati+0.005)
                  & (all_lon < longi+0.005) & (all_lon > longi-0.005))
        near_place = data.loc[nearby, ['price', 'interest_level']]

        low_prices = near_place.loc[near_place['interest_level'] == 'low', 'price']
        medium_prices = near_place.loc[near_place['interest_level'] == 'medium', 'price']
        high_prices = near_place.loc[near_place['interest_level'] == 'high', 'price']

        low_cnt, medi_cnt, high_cnt = len(low_prices), len(medium_prices), len(high_prices)
        all_cnt = low_cnt + medi_cnt + high_cnt

        # Rows without a low/medium/high label (e.g. test rows) do not count.
        popular = 0.0
        if all_cnt != 0:
            popular = (medi_cnt + 2*high_cnt) / float(all_cnt)

        records.append({
            'longi_lati': key,
            'market_price': near_place['price'].median(),
            'popularity_place': popular,
            'place_low_median': low_prices.median(),
            'place_medium_median': medium_prices.median(),
            'place_high_median': high_prices.median(),
        })

    place_stats = pd.DataFrame(records, columns=['longi_lati'] + stat_cols)
    return data.merge(place_stats, on='longi_lati', how='left')

아래 Kernel을 참고하여 Manager의 Level을 추가함<br>
https://www.kaggle.com/guoday/cv-statistics-better-parameters-and-explaination/notebook/notebook

In [17]:
import random

def manager_level(train, test, seed=None):
    """Encode each manager's share of low / medium / high interest listings.

    Train rows get out-of-fold shares: the rows are shuffled and cut into 5
    folds, and each held-out row receives the label shares of its manager
    computed from the other 4 folds (NaN when the manager has no labelled
    rows there). Test rows get the shares computed from all training rows
    (NaN for managers that are unknown or have no low/medium/high label).

    Parameters
    ----------
    train : pandas.DataFrame
        Needs ``manager_id`` and ``interest_level`` ('low'/'medium'/'high').
    test : pandas.DataFrame
        Needs ``manager_id``.
    seed : int, optional
        Seed for the fold shuffle. With the default ``None`` the global
        ``random`` module state is used, as before, so the train-side columns
        differ between runs unless the caller seeds ``random`` beforehand.

    Returns
    -------
    tuple of pandas.DataFrame
        Copies of ``train`` and ``test`` with ``manager_level_low``,
        ``manager_level_medium`` and ``manager_level_high`` added.
    """
    train_df = train.copy()
    test_df = test.copy()

    level_slot = {'low': 0, 'medium': 1, 'high': 2}
    no_share = (np.nan, np.nan, np.nan)
    n_rows = train_df.shape[0]
    n_folds = 5

    # Plain arrays: positional look-ups on them are far cheaper than
    # train_df.iloc[j] inside the loops.
    managers = train_df['manager_id'].values
    levels = train_df['interest_level'].values

    def _count_levels(rows):
        # [low, medium, high] counts per manager over the given row positions.
        counts = {manager: [0, 0, 0] for manager in managers}
        for j in rows:
            slot = level_slot.get(levels[j])
            if slot is not None:
                counts[managers[j]][slot] += 1
        return counts

    def _shares(level_counts):
        # Counts -> shares; NaN when nothing was counted (avoids division by zero).
        total = sum(level_counts)
        if total == 0:
            return no_share
        return tuple(count*1.0/total for count in level_counts)

    index = list(range(n_rows))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(index)

    out_of_fold = np.full((n_rows, 3), np.nan)
    for i in range(n_folds):
        held_out = index[int((i*n_rows)/n_folds):int(((i+1)*n_rows)/n_folds)]
        held_out_set = set(held_out)
        fold_counts = _count_levels(j for j in index if j not in held_out_set)
        for j in held_out:
            out_of_fold[j] = _shares(fold_counts[managers[j]])

    train_df['manager_level_low'] = out_of_fold[:, 0]
    train_df['manager_level_medium'] = out_of_fold[:, 1]
    train_df['manager_level_high'] = out_of_fold[:, 2]

    full_counts = _count_levels(range(n_rows))
    test_shares = [
        _shares(full_counts[manager]) if manager in full_counts else no_share
        for manager in test_df['manager_id'].values
    ]
    test_df['manager_level_low'] = [share[0] for share in test_shares]
    test_df['manager_level_medium'] = [share[1] for share in test_shares]
    test_df['manager_level_high'] = [share[2] for share in test_shares]

    return train_df,test_df

In [37]:
def category_combine(data):
    """Add pairwise concatenations of the categorical columns to ``data``.

    For every pair of names in the global ``categorical`` list (as it is on
    entry), a column ``"<first>_<second>"`` holding the two values joined by
    "_" is added, except for the pair 'display_address_street_address'.

    Side effects: ``data`` is modified in place and each new column name is
    appended to the global ``categorical`` list, so calling this again
    without resetting ``categorical`` also combines the combined columns.
    """
    base_count = len(categorical)
    for first in range(base_count):
        for second in range(first + 1, base_count):
            left, right = categorical[first], categorical[second]
            combined = left + "_" + right
            if combined != 'display_address_street_address':
                data[combined] = data[left] + "_" + data[right]
                categorical.append(combined)

    return data

In [36]:
def get_category_combine_list(ct):
    """Return ``ct`` plus the names of all pairwise combinations.

    The result is a new list: the original names followed by
    ``"<first>_<second>"`` for every pair in original order, skipping
    'display_address_street_address'. ``ct`` itself is not modified.
    """
    names = list(ct)
    combined_names = []
    for position, left in enumerate(names):
        for right in names[position + 1:]:
            pair_name = left + "_" + right
            if pair_name != 'display_address_street_address':
                combined_names.append(pair_name)

    return names + combined_names

In [35]:
# Abbreviation -> full word, applied to whole space-separated tokens.
address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south'
}

# Words that convert_display_address turns into 0/1 indicator columns.
new_address_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']

def address_map_func(s):
    """Expand known abbreviations in ``s``.

    ``s`` is split on single spaces; every token that is a key of
    ``address_map`` is replaced by its full word, and the tokens are joined
    back with single spaces.
    """
    return ' '.join(address_map.get(token, token) for token in s.split(' '))

def convert_display_address(data):
    """Normalise the address columns and add address-word indicator columns.

    ``display_address`` and ``street_address`` are stripped of non-breaking
    spaces and surrounding whitespace, lower-cased, cleared of the characters
    in the global ``remove_punct_map`` and passed through address_map_func.
    Then one 0/1 column per word in ``new_address_cols`` marks whether that
    word occurs (as a substring) in ``display_address``, and
    ``other_address`` is 1 when none of them does.

    ``data`` is modified in place and returned.
    """
    def normalise(text):
        text = text.replace("\u00a0", "").strip().lower()
        text = text.translate(remove_punct_map)
        return address_map_func(text)

    for address_col in ("display_address", "street_address"):
        data[address_col] = data[address_col].apply(normalise)

    for col in new_address_cols:
        data[col] = data['display_address'].apply(lambda x, word=col: 1 if word in x else 0)

    def no_known_word(flags):
        return 1 if flags.sum() == 0 else 0

    data['other_address'] = data[new_address_cols].apply(no_known_word, axis=1)

    return data

In [34]:
def add_rank_feature(data,col):
    """Flag rows whose ``col`` value is among the most frequent values.

    For each percentile p in (99, 98, 95, 90, 85, 80, 75, 70, 50) a 0/1
    column ``top<100-p>_<col>`` is added; it is 1 when the row's value occurs
    at least as often as the p-th percentile of all value counts.

    The threshold previously used a hard-coded 90 for every column, so all
    nine columns were identical; each column now uses its own percentile.

    ``data`` is modified in place and returned.
    """
    percentiles = [99,98,95,90,85,80,75,70,50]
    labels = ['top1','top2','top5','top10','top15','top20','top25','top30','top50']

    value_count = data[col].value_counts()

    for percentile, label in zip(percentiles, labels):
        top_column_name = label + "_" + col
        if value_count.empty:
            # Nothing to rank (empty frame or all-NaN column).
            data[top_column_name] = 0
            continue
        threshold = np.percentile(value_count.values, percentile)
        frequent_values = value_count.index[value_count.values >= threshold]
        # isin is a hashed membership test; `x in ndarray` scanned the array per row.
        data[top_column_name] = data[col].isin(frequent_values).astype('int64')

    return data

In [33]:
def designate_single_observations(df1, df2, column):
    """Flag values of ``column`` that occur only once across both frames.

    Adds ``<column>_exist_one`` to ``df1`` and ``df2`` (in place): 1 when the
    row's value appears exactly once in the two frames combined, else 0.
    Missing values are flagged 0.

    Uses ``pd.concat`` + ``value_counts`` instead of ``Series.append`` (removed
    from pandas) and a rename of column ``0`` that depended on how the
    group-size Series happened to be named.

    Returns ``(df1, df2)``.
    """
    occurrences = pd.concat([df1[column], df2[column]]).value_counts()
    new_column = column + "_exist_one"
    for frame in (df1, df2):
        # Values absent from `occurrences` (NaN) map to NaN, and NaN <= 1 is False.
        frame[new_column] = (frame[column].map(occurrences) <= 1).astype('int64')
    return df1, df2

In [32]:
def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """
    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca

    For every value of ``variable`` in ``train_df`` the encoded value is
    ``lambda * mean(target) + (1 - lambda) * prior_prob`` with
    ``lambda = 1 / (g + exp((k - size) / f))``, where ``size`` is the number
    of training rows with that value. Values not present in ``train_df`` are
    encoded as ``prior_prob``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Source of the per-value statistics.
    test_df : pandas.DataFrame
        Rows to encode.
    variable, target : str
        Column names; the output column is ``"hcc_<variable>_<target>"``.
    prior_prob : float
        Fallback / shrinkage target.
    k, f, g : numbers
        Parameters of the weighting function above.
    r_k : float, optional
        When truthy, each encoded value is multiplied by uniform noise drawn
        from [1 - r_k, 1 + r_k].
    update_df : pandas.DataFrame, optional
        Frame that receives the output column (created as NaN when missing,
        then updated by index); defaults to ``test_df``.

    Returns
    -------
    None
        The result is written into ``update_df`` in place.
    """
    hcc_name = "_".join(["hcc", variable, target])

    # A list of function names yields columns "size" and "mean"; the dict
    # renaming form is no longer supported on a SeriesGroupBy.
    grouped = train_df.groupby(variable)[target].agg(["size", "mean"])
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob

    df = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob)
    if r_k: df *= np.random.uniform(1 - r_k, 1 + r_k, len(test_df))     # Add uniform noise. Not mentioned in original paper

    if update_df is None: update_df = test_df
    if hcc_name not in update_df.columns: update_df[hcc_name] = np.nan
    update_df.update(df)
    return

In [31]:
# Translation table for str.translate that deletes every ASCII punctuation
# character. string.punctuation already contains '!', '(' and ')'. The former
# `string.punctuation.__add__(...)` calls built a new string and discarded
# it, so they had no effect and have been removed.
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

stop = stopwords.words('english')
def cleaning_text(sentence):
    """Strip digits, English stopwords and very short tokens from ``sentence``.

    Steps: every run of digits becomes a space; whitespace-separated tokens
    found in the global ``stop`` list are dropped; tokens of length <= 2 are
    dropped; the rest is joined with single spaces.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    sentence=re.sub(r'\d+',' ', sentence) #removes digits
    cleaned=' '.join([w for w in sentence.split() if not w in stop]) # removes english stopwords
    #cleaned=' '.join([w for w , pos in pos_tag(cleaned.split()) if (pos == 'NN' or pos=='JJ' or pos=='JJR' or pos=='JJS' )])
    #selecting only nouns and adjectives
    cleaned=' '.join([w for w in cleaned.split() if not len(w)<=2 ]) #removes tokens of one or two characters
    cleaned=cleaned.strip()
    return cleaned

def convert_description(data):
    """Add a cleaned, stemmed ``desc`` column and its word count.

    ``desc`` is ``description`` with non-breaking spaces removed, trimmed,
    lower-cased, two markup fragments removed, punctuation deleted (global
    ``remove_punct_map``), passed through cleaning_text and finally stemmed
    token by token. ``desc_wordcount`` is the number of tokens in ``desc``.

    The stemmer is now applied to each token: ``SnowballStemmer.stem`` stems
    a single word, so passing the whole text treated it as one word and
    affected at most its final suffix.

    ``data`` is modified in place and returned.
    """
    stem_word = nltk.stem.SnowballStemmer('english')

    def normalise(text):
        text = text.replace("\u00a0", "").strip().lower()
        text = text.replace('<p><a  website_redacted ', '')
        text = text.replace('!<br /><br />', '')
        text = text.translate(remove_punct_map)
        text = cleaning_text(text)
        return ' '.join(stem_word.stem(token) for token in text.split())

    data['desc'] = data['description'].apply(normalise)

    data["desc_wordcount"] = data["desc"].apply(str.split).apply(len)

    return data

In [30]:
# Translation table for str.translate that deletes every ASCII punctuation
# character. string.punctuation already contains '!', '(' and ')'. The former
# `string.punctuation.__add__(...)` calls built a new string and discarded
# it, so they had no effect and have been removed.
# NOTE(review): the description-cleaning cell defines the same
# remove_punct_map; consider keeping a single definition.
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

def replace_similar_word(s):
    """Collapse spelling variants and near-synonyms into one token each.

    The substitutions are plain substring replacements applied one after the
    other in the order listed, so an earlier replacement can influence what a
    later one sees (e.g. "training" is handled before "train").

    NOTE(review): "reduced_fee" and "subway" each appear twice in the table
    (the second occurrence cannot match anything); possibly "reduced fee"
    was intended for one of them -- confirm.
    """
    substitutions = (
        ("no fee", "nofee"),
        ("no-fee", "nofee"),
        ("no  fee", "nofee"),
        ("no_fee", "nofee"),

        ("reduced_fee", "lowfee"),
        ("low_fee", "lowfee"),
        ("reduced_fee", "lowfee"),
        ("low fee", "lowfee"),

        ("hardwood", "parquet"),

        ("concierge", "doorman"),
        ("housekeep", "doorman"),
        ("in_super", "doorman"),

        ("pre_war", "prewar"),
        ("pre war", "prewar"),
        ("pre-war", "prewar"),

        ("laundry", "lndry"),

        ("gym", "health"),
        ("fitness", "health"),
        ("training", "health"),

        ("train", "transport"),
        ("subway", "transport"),

        ("subway", "transport"),

        ("twenty four hour", "24"),
        ("24/7", "24"),
        ("24hr", "24"),
        ("24-hour", "24"),
        ("24hour", "24"),
        ("24 hour", "24"),
        ("common", "cm"),

        ("bicycle", "bike"),

        ("private", "pv"),
        ("decorative", "deco"),
        ("onsite", "os"),
        ("outdoor", "od"),
        ("ss appliances", "stainless"),
    )

    x = s
    for old, new in substitutions:
        x = x.replace(old, new)
    return x
    

# NOTE(review): `stop` and cleaning_text are also defined in the
# description-cleaning cell of this notebook; whichever cell runs last wins.
# Keep the two in sync or consolidate them.
stop = stopwords.words('english')
def cleaning_text(sentence):
    """Strip digits, English stopwords and very short tokens from ``sentence``.

    Steps: every run of digits becomes a space; whitespace-separated tokens
    found in the global ``stop`` list are dropped; tokens of length <= 2 are
    dropped; the rest is joined with single spaces.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    sentence=re.sub(r'\d+',' ', sentence) #removes digits
    cleaned=' '.join([w for w in sentence.split() if not w in stop]) # removes english stopwords
    #cleaned=' '.join([w for w , pos in pos_tag(cleaned.split()) if (pos == 'NN' or pos=='JJ' or pos=='JJR' or pos=='JJS' )])
    #selecting only nouns and adjectives
    cleaned=' '.join([w for w in cleaned.split() if not len(w)<=2 ]) #removes tokens of one or two characters
    cleaned=cleaned.strip()
    return cleaned

def convert_feature(data):
    """Turn the ``features`` lists into one cleaned, stemmed text column.

    Adds ``num_features`` (length of the original list) and ``feature``: the
    list joined with ', ', with non-breaking spaces removed, trimmed,
    lower-cased, punctuation deleted (global ``remove_punct_map``), passed
    through cleaning_text and replace_similar_word, and stemmed token by
    token.

    The stemmer is now applied to each token: ``SnowballStemmer.stem`` stems
    a single word, so passing the whole text treated it as one word and
    affected at most its final suffix. This changes the tokens (and therefore
    the vocabulary of any vectorizer fitted on ``feature``).

    ``data`` is modified in place and returned.
    """
    stem_word = nltk.stem.SnowballStemmer('english')

    def normalise(text):
        text = text.replace("\u00a0", "").strip().lower()
        text = text.translate(remove_punct_map)
        text = cleaning_text(text)
        text = replace_similar_word(text)
        return ' '.join(stem_word.stem(token) for token in text.split())

    data['feature'] = data['features']
    data["num_features"] = data["feature"].apply(len)
    data['feature'] = data['feature'].apply(lambda x: ', '.join(x)).apply(normalise)
    #data["feature_wordcount"] = data["feature"].apply(str.split).apply(len)

    return data

In [29]:
def separate_train_test(data, n_train=None):
    """Split a stacked frame back into its train and test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Train rows followed by test rows (as produced by merge_train_test).
    n_train : int, optional
        Number of leading rows that belong to the training set. Defaults to
        the global ``train_row``, which is what the function always used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``; both are independent copies, and the
        target / prediction columns ('interest_level', 'pred_0', 'pred_1',
        'pred_2') are removed from ``test_df`` when present.
    """
    if n_train is None:
        n_train = train_row

    # Positional split with explicit copies: `.ix` no longer exists, and
    # deleting columns from a slice raised SettingWithCopyWarning.
    train_df = data.iloc[:n_train].copy()
    test_df = data.iloc[n_train:].copy()

    del_column = ['interest_level','pred_0', 'pred_1', 'pred_2']
    present = [col for col in del_column if col in test_df.columns]
    test_df = test_df.drop(present, axis=1)

    return train_df,test_df

In [28]:
def merge_train_test(train_df,test_df):
    """Stack ``train_df`` on top of ``test_df`` and renumber the rows from 0."""
    return pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [27]:
def drop_column(data,col_list):
    """Return ``data`` without the columns named in ``col_list`` (``data`` is not modified)."""
    return data.drop(col_list, axis='columns')

### Preprocessing

In [71]:
# Work on copies so the frames loaded in the "Data Read" section stay untouched.
train_df = original_train.copy()
test_df = original_test.copy()

In [72]:
# Number of training rows; separate_train_test reads this global to split
# the stacked frame again.
train_row = train_df.shape[0]
print(train_row)

49352


In [73]:
# Keep the test listing ids as an array.
listing_id = test_df.listing_id.values

#### add feature

In [74]:
# Stack train and test so the features below are computed over both together.
merged_df = merge_train_test(train_df,test_df)

In [78]:
# Base categorical columns. add_feature -> category_combine appends the
# pairwise combination names to this list, so it must be reset here before
# every call to add_feature.
categorical = ["display_address", "manager_id", "building_id","street_address"]
merged_df = add_feature(merged_df)

#### place price feature의 결측값 채우기 (Input Data Set에 place 정보가 추가되어 있을 경우에만 사용함)

In [49]:
# Drop the "<longitude>_<latitude>" join key created by add_market_price.
del merged_df['longi_lati']

In [50]:
# Fill missing neighbourhood medians with the median of the known values of
# the same column. Series.median() skips NaN, so no explicit mask is needed,
# and fillna replaces the `.ix` / `.loc` pair (`.ix` no longer exists).
place_high_median_median = merged_df['place_high_median'].median()
merged_df['place_high_median'] = merged_df['place_high_median'].fillna(place_high_median_median)

place_medium_median_median = merged_df['place_medium_median'].median()
merged_df['place_medium_median'] = merged_df['place_medium_median'].fillna(place_medium_median_median)

place_low_median_median = merged_df['place_low_median'].median()
merged_df['place_low_median'] = merged_df['place_low_median'].fillna(place_low_median_median)

#### 빌딩 ID Merge - 성능이 좋아지지 않아 마지막에는 사용하지 않음

In [75]:
# Rows whose building_id is the placeholder '0', before the fill-in below.
# (`.loc` replaces the removed `.ix` indexer.)
merged_df.loc[merged_df['building_id']=='0'].shape

(20664, 19)

In [76]:
# Replace building_id with the values from the "filled" input files,
# matched on listing_id.
building_id_train = pd.read_json('input/filled_building_id_train.json')
building_id_test = pd.read_json('input/filled_building_id_test.json')
building_merge = merge_train_test(building_id_train,building_id_test)
# NOTE(review): this NaN assignment has no lasting effect -- the original
# column becomes building_id_x after the merge and is deleted below.
merged_df.loc[merged_df['building_id']=='0','building_id']=np.nan

temp = building_merge[['listing_id','building_id']]

# Both frames carry building_id, so the merge yields building_id_x (old)
# and building_id_y (filled); keep only the filled one.
merged_df = merged_df.merge(temp,on='listing_id',how='left')
del merged_df['building_id_x']
merged_df.rename(columns={'building_id_y':'building_id'},inplace=True)

del temp,building_id_train,building_id_test,building_merge

In [77]:
# Rows still carrying the placeholder building_id '0' after the fill-in.
# (`.loc` replaces the removed `.ix` indexer.)
merged_df.loc[merged_df['building_id']=='0'].shape

(4507, 19)

#### (not used) flag category values that occur only once and add the flag as a new feature

In [44]:
# Split the stacked frame back into train / test (uses the global train_row).
train_df, test_df = separate_train_test(merged_df)

In [72]:
# Special designation for building_ids, manager_ids, display_address with only 1 observation
# Adds one "<col>_exist_one" 0/1 column per listed column to both frames.
for col in ('building_id', 'manager_id', 'display_address','street_address'):
    train_df, test_df = designate_single_observations(train_df, test_df, col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### manager level feature (so-called "CV statistics"; almost the same idea as HCC encoding)

In [79]:
# Split the stacked frame back into train / test (uses the global train_row).
train_df, test_df = separate_train_test(merged_df)

In [80]:
# Adds manager_level_low / _medium / _high to both frames.
# NOTE: manager_level shuffles with the `random` module and no seed is set in
# this notebook, so the train-side values change from run to run.
train_df,test_df = manager_level(train_df,test_df)

#### coordinate feature

In [None]:
# Adds num_rho / num_phi and the rotated-coordinate columns to both frames.
# NOTE(review): this cell has no execution count -- it appears not to have
# been run for the saved outputs.
train_df, test_df = operate_on_coordinates(train_df, test_df)

#### description feature

In [None]:
# Adds the description statistics (num_cap_share, num_redacted, ...).
# NOTE(review): this cell has no execution count -- it appears not to have
# been run for the saved outputs.
train_df, test_df = operate_description_featre(train_df, test_df)

#### newyork cluster

In [None]:
# Re-stack train and test before clustering the coordinates.
merged_df = merge_train_test(train_df,test_df)

In [None]:
# Cluster on coordinates only, then join the label back by listing_id
# (cluster_latlon returns the rows in a different order).
merged_only_long_lati = merged_df[['longitude','latitude','listing_id']]
merged_only_long_lati = cluster_latlon(14,merged_only_long_lati)
merged_df = merged_df.merge(merged_only_long_lati[['listing_id','cluster_14']],on='listing_id',how='left')

#### description

In [81]:
# Re-stack train and test (now including the manager_level columns).
merged_df = merge_train_test(train_df,test_df)

In [82]:
# Adds the cleaned `desc` text column and `desc_wordcount`.
merged_df = convert_description(merged_df)

###### description 변환은 현재 사용하지 않음 

In [None]:
# Term weights for the cleaned description text (at most 50 uni/bi-grams
# that occur in at least 20 documents).
tfidfdesc=TfidfVectorizer(min_df=20, max_features=50, lowercase =True,
                        analyzer='word', ngram_range=(1, 2), use_idf=False,smooth_idf=False, 
    sublinear_tf=True, stop_words = 'english')

desc_sparse = tfidfdesc.fit_transform(merged_df['desc'])

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); support both.
if hasattr(tfidfdesc, 'get_feature_names_out'):
    desc_sparse_cols = list(tfidfdesc.get_feature_names_out())
else:
    desc_sparse_cols = tfidfdesc.get_feature_names()
desc_col_list = ['desc'+'_'+desc_col for desc_col in desc_sparse_cols]

# Reuse merged_df's index so the column-wise concat aligns row by row even
# when merged_df does not have a fresh 0..n-1 index.
desc_sparse_df = pd.DataFrame(desc_sparse.toarray(),columns=desc_col_list,index=merged_df.index)
merged_df = pd.concat([merged_df,desc_sparse_df],axis=1)

In [92]:
# Remove every column whose name contains 'desc_', except desc_wordcount.
del_col = [
    col for col in merged_df.columns
    if col.find('desc_') != -1 and col != 'desc_wordcount'
]

for col in del_col:
    del merged_df[col]

#### feature 특성 변환

In [83]:
# Bag-of-words counts over the cleaned `feature` text: one `fea_<term>`
# column for each of the 200 most frequent terms.
merged_df = convert_feature(merged_df)

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect_sparse_1 = c_vect.fit_transform(merged_df['feature'])

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); support both.
if hasattr(c_vect, 'get_feature_names_out'):
    c_vect_sparse1_cols = list(c_vect.get_feature_names_out())
else:
    c_vect_sparse1_cols = c_vect.get_feature_names()
feature_col_list = ['fea'+'_'+desc_col for desc_col in c_vect_sparse1_cols]

# Reuse merged_df's index so the column-wise concat aligns row by row even
# when merged_df does not have a fresh 0..n-1 index.
feature_sparse_df = pd.DataFrame(c_vect_sparse_1.toarray(),columns=feature_col_list,index=merged_df.index)

merged_df = pd.concat([merged_df,feature_sparse_df],axis=1)

#### add display roomtype price

In [84]:
# Needs the fea_studio / fea_loft columns created by the feature
# vectorisation cell above.
merged_df = add_display_roomtype_price(merged_df)

In [85]:
# The raw and cleaned description texts are no longer needed.
del merged_df['desc'],merged_df['description']

In [86]:
# The raw feature lists and the cleaned feature text are no longer needed.
del merged_df['feature'],merged_df['features']

#### factorize

In [87]:
class ValueTooSmallError(Exception):
    """Signals that the categorical column list has not been expanded yet.

    Kept for compatibility. ``__str__`` used to read ``self.value``, which
    was never set, so printing the exception raised AttributeError.
    """

    def __init__(self, value=None):
        super().__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)


# Make sure `categorical` holds the base columns plus their pairwise
# combinations, whether or not add_feature has already been run in this
# kernel. A plain check replaces the raise-and-catch control flow.
try:
    categorical
except NameError:
    categorical = ["display_address", "manager_id", "building_id","street_address"]

if len(categorical) < 5:
    categorical = get_category_combine_list(categorical)

In [88]:
# Replace every categorical column by integer codes. The encoder is fitted
# on the stacked train+test frame, so both parts share one code table.
for feat in categorical:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(merged_df[feat].values))
    merged_df[feat] = lbl.transform(list(merged_df[feat].values))

#### change interest_level

In [89]:
# Refresh the train row count that separate_train_test reads.
train_row = train_df.shape[0]
print(train_row)

49352


In [90]:
# Final split of the fully featured frame into train / test.
train_df, test_df = separate_train_test(merged_df)

In [91]:
# Encode the target labels as integers: low=0, medium=1, high=2.
train_df = train_df.replace({"interest_level": {"low": 0, "medium": 1, "high": 2}})

#### drop column

In [92]:
# Drop the columns that are not written to the output files
# (timestamps, the year and the photo lists).
train_drop_col_list = ['date', 'created','year','photos']
test_drop_col_list = ['date', 'created','year','photos']
train_df = drop_column(train_df,train_drop_col_list)
test_df = drop_column(test_df,test_drop_col_list)

In [93]:
# Remove every column whose name contains 'longi_' from both frames,
# printing each name as it is removed.
longi_cols = [col for col in train_df.columns if col.find('longi_') != -1]
for col in longi_cols:
    print(col)
    del train_df[col]
    del test_df[col]

In [94]:
# Target vector (integer-encoded interest level).
y_train = train_df.interest_level.values

In [95]:
# Remove the target from the feature frame.
del train_df['interest_level']

#### model save

In [96]:
# Final feature matrices.
x_train = train_df.copy()
x_test = test_df.copy()

In [97]:
# Sanity check: train and test must have the same number of columns.
x_train.shape

(49352, 279)

In [98]:
# Sanity check: train and test must have the same number of columns.
x_test.shape

(74659, 279)

In [99]:
# Persist the training features for the separate training notebooks.
x_train.to_csv('x_train_14th.csv',index=False)

In [100]:
# Persist the test features.
x_test.to_csv('x_test_14th.csv',index=False)

In [101]:
# Persist the target vector as a one-column CSV.
pd.DataFrame(y_train,columns=['interest_level']).to_csv('y_train_14th.csv',index=False)

In [102]:
# Listing ids of the test rows, in the same order as x_test.
listing_id = x_test.listing_id.values

In [103]:
# Persist the test listing ids as a one-column CSV.
pd.DataFrame(listing_id,columns = ['listing_id']).to_csv('test_listing_14th.csv',index=False)