In [1]:
import pandas as pd
import numpy as np
import re
import itertools as it
from time import time

In [2]:
train=pd.read_json('train.json')
train['listing_id']=train['listing_id'].apply(str)
test=pd.read_json('test.json')
test['listing_id']=test['listing_id'].apply(str)

### Gather and observe all original features

In [215]:
feature_total=[]
train['features'].apply(lambda x: feature_total.append(x))
feature_total=list(it.chain.from_iterable(feature_total))
len(feature_total)

267906

In [216]:
uniq_feature_total=set(feature_total)
len(uniq_feature_total)

1556

In [217]:
list(uniq_feature_total)[:10]

['Air conditioning',
 'fitness facility',
 '1st',
 "** HOLY DEAL BATMAN! * SPRAWLING 3BR HOME * COOK'S KITCHEN * 2 BLKS TO WATERFRONT * BEDFORD L STOP **",
 'private roof deck',
 'Children’s Playroom.',
 'Yoga Room',
 'All Utilities included',
 'children’s playroom',
 'heated lap pool']

#### Findings:
1. For many listings, the features are linked with \*, which should be separated.
2. The uppercase letters should be tranfered to lowercase, which will be taken care of later. 

In [3]:
def feature_star_sep(feature_list):
    '''
    Seperate feature text with * or • as separator
    '''
    new_list=[]
    for feature in feature_list:
        if feature[:2]=='**':
            new=feature[3:-3]
            new_list+new.split(" * ")
        elif feature[:1]=='•':
            new=feature[2:]
            new_list+new.split(" • ")            
        else:
            new_list.append(feature)
            
    return new_list

In [4]:
train['features']=train['features'].apply(feature_star_sep)
test['features']=test['features'].apply(feature_star_sep)

### Adopt DecisionTree on feature text

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
## Code copied from @sudalairajkumar 
vec=CountVectorizer(stop_words='english', max_features=200)
train['features_new'] = train["features"].apply(lambda y: " ".join(["_".join(x.split(" ")).lower() for x in y]))
tr_sparse = vec.fit_transform(train["features_new"])
feature_names=vec.get_feature_names()

In [7]:
test['features_new'] = test["features"].apply(lambda y: " ".join(["_".join(x.split(" ")).lower() for x in y]))

In [184]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss

In [185]:
target_num_map = {'high':0, 'medium':1, 'low':2}
features=tr_sparse.toarray()
labels=train['interest_level'].apply(lambda x: target_num_map[x]).as_matrix()

In [186]:
clf=DecisionTreeClassifier(max_depth=5)

In [190]:
cv=StratifiedShuffleSplit(n_splits=3, test_size=0.3)

for train_idx, test_idx in cv.split(features, labels): 
    features_train,labels_train = features[train_idx],labels[train_idx]
    features_test,labels_test = features[test_idx],labels[test_idx]
    clf.fit(features_train,labels_train)
    print("log loss:",(-1)*round(log_loss(labels_test,clf.predict_proba(features_test)),3))
    
    ## Print out features with high importance
    print('high importance features:')
    for idx in np.where(clf.feature_importances_>0.05)[0]:
        print("  ",feature_names[idx],round(clf.feature_importances_[idx],3))
        

log loss: -0.746
high importance features:
   doorman 0.146
   hardwood_floors 0.262
   laundry_in_building 0.092
   no_fee 0.096
   reduced_fee 0.121
   war 0.065
log loss: -0.766
high importance features:
   doorman 0.152
   fitness_center 0.082
   hardwood_floors 0.138
   laundry_in_building 0.072
   no_fee 0.282
   reduced_fee 0.116
log loss: -0.757
high importance features:
   doorman 0.151
   fitness_center 0.052
   hardwood_floors 0.133
   laundry_in_building 0.071
   no_fee 0.279
   reduced_fee 0.123
   war 0.06


### Check important features

Some important features:
- hardwood floors
- doorman
- reduced fee
- no fee

Other candidates:
- war
- laundry
- fitness/gym

In [191]:
feature_total=[]
train['features'].apply(lambda x: feature_total.append(x))
feature_total=list(it.chain.from_iterable(feature_total))
uniq_feature_total=list(set(feature_total))

In [192]:
def re_search(key):
    '''
    Present all features with specific re pattern
    '''
    result=[]
    my_reg=r""+key
    for item in uniq_feature_total:
        if re.compile(my_reg ,re.IGNORECASE).search(item)!=None:
            result.append(item)
    return result

In [193]:
# Check all text including 'hardwood'
re_search('hardwood')

['boating hardwood floors',
 'Hardwood Floors',
 'Hardwood',
 'Polished Hardwood Floors',
 'HARDWOOD',
 'HARDWOOD FLOORS',
 'and stainless steel appliances with a dishwasher well maintained hardwood floors add to the warmth and modern feel of the space.',
 'Hardwood floors',
 'NO FEE HARDWOOD FLOORS',
 'Hardwood floor',
 'Exposed BrickHardwood Floors MicrowaveRenovated open view Excellent',
 'Hardwood Floor',
 'hardwood floors',
 'Hardwood Flooring',
 'Hardwood Flooring Throughout',
 'Redwood Hardwood Floor',
 'hardwood  floors']

In [194]:
# Check all text including 'doorman'
re_search('doorman')

['24 hr doorman',
 '24 Hour Doorman Concierge',
 '24hr doorman',
 '24 Hour Doorman',
 '24-hour doorman',
 'Full-time Doorman',
 'NO FEE...LUXURY DOORMAN BUILDING^^^RIVER VIEWS',
 '24hr Doorman',
 'virtual doorman',
 '24 hour doorman',
 '24/7 Doorman Concierge',
 'Twenty-four-hour concierge and doorman',
 'Full-time doorman',
 '24/7 Full-Time Doorman Concierge',
 'Part-time doorman',
 '24/7 Doorman',
 'Doorman',
 '24HR Doorman',
 'doorman',
 '24hr white-gloved doorman',
 'FT Doorman',
 'Virtual Doorman',
 '24-hour concierge and doorman',
 '24/7 DOORMAN',
 'Twenty-four hour concierge and doorman']

In [196]:
# Check all text including 'fee'
re_search('fee')

['No application fee!',
 'NO FEE!!',
 'NO FEE!!!Roof Deck',
 'One Month Fee',
 'BROKER FEE.',
 'Owner Occupied - 3 family townhouse - no realtor fees - this beautiful apt is offered below market rate',
 '1/2 Month Fee',
 "No Broker's Fees",
 'LIMITED TIME - NO FEE',
 'No Broker Fee',
 'NO FEE...LUXURY DOORMAN BUILDING^^^RIVER VIEWS',
 'No fee',
 'NO FEE!!!',
 'No Fee!',
 'NO FEE!',
 'Low Fee',
 'Reduced Fee',
 'Parking spot with additional fee',
 'No Fee',
 'NO FEE',
 'No Fee!!!',
 'FREE MONTH & NO-FEE',
 'and stainless steel appliances with a dishwasher well maintained hardwood floors add to the warmth and modern feel of the space.',
 'Dogs under 20 Lbs.                Parking ( additional Fee)',
 'Low fee and great space too!',
 'NO FEE HARDWOOD FLOORS',
 'No Fee or One Month Free',
 'Accepts Credit Cards (Fee Applies)',
 'No broker fee!',
 'NO FEE Specialist! Harry Ego 718.413.8270 hego@citihabitats.com',
 "1 mo broker's fee 18 mo lease",
 'One month Fee',
 'NO BROKER FEE! NO FEE',


In [197]:
# Extract no fee
re_search('no\s*\w*\s*fee')

['No application fee!',
 'NO FEE!!',
 'NO FEE!!!Roof Deck',
 'Owner Occupied - 3 family townhouse - no realtor fees - this beautiful apt is offered below market rate',
 'LIMITED TIME - NO FEE',
 'No Broker Fee',
 'NO FEE...LUXURY DOORMAN BUILDING^^^RIVER VIEWS',
 'No fee',
 'NO FEE!!!',
 'No Fee!',
 'NO FEE!',
 'No Fee',
 'NO FEE',
 'No Fee!!!',
 'NO FEE HARDWOOD FLOORS',
 'No Fee or One Month Free',
 'No broker fee!',
 'NO FEE Specialist! Harry Ego 718.413.8270 hego@citihabitats.com',
 'NO BROKER FEE! NO FEE',
 'ONLY 1st and Security**Heat and Hot water included**NO FEE**732-330-4737',
 'no fee']

In [198]:
# Extract low fee
re_search('reduce|low\sfee')

['Low Fee',
 'Reduced Fee',
 'Low fee and great space too!',
 'Low Fee Listing',
 'LOW FEE',
 'Reduced fee while it lasts!',
 '!!!!LOW FEE!!!!',
 'reduced fee while it lasts!']

In [199]:
# Check all text including 'laundry'
re_search('laundry')

['laundry & housekeeping',
 'Elevator & Laundry',
 'Laundry in building',
 'laundry / dry-cleaning and auto care',
 'laundry room',
 'laundry hookup',
 'laundry in building',
 'laundry room as well as housekeeping & dry-cleaning services',
 'On-site Laundry',
 'Laundry in Building!',
 'laundry & housekeeping • Marc Club includes a cinema room',
 'Laundry On Floor',
 'elevator & Laundry',
 'ELEVATOR/LAUNDRY/ SO CLOSE TO THE 6 $2450!!',
 'Garage Fitness Facility Laundry Room Valet Lounge Billiards Room Rooftop Deck WiFi Access',
 'Laundry facility on every floor',
 'Laundry In Unit',
 'laundry in bldg',
 'Laundry Room',
 'Private laundry room on every floor',
 'Valet Laundry',
 'Laundry.',
 'Spotless Laundry',
 'Laundry in Building',
 'laundry in bldg.',
 'On-site laundry',
 'Laundry in Some Units',
 'Laundry room on every floor. Health club',
 'Laundry',
 'Laundry room',
 'Elevator & LAUNDRY',
 'air conditioned laundry facility',
 'Laundry on every floor',
 'laundry in basement',
 'Laun

In [200]:
# Extract war and exclude other keyword with 'war' such as warmth and wardrobe
re_search('war\Z|war\s')

['Post War',
 'Post-War',
 'PRE-WAR',
 'Pre-war Charm',
 'Pre-War brand new renovations',
 'Prewar',
 'Postwar',
 'Pre-War Details',
 'Pre-War',
 'prewar',
 '* MARVELOUS MIDTOWN GEM * SPACIOUS SUNDRENCHED STUDIO * STUNNING PREWAR DETAILS * CATS OK **',
 'postwar',
 'Pre War',
 'Pre-War small dogs']

In [201]:
# Check all text including 'fitness' or 'gym'
re_search('fitness|gym')

['fitness facility',
 'Fully equipped fitness center with studio for classes',
 'Local Gym Discount',
 'Two Level Fitness Center',
 'Gym Discount',
 'fitness center',
 'State-of-the-Art Cardio and Fitness Club',
 'Gym In Building',
 'Residents-only fitness center and aerobic room professionally outfitted with a full complement of strength and cardio-training equipment',
 'Gym',
 'gym',
 'Fitness Center and health club',
 'Fully-equipped Club fitness center',
 'Concierge service. Fitness center',
 'Garage Fitness Facility Laundry Room Valet Lounge Billiards Room Rooftop Deck WiFi Access',
 'Full size gym',
 'Gym/Fitness',
 'free gym',
 'fully equipped gym',
 '000 SF Fitness Center',
 'A wide array of complimentary fitness classes offered weekly',
 'featuring a sophisticated Precor® online profile system that tracks your personal fitness goals',
 'state-of-the-art fitness center',
 'Free Gym',
 '24/7 Fitness Center',
 'Gym Fitness Lounge Swimming Pool Sun Decks Exercise Studios Indoor Ha

### Build new features

In [8]:
def add_feature(row):
    if re.search(r'hardwood',row['features_new'],re.IGNORECASE)!=None:
        row['hardwood']=1
    else:
        row['hardwood']=0
        
    if re.search(r'doorman',row['features_new'],re.IGNORECASE)!=None:
        row['doorman']=1
    else:
        row['doorman']=0
    
    if re.search(r'no\w*fee',row['features_new'],re.IGNORECASE)!=None:
        row['no_fee']=1
    else:
        row['no_fee']=0
    
    if re.search(r'reduce|low\wfee',row['features_new'],re.IGNORECASE)!=None:
        row['reduce_fee']=1
    else:
        row['reduce_fee']=0

    if re.search(r'laundry',row['features_new'],re.IGNORECASE)!=None:
        row['laundry']=1
    else:
        row['laundry']=0

    if re.search(r'war\Z|war\s|war_',row['features_new'],re.IGNORECASE)!=None:
        row['war']=1
    else:
        row['war']=0

    if re.search(r'fitness|gym',row['features_new'],re.IGNORECASE)!=None:
        row['gym']=1
    else:
        row['gym']=0
        
    return row


In [223]:
t0=time()
train=train.apply(add_feature,axis=1)
print(time()-t0)

174.36564755439758


In [225]:
train[['hardwood','doorman','no_fee','reduce_fee','laundry','war','gym']].apply(sum)

hardwood      24629
doorman       21037
no_fee        18068
reduce_fee      702
laundry       23888
war           10555
gym           13291
dtype: int64

In [228]:
train.to_json('train_cleaned.json')

In [9]:
t0=time()
test=test.apply(add_feature,axis=1)
print(time()-t0)

272.64689207077026


In [10]:
test[['hardwood','doorman','no_fee','reduce_fee','laundry','war','gym']].apply(sum)

hardwood      37260
doorman       31649
no_fee        27359
reduce_fee     1120
laundry       35964
war           16144
gym           20246
dtype: int64

In [11]:
test.to_json('test_cleaned.json')