In [9]:
# Import Useful Modules
import collections
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [10]:
# Load the training set as a dataframe.
# The location can be overridden through the TRAIN_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local path is used unchanged.
file_path = os.environ.get(
    'TRAIN_DATA_PATH',
    '/Users/chuanlong/GitHub/Tandon_Torch/data/train.json',
)
df = pd.read_json(file_path)

In [11]:
# This method implements the pipeline that cleans the dataset
def clean_data(df, price_bounds=(500, 8000), longitude_bounds=(-74.2, -73.6),
               latitude_bounds=(40.0, 41.0)):
    """Return a copy of ``df`` without outlier or unusable rows.

    A row is kept only when all of the following hold:

    * ``building_id`` is not the string ``'0'``;
    * ``price`` lies strictly inside ``price_bounds``;
    * ``listing_id`` is not null;
    * ``longitude`` lies strictly inside ``longitude_bounds``;
    * ``latitude`` lies strictly inside ``latitude_bounds``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``building_id``, ``price``, ``listing_id``,
        ``longitude`` and ``latitude``.
    price_bounds, longitude_bounds, latitude_bounds : tuple
        Exclusive ``(low, high)`` limits. The defaults are the values the
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index labels are
        kept). ``df`` itself is not modified.
    """
    price_low, price_high = price_bounds
    lon_low, lon_high = longitude_bounds
    lat_low, lat_high = latitude_bounds

    keep = (
        (df.building_id != '0')
        & (df.price > price_low) & (df.price < price_high)
        & df.listing_id.notnull()
        & (df.longitude > lon_low) & (df.longitude < lon_high)
        & (df.latitude > lat_low) & (df.latitude < lat_high)
    )
    # Copy so that later column assignments act on an independent frame
    # instead of a slice of the caller's data (avoids SettingWithCopyWarning).
    return df[keep].copy()

In [12]:
# This method transforms a creation timestamp column into a number of days
def transformCreatedTime(df, attribute, reference_time='2017-01-01'):
    """Replace ``df[attribute]`` with the days elapsed before ``reference_time``.

    Each value is converted with ``pd.to_datetime`` and subtracted from
    ``reference_time``; the difference is expressed as a float number of days
    (fractions of a day are kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; the column is overwritten in place.
    attribute : str
        Name of the timestamp column.
    reference_time : str or datetime-like, optional
        Moment the age is measured against. Defaults to ``'2017-01-01'``,
        the value this notebook has always used.

    Returns
    -------
    pandas.Series
        ``df[attribute]`` after the conversion.

    Notes
    -----
    A column that is already numeric is assumed to be the result of an
    earlier call and is returned unchanged. Without this guard a re-run of
    the cell would parse the float days as epoch offsets and silently
    produce meaningless values.
    """
    column = df[attribute]
    if not pd.api.types.is_numeric_dtype(column):
        # One vectorised parse instead of a Python-level to_datetime per row.
        elapsed = pd.to_datetime(reference_time) - pd.to_datetime(column)
        df[attribute] = elapsed.dt.total_seconds() / (60 * 60 * 24)
    return df[attribute]

In [16]:
def create_features(df):
    """Normalise the ``features`` column and derive binary amenity flags.

    ``df['features']`` (a list of free-text feature strings per row) is
    replaced by a single lower-cased string in which every feature is
    preceded by one space. Eight 0/1 columns are then added, each set to 1
    when one of its keywords occurs as a substring of that string:
    ``doorman``, ``elevator``, ``laundry``, ``hardwood``, ``fitness``,
    ``pets`` (``'cats'`` or ``'dogs'``), ``dishwasher`` and ``no_fee``
    (``'no fee'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``features`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the rewritten and added columns.

    Notes
    -----
    * The lists stored in the incoming column are not mutated.
    * Non-string entries inside a list are skipped; a missing cell
      (``None``/NaN) becomes ``''``.
    * A cell that is already a string is treated as one feature, which makes
      a second call on an already processed frame a no-op.
    """
    def word_stem(s):
        """Normalise one raw feature string; return None for non-strings."""
        if not isinstance(s, str):
            return None
        s = s.lower().strip()
        # Separators become spaces, exclamation marks are dropped.
        s = re.sub(r'[/:&-]', ' ', s)
        return s.replace('!', '')

    def list_stem(features):
        """Join the normalised features of one row into a single string."""
        if features is None or isinstance(features, float):
            # Missing cell (None or NaN).
            return ''
        if isinstance(features, str):
            # Already joined by a previous run: re-normalise as one item.
            features = [features] if features.strip() else []
        stems = (word_stem(item) for item in features)
        return ''.join(' ' + stem for stem in stems if stem is not None)

    # (flag column, keywords of which any one sets the flag)
    keyword_flags = [
        ('doorman', ('doorman',)),
        ('elevator', ('elevator',)),
        ('laundry', ('laundry',)),
        ('hardwood', ('hardwood',)),
        ('fitness', ('fitness',)),
        ('pets', ('cats', 'dogs')),
        ('dishwasher', ('dishwasher',)),
        ('no_fee', ('no fee',)),
    ]

    df['features'] = df['features'].map(list_stem)
    for column, keywords in keyword_flags:
        df[column] = df['features'].map(
            lambda text, kws=keywords: int(any(kw in text for kw in kws))
        )
    return df

In [14]:
# Single entry point for preprocessing: calling it runs every step in order
def preprocess_data(df):
    """Clean ``df``, convert ``created`` to days and add the feature flags.

    Runs ``clean_data``, then ``transformCreatedTime`` on the ``created``
    column, then ``create_features``, and returns the resulting frame.
    """
    cleaned = clean_data(df)
    cleaned['created'] = transformCreatedTime(cleaned, 'created')
    enriched = create_features(cleaned)
    return enriched

In [15]:
df = preprocess_data(df)
# Preview the first rows only rather than rendering the whole frame
df.head(10)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,price,street_address,doorman,elevator,laundry,hardwood,fitness,pets,dishwasher,no_fee
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,190.670556,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,,medium,40.7145,7211212,...,3000,792 Metropolitan Avenue,0,0,0,0,0,0,0,0
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,202.486493,,Columbus Avenue,doorman elevator fitness center cats allowed ...,low,40.7947,7150865,...,5465,808 Columbus Avenue,1,1,0,0,1,1,0,0
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,258.856470,"Top Top West Village location, beautiful Pre-w...",W 13 Street,laundry in building dishwasher hardwood floor...,high,40.7388,6887163,...,2850,241 W 13 Street,0,0,1,1,0,0,1,0
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,257.901366,Building Amenities - Garage - Garden - fitness...,East 49th Street,hardwood floors no fee,low,40.7539,6888711,...,3275,333 East 49th Street,0,0,0,1,0,0,0,1
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,256.816123,,West 18th Street,,medium,40.7429,6894514,...,7995,350 West 18th Street,0,0,0,0,0,0,0,0
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,248.861157,Stunning unit with a great location and lots o...,West 107th Street,prewar elevator dogs allowed cats allowed low...,low,40.8012,6930771,...,3600,210 West 107th Street,0,1,0,1,0,1,0,1
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,262.748819,"This huge sunny ,plenty of lights 1 bed/2 bath...",West 21st Street,doorman elevator pre war terrace laundry in u...,low,40.7427,6867392,...,5645,155 West 21st Street,1,1,1,1,0,0,1,0
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,255.891262,<p><a website_redacted,Hamilton Terrace,cats allowed dogs allowed elevator laundry in...,medium,40.8234,6898799,...,1725,63 Hamilton Terrace,0,1,1,0,0,1,0,0
100044,1.0,2,67c9b420da4a365bc26a6cd0ef4a5320,256.765683,***LOW FEE. Beautiful CHERRY OAK WOODEN FLOORS...,E 38th St,doorman elevator laundry in building no fee,high,40.7488,6895442,...,3000,137 E 38th St,1,1,1,0,0,0,0,1
100051,1.0,0,bfb9405149bfff42a92980b594c28234,257.891667,Stunning full renovated studio unit. High cei...,East 34th Street,doorman elevator fitness center laundry in bu...,medium,40.7439,6889043,...,2350,340 East 34th Street,1,1,1,0,1,0,0,1


In [13]:
# Select the model inputs and the target from the preprocessed train dataset
columns = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'created', 'doorman', 'elevator', 'laundry', 'hardwood', 'fitness', 'pets', 'dishwasher', 'no_fee', 'interest_level']
# `df` was already preprocessed in the cell above. Running preprocess_data on
# it a second time would re-convert the numeric `created` column and fail on
# the already-joined `features` strings, so the frame is used as it is.
train_data = df[columns]

### Feature Engineering Experiment

In [None]:
# Fit an extra-trees ensemble whose feature_importances_ rank the inputs
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
# The axis is passed by keyword: the positional form drop(label, 1) is
# rejected by pandas 2.0 and later.
X, y = train_data.drop('interest_level', axis=1), train_data.interest_level
forest.fit(X, y)
importances = forest.feature_importances_

In [None]:
# Plot the feature importances
# Labels come from the fitted feature matrix so they always match the bars
# one-to-one; `columns` also contains the target 'interest_level' and would
# supply one label too many.
feature_names = list(X.columns)
positions = np.arange(len(feature_names))
plt.figure(figsize=(16,8))
plt.bar(positions, forest.feature_importances_)
plt.xticks(positions, feature_names)
plt.ylabel('Importances')
plt.title('Feature Importances')
plt.show()

### Model Training

* TODO: tune the hyperparameters with a grid search
* TODO: evaluate other models
* Open question: how can the training progress be measured?

In [None]:
def log_loss_score(true_label, predict_label):
    """Return the log loss of ``predict_label`` measured against ``true_label``.

    Thin wrapper: both arguments are forwarded unchanged, in this order, to
    ``metrics.log_loss`` from scikit-learn.
    """
    loss = metrics.log_loss(true_label, predict_label)
    return loss

In [None]:
def train_model(train_data, test_data):
    # Should use the grid search
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 1.0, 'loss': 'deviance'}
    clf = ensemble.GradientBoostingClassifier(**params)
    X_train, y_train = train_data.drop('interest_level', 1), train_data.interest_level
    X_test, y_test = test_data.drop('interest_level', 1), test_data.interest_level
    clf = clf.fit(X_train, y_train)
    res = clf.predict_proba(X_test)
    score = metrics.log_loss(y_test, res)