In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
# plt.rcdefaults()
mpl.style.use('additional')

from itertools import product
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from scipy import sparse
from xgboost import XGBClassifier
from pylightgbm.models import GBMClassifier
os.environ['LIGHTGBM_EXEC'] = '/Users/Terence/Develop/bin/lightgbm'



In [3]:
def add_features(df):
    """Add simple engineered columns to a listings frame.

    The new columns are written onto ``df`` itself (photo count, normalised
    addresses, description word count and several price/room ratios).  The
    returned frame is a copy in which every NaN and every ``+inf`` (produced
    by dividing by zero bedrooms/bathrooms) has been replaced with ``-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``photos``, ``street_address``, ``display_address``,
        ``description``, ``price``, ``bedrooms`` and ``bathrooms``.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df`` including the new columns.
    """
    def _normalise(text):
        # Drop non-breaking spaces, trim surrounding whitespace, lower-case.
        return text.replace("\u00a0", "").strip().lower()

    price = df["price"]
    beds = df["bedrooms"]
    baths = df["bathrooms"]
    rooms = beds + baths

    df["photo_count"] = df["photos"].apply(len)
    for address_col in ("street_address", "display_address"):
        df[address_col] = df[address_col].apply(_normalise)
    df["desc_wordcount"] = df["description"].apply(lambda text: len(str.split(text)))

    # Ratios may yield inf (x / 0) or NaN (0 / 0); both are mapped to -1 below.
    engineered = (
        ("pricePerBed", price / beds),
        ("pricePerBath", price / baths),
        ("pricePerRoom", price / rooms),
        ("bedPerBath", beds / baths),
        ("bedBathDiff", beds - baths),
        ("bedBathSum", rooms),
        ("bedsPerc", beds / rooms),
    )
    for name, values in engineered:
        df[name] = values

    without_nan = df.fillna(-1)
    return without_nan.replace(np.inf, -1)


def factorize(df1, df2, column):
    """Replace ``column`` in both frames with integer codes from a shared vocabulary.

    The values of ``df1[column]`` and ``df2[column]`` are stacked (``df1``
    first) and factorized together, so the same value receives the same code
    in both frames.  Both frames are modified in place.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``column``.
    column : str
        Name of the column to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1, df2)``, the same objects that were passed in.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    combined = pd.concat([df1[column], df2[column]])
    codes = combined.factorize()[0]
    n_first = len(df1)
    df1[column] = codes[:n_first]
    df2[column] = codes[n_first:]
    return df1, df2


def designate_single_observations(df1, df2, column):
    """Overwrite values of ``column`` that occur at most once overall with ``-1``.

    Occurrences are counted over the values of ``df1[column]`` and
    ``df2[column]`` together.  Both frames are modified in place.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``column``.
    column : str
        Name of the column to inspect.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1, df2)``, the same objects that were passed in.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    combined = pd.concat([df1[column], df2[column]])
    # Naming the counts explicitly avoids depending on the default name of the
    # groupby-size result, which is not the same across pandas versions.
    counts = combined.groupby(combined).size().rename("size")

    for frame in (df1, df2):
        # Values without a count (e.g. missing values) map to NaN; NaN <= 1 is
        # False, so those rows are left untouched.
        occurrences = frame[column].map(counts)
        frame.loc[occurrences <= 1, column] = -1
    return df1, df2


def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """Target-encode a high-cardinality categorical column.

    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca.

    For every level of ``variable`` seen in ``train_df`` the encoded value is
    ``lambda * mean(target) + (1 - lambda) * prior_prob`` with
    ``lambda = 1 / (g + exp((k - n) / f))`` and ``n`` the number of training
    rows of that level.  Levels of ``test_df`` that do not occur in
    ``train_df`` receive ``prior_prob``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding ``variable`` and ``target``; used to compute statistics.
    test_df : pandas.DataFrame
        Frame holding ``variable``; its rows are the ones that get encoded.
    variable : str
        Name of the categorical column.
    target : str
        Name of the numeric target column in ``train_df``.
    prior_prob : float
        Fallback / shrinkage value.
    k, f, g : float
        Parameters of the ``lambda`` formula above.
    r_k : float, optional
        If truthy, each encoded value is multiplied by a draw from
        ``Uniform(1 - r_k, 1 + r_k)`` (global NumPy RNG).  Not part of the paper.
    update_df : pandas.DataFrame, optional
        Frame that receives the column ``hcc_<variable>_<target>``; rows are
        matched on the index of ``test_df``.  Defaults to ``test_df``.

    Returns
    -------
    None
        The result is written into ``update_df`` in place.
    """
    hcc_name = "_".join(["hcc", variable, target])

    # The list form yields columns "size" and "mean"; the dict-renaming form
    # raises SpecificationError on pandas >= 1.0.
    grouped = train_df.groupby(variable)[target].agg(["size", "mean"])
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob

    # Left join keeps the index of test_df; unseen levels fall back to the prior.
    encoded = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob)
    if r_k:
        # Add uniform multiplicative noise. Not mentioned in the original paper.
        encoded = encoded * np.random.uniform(1 - r_k, 1 + r_k, len(test_df))

    if update_df is None:
        update_df = test_df
    if hcc_name not in update_df.columns:
        update_df[hcc_name] = np.nan
    # DataFrame.update aligns on the index and writes the (non-NaN) values in place.
    update_df.update(encoded)
    return


def create_binary_features(df):
    """Add 0/1 indicator columns for groups of keywords found in ``features``.

    Every row's ``features`` list is joined with spaces and lower-cased.  For
    each keyword group a column ``feature_<group>`` is added to ``df`` holding
    1 when at least one keyword of the group is a substring of that text and
    0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``features`` column containing lists of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    keyword_groups = {
        "dogs": ("dogs", "dog"),
        "cats": ("cats",),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",),
        "parquet": ("parquet", "hardwood"),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included")
    }

    # One lower-cased string per listing, e.g. ["Dogs", "Gym"] -> "dogs gym".
    joined_text = df["features"].apply(lambda items: " ".join(items).lower())

    for group_name, keywords in keyword_groups.items():
        # Bind `keywords` as a default argument so each lambda keeps its own group.
        df["feature_" + group_name] = joined_text.apply(
            lambda text, keywords=keywords: int(any(word in text for word in keywords))
        )

    return df
    
    
# Load the raw listings; both frames are ordered by listing_id.
X_train = pd.read_json("train.json").sort_values(by="listing_id")
X_test = pd.read_json("test.json").sort_values(by="listing_id")

# Encode the target as integers (low=0, medium=1, high=2), append its one-hot
# columns pred_0 / pred_1 / pred_2, and take their column means as class priors.
X_train = X_train.replace({"interest_level": {"low": 0, "medium": 1, "high": 2}})
X_train = X_train.join(pd.get_dummies(X_train["interest_level"], prefix="pred").astype(int))
prior_0, prior_1, prior_2 = X_train[["pred_0", "pred_1", "pred_2"]].mean()

# Add the engineered columns shared by train and test
X_train = add_features(X_train)
X_test = add_features(X_test)

# Values of building_id, manager_id and display_address that occur only once
# across train + test are replaced by -1. This has to run before the encoding
# steps below, which operate on these columns.
for col in ('building_id', 'manager_id', 'display_address'):
    X_train, X_test = designate_single_observations(X_train, X_test, col)

# High-cardinality categorical (HCC) encoding of building_id and manager_id
# against the pred_1 and pred_2 indicators.
# - X_test is encoded from statistics of the full training frame, without noise.
# - X_train is encoded out-of-fold: each of the 5 stratified folds is encoded
#   from the other folds and multiplied by uniform noise in [0.99, 1.01].
# NOTE(review): the noise comes from the global NumPy RNG and no seed is set in
# the visible code, so the training encodings differ between runs.
skf = StratifiedKFold(5)
attributes = product(("building_id", "manager_id"), zip(("pred_1", "pred_2"), (prior_1, prior_2)))
for variable, (target, prior) in attributes:
    hcc_encode(X_train, X_test, variable, target, prior, k=5, r_k=None)
    for train, test in skf.split(np.zeros(len(X_train)), X_train['interest_level']):
        hcc_encode(X_train.iloc[train], X_train.iloc[test], variable, target, prior, k=5, r_k=0.01, update_df=X_train)

# Replace building_id, display_address, manager_id and street_address by integer
# codes that are shared between train and test
for col in ('building_id', 'display_address', 'manager_id', 'street_address'):
    X_train, X_test = factorize(X_train, X_test, col)

# Add the 0/1 keyword indicator columns derived from `features`
X_train = create_binary_features(X_train)
X_test = create_binary_features(X_test)

# Sort columns alphabetically and rows by index, then drop the helper columns.
# (Nothing is written to disk here.) errors="ignore" is needed because X_test
# never received the pred_* columns.
X_train = X_train.sort_index(axis=1).sort_index()
X_test = X_test.sort_index(axis=1).sort_index()
columns_to_drop = ["photos", "pred_0","pred_1", "pred_2", "created"]
X_train.drop(columns_to_drop, axis=1, errors="ignore", inplace=True)
X_test.drop(columns_to_drop, axis=1, errors="ignore", inplace=True)

In [4]:
from itertools import chain

def features_cleanup_star(x):
    """Split feature strings that use ``*`` as a separator into individual entries.

    In every string of ``x`` each run of asterisks, together with the
    whitespace around it, is collapsed to a single ``*``; leading and trailing
    asterisks are stripped and the string is split on the remaining ones.
    Strings without an asterisk are returned unchanged.

    Parameters
    ----------
    x : iterable of str
        Raw feature strings of one listing.

    Returns
    -------
    list of str
        All resulting pieces, in their original order.
    """
    pieces = []
    for raw in x:
        collapsed = re.sub(r'\s*\*+\s*\**\s*', r'*', raw)
        pieces.extend(collapsed.strip('*').split('*'))
    return pieces

def process_features(df):
    """Add a ``features_clean`` text column built from the ``features`` lists.

    Each listing's feature strings are first split on ``*`` separators (see
    ``features_cleanup_star``).  Every resulting entry then has all non-word
    characters removed, so one entry becomes one token, and the entries are
    joined with single spaces and lower-cased.

    The star splitting is applied to the raw strings.  Checking the cleaned
    text for ``*`` can never match, because removing ``\\W`` characters has
    already removed every asterisk; that check left the splitting step
    unreachable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``features`` column containing lists of strings.
        Modified in place.

    Returns
    -------
    None
    """
    def _as_token_text(entries):
        # One token per entry: strip non-word characters, join, lower-case.
        return ' '.join([re.sub(r'\W', '', entry) for entry in entries]).lower()

    # features_cleanup_star leaves star-free strings unchanged, so it can be
    # applied to every row without a separate mask.
    df['features_clean'] = df['features'].apply(features_cleanup_star).apply(_as_token_text)
                
# Build the `features_clean` text column on both frames (in place).
for frame in (X_train, X_test):
    process_features(frame)

# Bag-of-words counts over the cleaned feature text. The vocabulary (at most
# 200 terms, English stop words removed) is fitted on train and test together.
countvec_features = CountVectorizer(max_features=200, stop_words='english')
all_feature_text = X_train['features_clean'].tolist() + X_test['features_clean'].tolist()
countvec_features.fit(all_feature_text)
X_train_features = countvec_features.transform(X_train['features_clean'])
X_test_features = countvec_features.transform(X_test['features_clean'])

# Same treatment for the free-text descriptions.
countvec_desc = CountVectorizer(max_features=200, stop_words='english')
all_description_text = X_train['description'].tolist() + X_test['description'].tolist()
countvec_desc.fit(all_description_text)
X_train_desc = countvec_desc.transform(X_train['description'])
X_test_desc = countvec_desc.transform(X_test['description'])

# The raw text columns are no longer needed once the count matrices exist.
columns_to_drop = ["description", "features", "features_clean"]
for frame in (X_train, X_test):
    frame.drop(columns_to_drop, axis=1, errors="ignore", inplace=True)

In [5]:
# Show the columns that remain in the training frame after preprocessing.
X_train.columns

Index(['bathrooms', 'bedBathDiff', 'bedBathSum', 'bedPerBath', 'bedrooms',
       'bedsPerc', 'building_id', 'desc_wordcount', 'display_address',
       'feature_cats', 'feature_concierge', 'feature_dogs',
       'feature_furnished', 'feature_health', 'feature_laundry',
       'feature_lowfee', 'feature_nofee', 'feature_parking', 'feature_parquet',
       'feature_prewar', 'feature_transport', 'feature_utilities',
       'hcc_building_id_pred_1', 'hcc_building_id_pred_2',
       'hcc_manager_id_pred_1', 'hcc_manager_id_pred_2', 'interest_level',
       'latitude', 'listing_id', 'longitude', 'manager_id', 'photo_count',
       'price', 'pricePerBath', 'pricePerBed', 'pricePerRoom',
       'street_address'],
      dtype='object')

Use `XGBClassifier`.

In [57]:
# XGBoost on the tabular columns only (everything except the target).
gbc = XGBClassifier(
    objective='multi:softprob',
    max_depth=10,
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.7,
)
gbc.fit(X_train.drop('interest_level', axis=1), X_train['interest_level'])

# Negative log-loss over 3 shuffled, stratified folds.
scores = cross_val_score(
    gbc,
    X_train.drop('interest_level', axis=1),
    X_train['interest_level'],
    cv=StratifiedKFold(3, shuffle=True),
    scoring='neg_log_loss',
    n_jobs=-1,
)
scores

array([-0.5550611 , -0.55131487, -0.54869598])

Use `LightGBM`.

In [8]:
# LightGBM (via pylightgbm) on the tabular columns only.
lbgm = GBMClassifier(
    application='multiclass',
    num_class=3,
    metric='multi_logloss',
    num_iterations=100,
    learning_rate=0.1,
    early_stopping_round=10,
    verbose=False,
)
lbgm.fit(X_train.drop('interest_level', axis=1), X_train['interest_level'])

# Negative log-loss over 3 shuffled, stratified folds.
scores = cross_val_score(
    lbgm,
    X_train.drop('interest_level', axis=1),
    X_train['interest_level'],
    cv=StratifiedKFold(3, shuffle=True),
    scoring='neg_log_loss',
    n_jobs=-1,
)
scores



array([-0.54471735, -0.55575272, -0.5539737 ])

Add the out-of-fold class probabilities predicted by a text-based `SGDClassifier` as extra features.

In [11]:
# Stack the two bag-of-words matrices side by side into one sparse CSR matrix.
X_train_text_combined = sparse.hstack([X_train_features, X_train_desc]).tocsr()

# Linear text model; this fitted instance is what later scores the test set.
sgd = SGDClassifier(loss='log', alpha=1e-3, n_jobs=-1)
sgd.fit(X_train_text_combined, X_train['interest_level'])

# Cross-validated class probabilities for the training rows.
y_pred_cv_sgd = cross_val_predict(
    sgd,
    X_train_text_combined,
    X_train['interest_level'],
    method='predict_proba',
    n_jobs=-1,
)

In [12]:
# XGBoost on the tabular columns plus the SGD text probabilities.
gbc_semi = XGBClassifier(
    objective='multi:softprob',
    max_depth=10,
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.7,
)
gbc_semi.fit(
    np.hstack([X_train.drop('interest_level', axis=1), y_pred_cv_sgd]),
    X_train['interest_level'],
)

# Negative log-loss over 3 shuffled, stratified folds.
scores = cross_val_score(
    gbc_semi,
    np.hstack([X_train.drop('interest_level', axis=1), y_pred_cv_sgd]),
    X_train['interest_level'],
    cv=StratifiedKFold(3, shuffle=True),
    scoring='neg_log_loss',
    n_jobs=-1,
)
scores

array([-0.5561396 , -0.55336424, -0.56136587])

Use `LightGBM` with the SGD text probabilities added.

In [13]:
# LightGBM on the tabular columns plus the SGD text probabilities.
lbgm_semi = GBMClassifier(
    application='multiclass',
    num_class=3,
    metric='multi_logloss',
    num_iterations=100,
    learning_rate=0.1,
    early_stopping_round=10,
    verbose=False,
)
lbgm_semi.fit(
    np.hstack([X_train.drop('interest_level', axis=1), y_pred_cv_sgd]),
    X_train['interest_level'],
)

# Negative log-loss over 3 shuffled, stratified folds.
scores = cross_val_score(
    lbgm_semi,
    np.hstack([X_train.drop('interest_level', axis=1), y_pred_cv_sgd]),
    X_train['interest_level'],
    cv=StratifiedKFold(3, shuffle=True),
    scoring='neg_log_loss',
    n_jobs=-1,
)
scores



array([-0.5587934 , -0.55233137, -0.54761985])

In [None]:
# Text probabilities for the test listings, from the SGD model fitted on all training rows.
X_test_text_combined = sparse.hstack([X_test_features, X_test_desc]).tocsr()
y_test_pred_sgd = sgd.predict_proba(X_test_text_combined)

# The input here is the tabular columns plus the three SGD probability columns,
# so it must be scored by the model trained on that same layout (`lbgm_semi`).
# `lbgm` was fitted on the tabular columns only and does not match this matrix.
y_test = lbgm_semi.predict_proba(np.hstack([X_test, y_test_pred_sgd]))
# pd.DataFrame(y_test, index=X_test.listing_id, columns=['low', 'medium', 'high'])[['high', 'medium', 'low']].to_csv('submission_xgb_more_features-lbgm.csv')