In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn import preprocessing
import gc
from scipy.stats import skew, boxcox
from scipy import sparse
from datetime import datetime


import re, nltk      
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Presumably the random seed for the analysis; no cell visible in this notebook reads it -- TODO confirm it is used further on.
seed = 2017



In [2]:
#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list holding ``stemmer.stem(item)`` for every item of ``tokens``, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Turn ``text`` into a list of stemmed word tokens.

    Every character outside ``a-zA-Z`` is replaced by a space, the result is
    split with ``nltk.word_tokenize`` and the tokens are stemmed with the
    module-level ``stemmer`` through ``stem_tokens``.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)
######## 


# Analyzer shared by every call of description_sentiment; created on first use.
_SHARED_SENTIMENT_ANALYZER = None


def description_sentiment(sentences, analyzer=None):
    """Average the polarity scores of a collection of sentences.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to score one by one.
    analyzer : object with a ``polarity_scores(sentence)`` method, optional
        Scorer to use. When omitted, a single module-level
        ``SentimentIntensityAnalyzer`` is created on first use and reused
        afterwards, instead of building a new analyzer on every call (this
        function is applied once per listing).

    Returns
    -------
    pandas.Series
        Column-wise mean of the per-sentence score dicts; an empty Series
        when ``sentences`` is empty.
    """
    global _SHARED_SENTIMENT_ANALYZER
    if analyzer is None:
        if _SHARED_SENTIMENT_ANALYZER is None:
            _SHARED_SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _SHARED_SENTIMENT_ANALYZER
    scores = [analyzer.polarity_scores(sentence) for sentence in sentences]
    return pd.DataFrame(scores).mean()


# Load Data

In [3]:
# Locations of the raw listing files.
data_path = "../input/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"

# reset_index() moves the index read from the JSON into an "index" column and
# gives each frame a fresh default integer index.
train_df = pd.read_json(train_file).reset_index()
test_df = pd.read_json(test_file).reset_index()

# Number of training rows; used later to split pooled train+test data again.
ntrain = train_df.shape[0]

# Single-argument print calls give the same output on Python 2 and Python 3.
print(train_df.shape)
print(test_df.shape)
print(ntrain)

(49352, 16)
(74659, 15)
49352


# Numerical Features

In [4]:
# sc_price: price limited to the 1st-99th percentile range of the pooled
# train + test prices; price_outlier marks rows whose raw price lies outside it.
pooled_price = pd.concat([train_df['price'], test_df['price']])
ulimit = np.percentile(pooled_price.values, 99)
llimit = np.percentile(pooled_price.values, 1)

for df in (train_df, test_df):
    # clip() yields a plain 1-D column, replacing the earlier (n, 1) reshape
    # followed by four masked assignments per frame.
    df['sc_price'] = df['price'].clip(lower=llimit, upper=ulimit)
    # The flag is derived from the raw price, so it does not depend on the
    # clipped column.
    df['price_outlier'] = ((df['price'] > ulimit) | (df['price'] < llimit)).astype(int)

# Start of the running list of model feature columns.
features_to_use = ["sc_price"]


In [5]:
# Price per bathroom / per bedroom, plus a flag telling whether the listing
# has a non-zero count of that room type.
#   (room count column, price-per-room column, non-zero flag column)
price_per_room_specs = (
    ('bathrooms', 'sc_ba_price', 'bathrooms0'),
    ('bedrooms', 'sc_be_price', 'bedrooms0'),
)

for df in (train_df, test_df):
    for room_col, ratio_col, flag_col in price_per_room_specs:
        has_rooms = df[room_col] != 0
        # Rows with a zero room count keep a ratio of 0 rather than dividing by zero.
        df[ratio_col] = 0.0
        df.loc[has_rooms, ratio_col] = (df.loc[has_rooms, 'sc_price']
                                        / df.loc[has_rooms, room_col])
        # 1.0 when the count is non-zero, 0.0 when it is zero.
        df[flag_col] = has_rooms.astype(float)

features_to_use.extend(["sc_ba_price", "sc_be_price"])

In [6]:
# Cap the room counts (bathrooms at 5, bedrooms at 8), flag the rows that were
# above the cap, then derive the bathrooms-per-bedroom ratio from the capped
# columns.
room_caps = (('bathrooms', 5), ('bedrooms', 8))

for df in (train_df, test_df):
    for room_col, cap in room_caps:
        df['sc_' + room_col] = df[room_col].clip(upper=cap)
        df[room_col + '_outlier'] = (df[room_col] > cap).astype(int)

    # bathrooms / bedrooms; listings with zero bedrooms get a ratio of 0.
    has_bedrooms = df['bedrooms'] != 0
    df['sc_babe'] = 0.0
    df.loc[has_bedrooms, 'sc_babe'] = (df.loc[has_bedrooms, 'sc_bathrooms']
                                       / df.loc[has_bedrooms, 'sc_bedrooms'])

features_to_use.extend(["sc_bathrooms", 'sc_bedrooms', 'sc_babe'])

In [7]:
# Longitude and latitude: clip to the 0.1-99.9 percentile range of the pooled
# train + test values and flag the rows that fell outside that range.
for geo_col in ('longitude', 'latitude'):
    pooled_geo = pd.concat([train_df[geo_col], test_df[geo_col]])
    geo_low = np.percentile(pooled_geo.values, 0.1)
    geo_high = np.percentile(pooled_geo.values, 99.9)

    for df in (train_df, test_df):
        df['sc_' + geo_col] = df[geo_col].clip(lower=geo_low, upper=geo_high)
        outside = (df[geo_col] > geo_high) | (df[geo_col] < geo_low)
        df[geo_col + '_outlier'] = outside.astype(int)

features_to_use.extend(['sc_longitude', "sc_latitude"])

In [8]:
# Simple counts and calendar parts, computed the same way for both frames.
date_parts = ("month", "day", "weekday", "hour")

for df in (train_df, test_df):
    # Number of entries in the "photos" and "features" lists.
    df["num_photos"] = df["photos"].apply(len)
    df["num_features"] = df["features"].apply(len)

    # Parse the creation timestamp so its calendar parts can be extracted.
    df["created"] = pd.to_datetime(df["created"])
    for part in date_parts:
        df["created_" + part] = getattr(df["created"].dt, part)

# Register the new columns as model features.
features_to_use.extend(["num_photos", "num_features", "created_month", "created_day", "created_hour", 'created_weekday'])


In [9]:
# Build clean_description by deleting markup fragments from the raw description.
# The fragments are removed one after another, in this order.
markup_fragments = ('<p>', '<a', 'website_redacted', '<br />', '<br/>', '&amp;')


def _strip_markup(text):
    """Return ``text`` with every entry of ``markup_fragments`` removed."""
    for fragment in markup_fragments:
        text = text.replace(fragment, '')
    return text


train_df['clean_description'] = train_df['description'].apply(_strip_markup)
test_df['clean_description'] = test_df['description'].apply(_strip_markup)

# Hand-written replacement for the cleaned description of the test row labelled
# 27992. The text is assembled from adjacent string literals so that no source
# indentation ends up inside it: backslash continuations inside a single
# literal embedded a long run of spaces at every line break, and
# num_description_words counts the pieces of x.split(' '), so those runs were
# counted as extra words.
test_df.loc[27992, 'clean_description'] = (
    'This beatiful one bedroom is in a well maintained. '
    'pet friendly pre-war walk-up building is located '
    'on the border of Chelsea, the West Village, and '
    'the trendy Meat Packing District and convenient to '
    'everything in the city. Apartment Features: Private Balcony, '
    'Granite Counters, Stainless Steal Kitchen Appliances with Dishwasher. '
    'Spacious Bedroom (can fit a queen size bed). Nice Size Living '
    'Room Building. Laundry in Building. Super lives in the '
    'Area Pet Friendly Walk-up Pre-war Building Location:On the '
    'Same Block as the A, C, E, and L Trains Steps from the Meat '
    'Packing District and Highline Park. Trendy Bars, Restaurants, '
    'and Night Life Activities Boutiques Convenient to the West Side '
    'Highway and Holland Tunnel Historic Landmarks Cultural Activities.'
)

def _count_words(text):
    """Return 0 for blank text, otherwise the number of pieces of ``text.split(' ')``."""
    if len(text.strip()) == 0:
        return 0
    return len(text.split(' '))


def _with_description_stats(df):
    """Add sentence/word counts to ``df`` and return it joined with the mean sentiment columns."""
    df['description_tokens'] = df['clean_description'].apply(sent_tokenize)
    df['num_description_sent'] = df['description_tokens'].apply(len)
    df['num_description_words'] = df['clean_description'].apply(_count_words)
    # One row of averaged polarity scores per listing, appended as new columns.
    sentiment = df['description_tokens'].apply(description_sentiment)
    return pd.concat([df, sentiment], axis=1)


# A words-per-sentence ratio (num_description_words / num_description_sent)
# was tried here and is currently disabled.

train_df = _with_description_stats(train_df)
test_df = _with_description_stats(test_df)

# Replace every missing value in both frames with 0.
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

features_to_use.extend(["num_description_words", 'num_description_sent', 'compound', 'neg', 'neu', 'pos'])

In [10]:
# Integer-encode the string-valued identifier columns. Each encoder is fitted
# on the train and test values together so both frames share one code space.
categorical = ["display_address", "manager_id", "building_id", "street_address", "listing_id"]
for column in categorical:
    # Columns that are not of object dtype are left untouched.
    if train_df[column].dtype != 'object':
        continue
    encoded_column = column + '_lbl'
    encoder = preprocessing.LabelEncoder()
    encoder.fit(list(train_df[column].values) + list(test_df[column].values))
    train_df[encoded_column] = encoder.transform(list(train_df[column].values))
    test_df[encoded_column] = encoder.transform(list(test_df[column].values))
    features_to_use.append(encoded_column)

In [11]:
# Pool train and test rows so the transforms below are fitted on all rows.
full_data = pd.concat([train_df[features_to_use], test_df[features_to_use]])

# Columns whose skewness exceeds 0.25 get a Box-Cox transform.
skewness = full_data.apply(lambda x: skew(x.dropna()))
skewed_cols = skewness[skewness > 0.25].index.values
SSL = preprocessing.StandardScaler()

for skewed_col in skewed_cols:
    # Shift the column so its minimum becomes 1 before applying boxcox.
    shifted = full_data[skewed_col] - full_data[skewed_col].min() + 1
    full_data[skewed_col], lam = boxcox(shifted)
    # Same "<name> <tab><lambda>" text as the Python 2 print statement, but
    # valid on Python 3 as well.
    print('%s \t%s' % (skewed_col, lam))

for col in features_to_use:
    # fit_transform returns an (n, 1) array; flatten it to a 1-D column.
    scaled = SSL.fit_transform(full_data[col].values.reshape(-1, 1)).ravel()
    # Split by position (first ntrain rows are train) instead of relying on
    # index-label alignment: the pooled frame carries duplicate index labels.
    train_df[col] = scaled[:ntrain]
    test_df[col] = scaled[ntrain:]

del full_data

sc_price 	0.34122013083
sc_ba_price 	0.668968094532
sc_be_price 	0.462918736223
sc_bathrooms 	-0.974792935849
sc_bedrooms 	0.474243433469
sc_longitude 	-12.8479106804
num_photos 	0.475560723219
num_features 	0.359523382085
created_hour 	-0.389913999668
num_description_words 	0.561581571914
num_description_sent 	0.177791228437
compound 	-0.239175562191
neg 	-36.55973962
pos 	-2.19234913267




# Binary Features

In [12]:
# The 0/1 indicator columns created in the numerical-feature cells are added
# only now, after the skew/scale step has already run on the other columns.
binary_flag_cols = [
    'price_outlier', "bedrooms0", 'bathrooms0', "bathrooms_outlier",
    'bedrooms_outlier', 'latitude_outlier', "longitude_outlier",
]
features_to_use.extend(binary_flag_cols)

In [13]:
# address1: lower-cased copy of display_address.
for df in (train_df, test_df):
    df['address1'] = df['display_address'].apply(lambda x: x.lower())

# Abbreviation -> full word, applied to whole space-separated tokens.
address_map = {
    'st': 'street',
    'st.': 'street',
    'ave': 'avenue',
    'n': 'north',
    's': 'south',
    'e': 'east',
    'w': 'west',
}


def address_map_func(s):
    """Expand address abbreviations in ``s``.

    ``s`` is split on single spaces; every piece that is a key of the
    module-level ``address_map`` is replaced by its mapped word, all other
    pieces are kept, and the pieces are joined back with single spaces.
    """
    return ' '.join(address_map.get(word, word) for word in s.split(' '))


# Expand the abbreviations in the lower-cased addresses.
train_df['address1'] = train_df['address1'].apply(address_map_func)
test_df['address1'] = test_df['address1'].apply(address_map_func)

# One 0/1 column per keyword: 1 when the keyword occurs anywhere in address1
# (substring match).
new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']

for df in (train_df, test_df):
    for col in new_cols:
        df[col] = df['address1'].apply(lambda x, keyword=col: int(keyword in x))
    # 1 when none of the keyword columns is set for the row.
    df['other_address'] = (df[new_cols].sum(axis=1) == 0).astype(int)

features_to_use.extend(new_cols + ['other_address'])


In [14]:
# Listing count per encoded manager id over train + test.
all_manager_ids = pd.concat([train_df['manager_id_lbl'], test_df['manager_id_lbl']])
managers_count = all_manager_ids.value_counts()

# top_<p>_manager is 1 when the row's manager has a listing count at or above
# the (100 - p)-th percentile of the per-manager counts.
manager_top_percents = (10, 25, 5, 50, 1, 2, 15, 20, 30)

for top_percent in manager_top_percents:
    # Threshold and qualifying ids are computed once per percentile, not once
    # per row inside a lambda.
    count_threshold = np.percentile(managers_count.values, 100 - top_percent)
    top_manager_ids = managers_count.index.values[managers_count.values >= count_threshold]
    flag_col = 'top_%d_manager' % top_percent
    for df in (train_df, test_df):
        df[flag_col] = df['manager_id_lbl'].isin(top_manager_ids).astype(int)

features_to_use.extend(['top_%d_manager' % top_percent for top_percent in manager_top_percents])

In [15]:
# Zero_building_id: 1 when building_id is the string '0', otherwise 0.
for df in (train_df, test_df):
    df['Zero_building_id'] = (df['building_id'] == '0').astype(int)
features_to_use.append('Zero_building_id')

In [16]:
# Listing count per encoded building id over train + test.
all_building_ids = pd.concat([train_df['building_id_lbl'], test_df['building_id_lbl']])
buildings_count = all_building_ids.value_counts()

# top_<p>_building is 1 when the row's building has a listing count at or
# above the (100 - p)-th percentile of the per-building counts.
building_top_percents = (10, 25, 5, 50, 1, 2, 15, 20, 30)

for top_percent in building_top_percents:
    # Threshold and qualifying ids are computed once per percentile, not once
    # per row inside a lambda.
    count_threshold = np.percentile(buildings_count.values, 100 - top_percent)
    top_building_ids = buildings_count.index.values[buildings_count.values >= count_threshold]
    flag_col = 'top_%d_building' % top_percent
    for df in (train_df, test_df):
        df[flag_col] = df['building_id_lbl'].isin(top_building_ids).astype(int)

features_to_use.extend(['top_%d_building' % top_percent for top_percent in building_top_percents])

In [17]:
# Both shapes on one line, separated by a space; the call form runs on Python 2 and 3.
print('%s %s' % (train_df.shape, test_df.shape))

(49352, 76) (74659, 75)


In [18]:
# Integer class codes for the target; an interest level missing from y_map raises KeyError.
y_map = {'high': 0, 'medium': 1, 'low': 2}
interest_codes = train_df['interest_level'].apply(y_map.__getitem__)
train_y = interest_codes.values

In [19]:
# Bag-of-words counts for the cleaned descriptions: stemmed tokens from
# tokenize(), vocabulary limited to 200 terms.
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 200
)

# The vocabulary is learned from train and test descriptions together.
vectorizer.fit(pd.concat([train_df['clean_description'],test_df['clean_description']]))

tr_desc_sparse = vectorizer.transform(train_df["clean_description"])
te_desc_sparse = vectorizer.transform(test_df["clean_description"])

# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and keep the result a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    desc_sparse_cols = vectorizer.get_feature_names_out().tolist()
else:
    desc_sparse_cols = vectorizer.get_feature_names()

In [32]:
# Bag-of-words counts for the listing "features" lists, vocabulary limited to
# 200 terms. Note that this rebinds `vectorizer`.
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 200
)

# Each feature phrase has its spaces replaced by "_" and the phrases of one
# listing are joined with spaces.
# NOTE(review): tokenize() replaces every non-letter character, "_" included,
# with a space before tokenizing, so the joined phrases are split into single
# words again -- confirm whether that is intended.
train_df['clean_features'] = train_df['features'].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['clean_features'] = test_df['features'].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

# The vocabulary is learned from train and test together.
vectorizer.fit(pd.concat([train_df['clean_features'],test_df['clean_features']]))

tr_feat_sparse = vectorizer.transform(train_df["clean_features"])
te_feat_sparse = vectorizer.transform(test_df["clean_features"])

# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and keep the result a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    feat_sparse_cols = vectorizer.get_feature_names_out().tolist()
else:
    feat_sparse_cols = vectorizer.get_feature_names()

In [35]:
# Every artifact of this run is pickled under ../input/ with one shared
# minute-resolution timestamp in its file name.
now = datetime.now()
stamp_suffix = str(now.strftime("%Y-%m-%d-%H-%M")) + '.pkl'

# Feature list, frames and target.
name_feautres = '../input/featurestouse_' + stamp_suffix
name_train = '../input/train_' + stamp_suffix
name_test = '../input/test_' + stamp_suffix
name_y = '../input/y_' + stamp_suffix

# Sparse bag-of-words matrices.
name_tr_desc_sparse = '../input/tr_desc_sparse_' + stamp_suffix
name_tr_feat_sparse = '../input/tr_feat_sparse_' + stamp_suffix
name_te_desc_sparse = '../input/te_desc_sparse_' + stamp_suffix
name_te_feat_sparse = '../input/te_feat_sparse_' + stamp_suffix

# Column names of the sparse matrices.
name_desc_sparse_cols = '../input/desc_sparse_cols_' + stamp_suffix
name_feat_sparse_cols = '../input/feat_sparse_cols_' + stamp_suffix

# (object, destination) pairs, written in this order.
artifacts = [
    (features_to_use, name_feautres),
    (train_df, name_train),
    (test_df, name_test),
    (train_y, name_y),
    (tr_desc_sparse, name_tr_desc_sparse),
    (tr_feat_sparse, name_tr_feat_sparse),
    (te_desc_sparse, name_te_desc_sparse),
    (te_feat_sparse, name_te_feat_sparse),
    (desc_sparse_cols, name_desc_sparse_cols),
    (feat_sparse_cols, name_feat_sparse_cols),
]
for artifact, destination in artifacts:
    pd.to_pickle(artifact, destination)

In [42]:
# train_df['features'] = train_df["features"]\
#                         .apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))\
#                         .apply(lambda x: x.lower())
# test_df['features'] = test_df["features"]\
#                         .apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))\
#                         .apply(lambda x: x.lower())

# print(train_df["features"].head())
# tfidf = CountVectorizer(stop_words='english', max_features=200)
# tr_sparse = tfidf.fit_transform(train_df["features"])
# te_sparse = tfidf.transform(test_df["features"])

# sparse_features = tfidf.get_feature_names()

In [9]:
# train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
# test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

# target_num_map = {'high':0, 'medium':1, 'low':2}
# weight_num_map = {'high':1, 'medium':1, 'low':1}
# train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
# W_train = np.array(train_df['interest_level'].apply(lambda x: weight_num_map[x]))

# all_features = features_to_use + sparse_features
# print train_X.shape, test_X.shape

(49352, 220) (74659, 220)
