In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load Data

In [2]:
data_path = "../input/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file).reset_index()
test_df = pd.read_json(test_file).reset_index()
ntrain = train_df.shape[0]
print train_df.shape
print test_df.shape
print ntrain

(49352, 16)
(74659, 15)
49352


In [3]:
train_df['FE_features'] = train_df["features"]\
                        .apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))\
                        .apply(lambda x: x.lower())
test_df['FE_features'] = test_df["features"]\
                        .apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))\
                        .apply(lambda x: x.lower())

print(train_df["FE_features"].head())
# tfidf = CountVectorizer(stop_words='english', max_features=200)
# tr_sparse = tfidf.fit_transform(train_df["FE_features"])
# te_sparse = tfidf.transform(test_df["FE_features"])

# sparse_features = tfidf.get_feature_names()

0                                                     
1    doorman elevator fitness_center cats_allowed d...
2    laundry_in_building dishwasher hardwood_floors...
3                               hardwood_floors no_fee
4                                              pre-war
Name: FE_features, dtype: object


In [6]:
train_df[['features',"FE_features"]].head(20)

Unnamed: 0,features,FE_features
0,[],
1,"[Doorman, Elevator, Fitness Center, Cats Allow...",doorman elevator fitness_center cats_allowed d...
2,"[Laundry In Building, Dishwasher, Hardwood Flo...",laundry_in_building dishwasher hardwood_floors...
3,"[Hardwood Floors, No Fee]",hardwood_floors no_fee
4,[Pre-War],pre-war
5,[],
6,"[prewar, elevator, Dogs Allowed, Cats Allowed,...",prewar elevator dogs_allowed cats_allowed lowr...
7,"[Doorman, Elevator, Pre-War, Terrace, Laundry ...",doorman elevator pre-war terrace laundry_in_un...
8,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",cats_allowed dogs_allowed elevator laundry_in_...
9,"[Dishwasher, Hardwood Floors]",dishwasher hardwood_floors


In [109]:
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer

#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems
######## 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 400
)


In [28]:
tr_sparse_tmp = vectorizer.fit_transform(train_df["description"])

In [110]:
train_df['clean_description'] = train_df['description'].apply(lambda x: x.replace('<p>', ''))
train_df['clean_description'] = train_df['clean_description'].apply(lambda x: x.replace('<a', ''))
train_df['clean_description'] = train_df['clean_description'].apply(lambda x: x.replace('website_redacted', '')) 
train_df['clean_description'] = train_df['clean_description'].apply(lambda x: x.replace('<br />', ''))
train_df['clean_description'] = train_df['clean_description'].apply(lambda x: x.replace('<br/>', ''))
train_df['clean_description'] = train_df['clean_description'].apply(lambda x: x.replace('&amp;', ''))

In [111]:
corpus_data_features = vectorizer.fit_transform(
    train_df.clean_description.tolist())

In [112]:
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape

(49352, 400)

In [113]:
vocab = vectorizer.get_feature_names()
# Sum up the counts of each vocabulary word
dist = np.sum(corpus_data_features_nd, axis=0)
    
# For each, print the vocabulary word and the number of times it 
# appears in the data set
for tag, count in zip(vocab, dist):
    print count, tag

1575 abund
9752 access
1497 actual
2582 addit
1845 air
1411 alcov
3890 allow
7835 amaz
9594 amen
2324 ampl
5470 ani
2220 anytim
48336 apart
20704 applianc
4115 appoint
2283 apt
11636 area
4568 art
1671 associ
2240 attend
9182 avail
2935 ave
3333 avenu
5908 away
3194 b
2818 balconi
7305 bar
9879 bath
18694 bathroom
13675 beauti
6916 bed
36059 bedroom
5338 best
1780 big
2566 bike
1277 billiard
4171 blank
8563 block
1415 boast
4648 bond
13851 br
6139 brand
1531 breakfast
3930 brick
3915 bright
9550 broker
1930 brooklyn
1804 bu
37068 build
1448 built
5284 c
4980 cabinet
2537 cabinetri
2536 cafe
15119 ceil
10197 center
8327 central
1606 chang
2151 charm
1440 chef
1975 chelsea
1392 children
10923 citi
2817 clean
7459 close
19581 closet
3423 club
23973 com
3676 come
1624 comfort
2491 complet
6469 concierg
1337 condit
14707 contact
6402 conveni
2506 convert
1520 cooper
3504 coopercoop
2479 corner
7166 counter
4069 countertop
1710 court
1336 crown
4476 custom
2444 d
2048 day
3853 deal
7540 deck

In [125]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize



In [171]:
def description_sentiment(sentences):
    analyzer = SentimentIntensityAnalyzer()
    result = []
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        result.append(vs)
    return pd.DataFrame(result).mean()

sdf = train_df.sample(50)
sdf['description_tokens'] = sdf['clean_description'].apply(sent_tokenize)
sdf['num'] = sdf['description_tokens'].apply(len)
sdf = pd.concat([sdf,sdf['description_tokens'].apply(description_sentiment)],axis=1)

In [172]:
sdf.head()

Unnamed: 0,index,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,...,price,street_address,FE_features,clean_description,description_tokens,num,compound,neg,neu,pos
6277,114305,1.0,0,08c77edf4b9f4ab2929bf41d202a1976,2016-04-08 04:29:57,,West 50th Street,"[Fitness Center, Cats Allowed, Dogs Allowed]",low,40.7625,...,3145,305 West 50th Street,fitness_center cats_allowed dogs_allowed,,[ ],1,0.0,0.0,0.0,0.0
8983,120368,1.0,0,0,2016-04-16 01:28:32,Great apartment. Priced to rent. Call Ariel Am...,Lexington Avenue,"[Dogs Allowed, Cats Allowed]",low,40.7416,...,2400,95 Lexington Avenue,dogs_allowed cats_allowed,Great apartment. Priced to rent. Call Ariel Am...,"[Great apartment., Priced to rent., Call Ariel...",3,0.2083,0.0,0.732,0.268
7080,116046,1.0,3,297160caf8a88c10718b492d4672c6b4,2016-04-21 05:17:19,NO FEE!! WELL MAINTAINED ELEVATOR BUILDING !! ...,E 30 St.,"[Elevator, Laundry in Building, Dishwasher, Ha...",low,40.7426,...,4795,230 E 30 St.,elevator laundry_in_building dishwasher hardwo...,NO FEE!! WELL MAINTAINED ELEVATOR BUILDING !! ...,"[NO FEE!!, WELL MAINTAINED ELEVATOR BUILDING !...",8,0.117938,0.197625,0.498,0.1795
2570,10588,1.0,2,f29591839a75f4f70544a5f1a9c982fc,2016-06-02 06:51:28,"Located Right off Lexington ave, this Luxury 2...",East 34th Street,"[Roof Deck, Doorman, Elevator, Pre-War, Laundr...",low,40.7469,...,3400,115 East 34th Street,roof_deck doorman elevator pre-war laundry_in_...,"Located Right off Lexington ave, this Luxury 2...","[Located Right off Lexington ave, this Luxury ...",4,0.22415,0.1165,0.74325,0.14025
37922,7407,1.0,2,65f6788d6d66716fbd8a4ef6d27d6f3f,2016-06-07 05:26:48,NO FEE**ULTRA LUXURY CONV 2 BEDROOM/ 1 BATHROO...,Crescent St.,"[Roof Deck, Doorman, Elevator, Fitness Center,...",medium,40.7487,...,2800,43-10 Crescent St.,roof_deck doorman elevator fitness_center laun...,NO FEE**ULTRA LUXURY CONV 2 BEDROOM/ 1 BATHROO...,[NO FEE**ULTRA LUXURY CONV 2 BEDROOM/ 1 BATHRO...,5,0.32708,0.0238,0.8704,0.1058


In [164]:
sdf.fillna(0, inplace=True)

In [165]:
sdf.loc[16354]

index                                                             25333
bathrooms                                                             2
bedrooms                                                              1
building_id                            e87f1236b102798c1fa104a372800444
created                                             2016-06-18 01:41:01
description                                                            
display_address                                        Greenwich Street
features              [Doorman, Prewar, Fitness Center, Elevator, Ga...
interest_level                                                      low
latitude                                                        40.7321
listing_id                                                      7179652
longitude                                                       -74.008
manager_id                             b7de4cb395920136663132057fa89d84
photos                                                          

In [167]:
sdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 35246 to 24652
Data columns (total 23 columns):
index                 5000 non-null int64
bathrooms             5000 non-null float64
bedrooms              5000 non-null int64
building_id           5000 non-null object
created               5000 non-null object
description           5000 non-null object
display_address       5000 non-null object
features              5000 non-null object
interest_level        5000 non-null object
latitude              5000 non-null float64
listing_id            5000 non-null int64
longitude             5000 non-null float64
manager_id            5000 non-null object
photos                5000 non-null object
price                 5000 non-null int64
street_address        5000 non-null object
FE_features           5000 non-null object
clean_description     5000 non-null object
description_tokens    5000 non-null object
compound              5000 non-null float64
neg                   5000 n