In [10]:
import pandas as pd 
import numpy as np
import json
import seaborn as sb 
from sklearn.metrics import log_loss
from sklearn import linear_model 
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.stats import zscore
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [11]:
# Candidate hyper-parameter values. Later cells hand these lists to the
# parameter grids as the `n_estimators` and `learning_rate` options.
estimators = [300]
learning = [0.3,0.4,0.5]

In [15]:
# Load the training listings, forcing `description` to be parsed as text.
df = pd.read_json('data/train.json', encoding = 'utf-8', dtype = {'description': str})
# Keep only the first 20 rows (small debugging subset). The explicit copy makes
# `df` an independent frame, so later column assignments such as
# df['new'] = ... do not write into a slice of the full data
# (which raises SettingWithCopyWarning).
df = df.iloc[:20].copy()
#df = df[['description', 'interest_level']]

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


class TextTransformer(BaseEstimator, TransformerMixin):
    """Vectorise one text column of a DataFrame with a TfidfVectorizer.

    The selected column is pre-processed identically in ``fit`` and
    ``transform``: list-valued entries of the ``features`` column are joined
    into one space-separated string and HTML markup is stripped. The cleaned
    text is then fed to a ``TfidfVectorizer`` created with ``use_idf=False``,
    English stop words and the custom tokenizer below.

    Parameters
    ----------
    column : str
        Name of the DataFrame column to vectorise.
    max_features : int or None
        Vocabulary size limit forwarded to ``TfidfVectorizer``.
    """

    def __init__(self, column, max_features):
        # get_params()/clone() read attributes named exactly like the
        # constructor arguments, so both are stored under those names.
        self.column = column
        self.max_features = max_features
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = None

    @property
    def _column(self):
        """Backward-compatible read-only alias for ``column``."""
        return self.column

    def _build_vectorizer(self):
        """Create the underlying vectorizer from the current parameters."""
        return TfidfVectorizer(use_idf=False, stop_words='english',
                               tokenizer=self._custom_tokenizer, analyzer='word',
                               max_features=self.max_features)

    def _custom_tokenizer(self, string):
        """Tokenise with NLTK, map pure-digit tokens to '_NUM_', keep alphabetic tokens.

        Tokens are returned as text. Wrapping the UTF-8 bytes in ``str()``
        would yield "b'word'" under Python 3, which also defeats the
        stop-word filter.
        """
        tokens = nltk.word_tokenize(string)
        normalised = ('_NUM_' if tok.isdigit() else tok for tok in tokens)
        return [tok for tok in normalised if tok.isalpha() or tok == '_NUM_']

    def _clean_html_tags(self, content):
        """Return the visible text of ``content`` with HTML markup removed."""
        return BeautifulSoup(content, 'lxml').text

    def _prepare(self, df):
        """Return the cleaned text Series for ``self.column`` without modifying ``df``."""
        texts = df[self.column]
        if self.column == 'features':
            # Only join real sequences: joining an already-joined string would
            # insert a space between every character.
            texts = texts.apply(
                lambda value: ' '.join(value) if isinstance(value, (list, tuple)) else value)
        return texts.apply(self._clean_html_tags)

    def fit(self, df, y = None):
        """Learn the vocabulary from ``df[self.column]``; returns ``self``."""
        # Rebuild so that parameters changed through set_params() take effect.
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = self.tfidfVectorizer.fit(self._prepare(df))
        return self

    def transform(self, df, y = None):
        """Return the sparse term matrix for ``df[self.column]`` (same cleaning as ``fit``)."""
        return self._vectorizer.transform(self._prepare(df))

class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that selects a fixed set of DataFrame columns."""

    def __init__(self, cols):
        # Column label(s) that transform() will select.
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, df, y = None):
        """Return ``df[self.cols].values`` for the configured columns."""
        selected = df[self.cols]
        return selected.values

class DateExtractor(BaseEstimator, TransformerMixin):
    """One-hot encode one calendar component of a date column.

    Parameters
    ----------
    cols : str
        Name of the column holding the dates (anything ``pd.DatetimeIndex``
        accepts).
    day, month, year : bool
        Select the component to encode. ``day`` takes precedence over
        ``month``; if neither is set the year is used.

    Attributes
    ----------
    index : array of column names seen during ``fit`` (``None`` before fitting).
    """

    def __init__(self, cols, day = False, month= False, year = False):
        self.cols = cols
        self.day = day
        self.month = month
        self.year = year
        self.index = None

    def fit(self, X, y=None):
        """Record the dummy columns produced by the training data; returns ``self``."""
        self.index = self._fit_date(X[self.cols]).columns.values
        return self

    def transform(self, df, y = None):
        """Return the dummy frame with exactly the columns (and order) seen in ``fit``.

        Components missing from ``df`` become all-zero columns; components not
        seen during ``fit`` are dropped. Re-indexing against the fitted columns
        guarantees the same column order on every call, which the earlier
        set/concat construction did not.
        """
        frame = self._fit_date(df[self.cols])
        return frame.reindex(columns=self.index, fill_value=0)

    def _fit_date(self, X):
        """Return the one-hot frame for the selected component of ``X``."""
        stamps = pd.DatetimeIndex(X)
        if self.day:
            prefix, when = 'day', stamps.day
        elif self.month:
            prefix, when = 'month', stamps.month
        else:
            prefix, when = 'year', stamps.year

        return pd.get_dummies(when, prefix = prefix)

In [39]:
df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,new
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,Metropolitan
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,Columbus
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,W 13
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,East 49th
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,West 143rd


In [40]:
def _drop_street_words(address):
    """Remove the literal words 'Street' and 'Avenue'; surrounding spaces are kept."""
    for word in ('Street', 'Avenue'):
        address = address.replace(word, '')
    return address

# Shortened address column derived from display_address.
df['new'] = df['display_address'].apply(_drop_street_words)
df['new']

10           Metropolitan 
10000            Columbus 
100004               W 13 
100007          East 49th 
100013         West 143rd 
100014          West 18th 
100016         West 107th 
100020          West 21st 
100026    Hamilton Terrace
100027          522 E 11th
100030               York 
10004            W. 173rd 
100044           E 38th St
100048          West 63rd 
10005       East 56th St..
100051          East 34th 
100052            1st Ave.
100053           Thayer St
100055         West 106th 
100058                1st 
Name: new, dtype: object

In [41]:
# Scratch cell: builds a dict with the keys 'loss' and 'dual' from keyword
# arguments. The value is only displayed, not assigned to a name.
dict(loss = 'hinge', dual = True)


{'dual': True, 'loss': 'hinge'}

In [43]:
#pg = dict(clf__verbose = ['True'])
pg = {'clf__learning_rate' : learning, 'clf__n_estimators' : estimators}

#import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from Transformers import TextTransformer, ColumnExtractor
skf = StratifiedKFold(n_splits=5)
a = TextTransformer('description', max_features=3000)
b = TextTransformer('features', max_features=3000)
c = TextTransformer('street_address', max_features = 3000)
d = TextTransformer('display_address', max_features = 3000)
pipeline = Pipeline([
        ('test', FeatureUnion
         ([
            #('description', a ), # can pass in either a pipeline
            #('features', b),
            #('street', c),
            #('display', d),
            #('lat_long', ColumnExtractor(['latitude', 'longitude'])),
            ('year', DateExtractor('created', year = True)),
            ('month', DateExtractor('created', month = True)),
            ('day', DateExtractor('created', day = True))
                    
        ])),
    #('clf',xgboost.XGBClassifier(silent = False))
    ('clf',SVC(probability = True))
    ])
for train_index, test_index in skf.split(df, df['interest_level']):
    print 'starting code'
    train, train_labels = df.iloc[train_index], df['interest_level'].iloc[train_index]
    test, test_labels = df.iloc[test_index], df['interest_level'].iloc[test_index]
    pred = pipeline.fit(train,train_labels)
    print 'finished training'
    pred = (pred.predict_proba(test))
    
    #loss = logloss(pred, test_labels)
    sklearn_loss = log_loss(test_labels, pred)
    print 'Log loss from sklearn', sklearn_loss
#scores = cross_val_score(pipeline, df, df['interest_level'], cv=2)
#regr = 
#search = GridSearchCV(regr, param_grid, scoring = 'neg_log_loss', n_jobs = -1)
#pg = {'clf__C': [0.1,0.2]}

#print pipeline.get_params().keys()
#grid = GridSearchCV(pipeline, param_grid = pg, cv = 2, verbose = 10)
#grid.fit(df, df['interest_level'])

starting code
finished training
Log loss from sklearn 0.994068892176
starting code
finished training
Log loss from sklearn 0.92945600308
starting code
finished training


ValueError: y_true and y_pred contain different number of classes 2, 3. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [u'low' u'medium']

In [None]:
def feature_transform(content_lst):
    """Collapse an iterable of strings into one space-delimited string."""
    separator = ' '
    joined = separator.join(content_lst)
    return joined
    

In [None]:
# Stratified 5-fold splitter; the cross-validation loops in the following
# cells iterate over skf.split(...).
skf = StratifiedKFold(n_splits=5)

In [None]:
results = []
param_grid = dict(learning_rate = learning, n_estimators = estimators)
for train_index, test_index in skf.split(df_cleaned, df_target):
    print 'starting code'
    train, train_labels = df_cleaned.iloc[train_index], df_target.iloc[train_index]
    tf_transformer = TfidfVectorizer(use_idf=False,  stop_words = 'english', 
                                     tokenizer = custom_tokenizer, analyzer = 'word', max_features = 5000)
    tf_transformer_features = TfidfVectorizer(use_idf=False,  stop_words = 'english', 
                                     tokenizer = custom_tokenizer, analyzer = 'word', max_features = 3000)
    
    train_bow = tf_transformer.fit_transform(train['description'])
    train = train.drop(['description'], axis = 1)
    train_bow_features = tf_transformer_features.fit_transform(train['features'])
    train = train.drop(['features'], axis = 1)
    train_bow = pd.DataFrame(train_bow.todense())
    names = [str(x) for x in range(5000,5000 + train_bow_features.shape[1])]
    train_bow_features = pd.DataFrame(train_bow_features.todense())
    train_bow_features.columns = names
    
    train = train.join(train_bow)
    train = train.join(train_bow_features)
    train.fillna(0, inplace = True)
    print train.shape
    print 'Building the model'
     
    test, test_labels = df_cleaned.iloc[test_index], df_target.iloc[test_index]
    test_bow = tf_transformer.transform(test['description'])
    test_bow = pd.DataFrame(test_bow.todense())
    test_bow_features = tf_transformer_features.transform(test['features'])
    test_bow_features = pd.DataFrame(test_bow_features.todense())
    test_bow_features.columns = names
    test = test.drop(['description'], axis = 1)
    test = test.drop(['features'], axis = 1)
     
    #train.fillna('0', inplace = True)
    test = test.join(test_bow)
    test = test.join(test_bow_features)
    eval_set = [(train, train_labels), (test, test_labels)]
    regr = xgboost.XGBClassifier(silent = False)
    search = GridSearchCV(regr, param_grid, scoring = 'neg_log_loss', n_jobs = -1)
    res = search.fit(train, train_labels)
    results.append(res)

    #regr.fit(train, train_labels, eval_metric = 'mlogloss', eval_set = eval_set, verbose = True)
    
    
    #test.fillna('0', inplace = True)
    #regr = linear_model.LogisticRegression(class_weight = 'balanced', probability = True)

    print 'finished training'
    pred = (res.predict_proba(test))
    
    #loss = logloss(pred, test_labels)
    sklearn_loss = log_loss(test_labels, pred)
    print 'Log loss from sklearn', sklearn_loss
    
    #print confusion_matrix(pred, test_labels)
    #print accuracy_score(pred, test_labels)

In [None]:
for train_index, test_index in skf.split(df_cleaned, df_target):
    print 'starting code'
    train, train_labels = df_cleaned.iloc[train_index], df_target.iloc[train_index]
    tf_transformer = TfidfVectorizer(use_idf=False,  stop_words = 'english', 
                                     tokenizer = custom_tokenizer, analyzer = 'word', max_features = 5000)
    tf_transformer_features = TfidfVectorizer(use_idf=False,  stop_words = 'english', 
                                     tokenizer = custom_tokenizer, analyzer = 'word', max_features = 3000)
    
    train_bow = tf_transformer.fit_transform(train['description'])
    train = train.drop(['description'], axis = 1)
    train_bow_features = tf_transformer_features.fit_transform(train['features'])
    train = train.drop(['features'], axis = 1)
    train_bow = pd.DataFrame(train_bow.todense())
    names = [str(x) for x in range(5000,5000 + train_bow_features.shape[1])]
    train_bow_features = pd.DataFrame(train_bow_features.todense())
    train_bow_features.columns = names
    
    train = train.join(train_bow)
    train = train.join(train_bow_features)
    train.fillna(0, inplace = True)
    print train.shape
    print 'Building the model'
     
    test, test_labels = df_cleaned.iloc[test_index], df_target.iloc[test_index]
    test_bow = tf_transformer.transform(test['description'])
    test_bow = pd.DataFrame(test_bow.todense())
    test_bow_features = tf_transformer_features.transform(test['features'])
    test_bow_features = pd.DataFrame(test_bow_features.todense())
    test_bow_features.columns = names
    test = test.drop(['description'], axis = 1)
    test = test.drop(['features'], axis = 1)
     
    #train.fillna('0', inplace = True)
    test = test.join(test_bow)
    test = test.join(test_bow_features)
    eval_set = [(train, train_labels), (test, test_labels)]
    regr = xgboost.XGBClassifier(n_estimators  = 300, silent = False)
    regr.fit(train, train_labels, eval_metric = 'mlogloss', eval_set = eval_set, verbose = True)
    
    
    #test.fillna('0', inplace = True)
    #regr = linear_model.LogisticRegression(class_weight = 'balanced', probability = True)

    print 'finished training'
    pred = (regr.predict_proba(test))
    
    #loss = logloss(pred, test_labels)
    sklearn_loss = log_loss(test_labels, pred)
    print 'Log loss from sklearn', sklearn_loss
    
    #print confusion_matrix(pred, test_labels)
    #print accuracy_score(pred, test_labels)

In [None]:
# Score the competition test set with the vectorizers and model left over
# from the last cross-validation fold.
test_df = pd.read_json('data/test.json')
listing_ids = test_df['listing_id']
test_df['description'] = test_df['description'].apply(clean_html_tags)
# Copy the column subset so the assignments below act on an independent frame.
test_df = test_df[['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'description', 'features']].copy()
test_df['description'] = test_df['description'].astype(unicode)
test_df['features'] = test_df['features'].astype(unicode)
test_df_bow = tf_transformer.transform(test_df['description'])
# join() aligns on the index, so the term matrices must carry test_df's row
# labels; a default 0..n-1 index would attach them to the wrong listings.
test_df_bow = pd.DataFrame(test_df_bow.todense(), index = test_df.index)
test_df = test_df.drop(['description'], axis = 1)
test_df_bow_features = tf_transformer_features.transform(test_df['features'])
test_df_bow_features = pd.DataFrame(test_df_bow_features.todense(), index = test_df.index)
test_df_bow_features.columns = names
test_df = test_df.drop(['features'], axis = 1)
test_df = test_df.join(test_df_bow)
test_df = test_df.join(test_df_bow_features)

pred = regr.predict_proba(test_df)

# predict_proba columns are ordered like regr.classes_, so label them from
# the model instead of assuming an order.
class_order = list(regr.classes_)
if set(class_order) != set(['high', 'medium', 'low']):
    # NOTE(review): the model was not trained on the string labels, so the
    # class -> name mapping cannot be derived here. The previous hard-coded
    # order is kept; verify it against how df_target was encoded.
    class_order = ['high', 'medium', 'low']
pred = pd.DataFrame(pred, columns = class_order)

In [None]:
# Build the submission file: attach the listing ids by position, put the id
# column first and write the CSV without the index.
submission_columns = ['listing_id', 'high', 'medium', 'low']
pred['listing_id'] = listing_ids.values
pred = pred[submission_columns]
pred.to_csv('test_raw_xgboost.csv', index = False)


In [None]:
df.head()

In [None]:
# Evaluation history of the fitted XGBoost model (the cell above that fits
# `regr` passes an eval_set). Note that this rebinds `results`, which the
# grid-search cell used as a list of fitted searches.
results = regr.evals_result()

In [None]:
from matplotlib import pyplot

In [None]:
# Learning curves: multi-class log loss per boosting round for the two
# evaluation sets (validation_0 is labelled 'Train', validation_1 'Test').
train_curve = results['validation_0']['mlogloss']
test_curve = results['validation_1']['mlogloss']
epochs = len(train_curve)
x_axis = range(epochs)

for curve, curve_label in ((train_curve, 'Train'), (test_curve, 'Test')):
    pyplot.plot(x_axis, curve, label=curve_label)
pyplot.xlabel('epochs')
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.legend()
pyplot.show()
# plot classification error


In [None]:
"""
=================================================
Concatenating multiple feature extraction methods
=================================================

In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of FeatureUnion.
"""

# Author: Andreas Mueller <amueller@ais.uni-bonn.de>
#
# License: BSD 3 clause

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# Load the iris data set as a feature matrix X and a target vector y.
iris = load_iris()

X, y = iris.data, iris.target
print iris.data
# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)

svm = SVC(kernel="linear")

# The grid search over k, n_components and C is disabled (param_grid below is
# commented out); the pipeline is fitted once with the settings chosen above.
# Note: this rebinds `pipeline`, which an earlier cell also uses.

pipeline = Pipeline([("features", combined_features), ("svm", svm)])

#param_grid = dict(features__pca__n_components=[1, 2, 3],
                  #features__univ_select__k=[1, 2],
                  #svm__C=[0.1, 1, 10])
pipeline.fit(X,y)


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


class TextTransformer(BaseEstimator, TransformerMixin):
    """Vectorise the ``description`` column of a DataFrame and return a dense matrix.

    HTML markup is stripped from the text in both ``fit`` and ``transform``
    before it is passed to a ``TfidfVectorizer`` created with
    ``use_idf=False``, English stop words and the custom tokenizer below.

    Parameters
    ----------
    col : str
        Stored for get_params()/clone() but not used to select the column.
        NOTE(review): the column is hard-coded to 'description' and the calls
        in this notebook pass other names; confirm the intent before wiring
        ``col`` through.
    max_features : int, default 10
        Vocabulary size limit forwarded to ``TfidfVectorizer``.
    """

    def __init__(self,col, max_features = 10):
        # get_params()/clone() read attributes named exactly like the
        # constructor arguments, so both are stored under those names.
        self.col = col
        self.max_features = max_features
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = None
        self._column = 'description'

    def _build_vectorizer(self):
        """Create the underlying vectorizer from the current parameters."""
        return TfidfVectorizer(use_idf=False, stop_words='english',
                               tokenizer=self._custom_tokenizer, analyzer='word',
                               max_features=self.max_features)

    def _custom_tokenizer(self, string):
        """Tokenise with NLTK, map pure-digit tokens to '_NUM_', keep alphabetic tokens.

        Tokens are returned as text. Wrapping the UTF-8 bytes in ``str()``
        would yield "b'word'" under Python 3, which also defeats the
        stop-word filter.
        """
        tokens = nltk.word_tokenize(string)
        normalised = ('_NUM_' if tok.isdigit() else tok for tok in tokens)
        return [tok for tok in normalised if tok.isalpha() or tok == '_NUM_']

    def _clean_html_tags(self, content):
        """Return the visible text of ``content`` with HTML markup removed."""
        return BeautifulSoup(content, 'lxml').text

    def fit(self, df, y = None):
        """Learn the vocabulary from the cleaned column; returns ``self``."""
        # Rebuild so that parameters changed through set_params() take effect.
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = self.tfidfVectorizer.fit(df[self._column].apply(self._clean_html_tags))
        return self

    def transform(self, df, y=None):
        """Return the dense term matrix, cleaning the text exactly as ``fit`` does."""
        cleaned = df[self._column].apply(self._clean_html_tags)
        return self._vectorizer.transform(cleaned).todense()


In [None]:
# Read the training file again and rebind `df` to the complete data set
# (an earlier cell truncated `df` to its first 20 rows), then show the row count.
df = pd.read_json('data/train.json', encoding = 'utf-8', dtype = {'description': str})
len(df)
#df = df[['description', 'interest_level']]

In [None]:


from sklearn.pipeline import Pipeline, FeatureUnion
# Two text transformers placed side by side in a FeatureUnion.
# NOTE(review): if the TextTransformer defined just above in this notebook is
# the one in scope, its first argument is ignored and both branches read the
# 'description' column; confirm whether 'testing' / 'features' were intended.
a = TextTransformer('testing')
b = TextTransformer('features', max_features=10)
# `a` is rebound from the single transformer to the union wrapping it.
a = FeatureUnion([("pca", a), ('test', b)])
pipe = Pipeline([('description', a)])
X = df
y = df['interest_level'].values
pipe.fit(X,y)
# pg = {'clf__C': [0.1,1]}
# grid = GridSearchCV(pipeline, param_grid= pg ,cv = 2)
# grid.fit(df, df['interest_level'])

In [None]:
df.columns

In [None]:
# Location columns plus the interest level mapped to a number by `repl`
# (defined in a later cell of this notebook, so that cell must be run first).
# The explicit copy makes `a` independent of `df`, so the assignment below
# does not write into a column subset (SettingWithCopyWarning).
a = df[['latitude', 'longitude', 'interest_level']].copy()
a['interest_level'] = a['interest_level'].apply(repl)
#a = a.pivot_table('latitude', 'longitude', 'interest_level', aggfunc='sum')
a

In [None]:
max_lat = np.max(df['latitude'])
min_lat = np.min(df['latitude'])
mean = np.mean(df['latitude'])
std =  np.std(df['latitude'])
width = 3 * std
(n, bin, patch) = plt.hist(df['latitude'], bins = [min_lat, mean - width, mean , mean + width, max_lat])
print n 
print bin
print patch

In [None]:
# Box plot of the latitude values above 41.
df1 = df[['latitude', 'longitude', 'interest_level']]
df2 = df1[df1['latitude'] >41]
# Plot from the filtered frame itself; previously `data` pointed at the
# unfiltered `df` while the values came from `df2`. The bare `df1.head`
# attribute access (a no-op without parentheses) was dropped.
sns.boxplot(y='latitude', data = df2)

In [None]:
# Scatter of latitude against longitude, coloured by interest level, with the
# regression fit switched off.
# NOTE(review): later seaborn releases renamed `size` to `height`; confirm
# against the installed version.
sns.lmplot('latitude', 'longitude', data = df1, hue = 'interest_level', fit_reg = False, size = 10)

In [None]:
# Keep only the rows inside a bounding box (strict inequalities on both axes).
lon_in_range = (df1['longitude'] > -74.1) & (df1['longitude'] < -73.8)
lat_in_range = (df1['latitude'] > 40.5) & (df1['latitude'] < 40.9)
df1 = df1[lon_in_range & lat_in_range]

In [None]:
def repl(label):
    """Map an interest label to a number: 'low' -> 1, 'medium' -> 2, anything else -> 3."""
    return 1 if label == 'low' else (2 if label == 'medium' else 3)

In [None]:
# Replace the latitude values with the result of pd.cut(..., 10). The column
# is overwritten in `df`, so the original numeric values are no longer
# available to other cells once this has run.
df['latitude'] = pd.cut(df['latitude'], 10)


In [None]:
# Replace the longitude values with the result of pd.cut(..., 10). The column
# is overwritten in `df`, so the original numeric values are no longer
# available to other cells once this has run.
df['longitude'] = pd.cut(df['longitude'], 10)

In [None]:
a