In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, XGBRegressor
import seaborn as sns
sns.set_style("dark")
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
#from warnings import filterwarnings
#filterwarnings('ignore')

data=None
def absolute_correlations(col, df=data):
    corrs = pd.DataFrame(df.select_dtypes(include=[np.number]).corrwith(df[col]), columns=['correlation'])
    corrs['absol'] = np.abs(corrs['correlation'])
    return corrs.sort_values('absol', ascending=False).drop('absol', axis=1).tail(len(corrs)-1)

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
data = pd.read_csv('train.tsv', sep='\t')

data = data.sample(frac=1.0)
combi = data.sample(20000)

combi['item_description'] = combi['item_description'].fillna('_BLANK_')

data = combi.head(10000) #training set
test = combi.tail(10000)

y = np.log1p(data.pop('price'))
ytest = np.log1p(test.pop('price'))

data.shape

(10000, 7)

In [3]:
# Split (sub)categories into separate columns 
cats = data['category_name'].str.split('/', expand=True) #can also add n=3
cats.columns = ['category_' + str(i) for i in cats.columns]
del data['category_name']

# Add to original dataframe
data = pd.concat([data, cats], axis=1)

In [4]:
brands = pd.get_dummies(data['brand_name'])
brands['y'] = y

top_brands = absolute_correlations('y', df=brands)

# Save the brands that are correlated with price
top_brands_by_corr = top_brands[abs(top_brands.correlation) > .01].index

print(len(top_brands_by_corr), 'brands meet this criteria. Brands most strongly correlated with price include:')

top_brands.head()

283 brands meet this criteria. Brands most strongly correlated with price include:


Unnamed: 0,correlation
Louis Vuitton,0.12
Michael Kors,0.113
Air Jordan,0.106
Kendra Scott,0.093
Lululemon,0.088


In [5]:
common_brands = data['brand_name'].value_counts()

# Take only those that appear in more than 1/1000 listings
common_brands = common_brands[common_brands > (len(data) * 1/1000)]

#Save their names
top_brands_by_commonality = common_brands.index

print(len(common_brands), 'brands meet this criteria. Most common brands include:')

common_brands[:5]

104 brands meet this criteria. Most common brands include:


Nike                 371
PINK                 356
Victoria's Secret    313
LuLaRoe              201
FOREVER 21           119
Name: brand_name, dtype: int64

In [6]:
# Combine brands meeting either criterion
top_brands_all = set(top_brands_by_commonality).union(set(top_brands_by_corr))

len(top_brands_all)

316

In [7]:
brands_old = data.pop('brand_name')

data['brand_name'] = 'Other'

for i in top_brands:
    data['brand_name'] = np.where(brands_old.isin(top_brands_all),brands_old, data['brand_name'])

In [8]:
# Lots of less-noteworthy brands are now labeled "Other"
data['brand_name'].sample(10)

1245237       Style&co.
637129            Other
193904            Other
824211            Other
492128              MAC
195074            Other
643940            Other
849893            Other
624957     Michael Kors
642388            Coach
Name: brand_name, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(min_df=10, max_df=.2).fit(combi['item_description'])

item_grid = pd.DataFrame(tf.transform(data['item_description']).todense())
item_grid.columns = ['contains_' + i for i in tf.get_feature_names()]

item_grid['train_id'] = data.index

In [10]:
# Concat item grid to original dataframe
data = pd.merge(data, item_grid, on=['train_id'])

data.shape

(10000, 3701)

In [11]:
# Dummify categories and brand names

cat0_dums = pd.get_dummies(data['category_0'], prefix='cat0')
cat1_dums = pd.get_dummies(data['category_1'], prefix='cat1')
cat2_dums = pd.get_dummies(data['category_2'], prefix='cat2')
cat3_dums = pd.get_dummies(data['category_3'], prefix='cat3')
cat4_dums = pd.get_dummies(data['category_4'], prefix='cat4')

data = pd.concat([data, cat0_dums], axis=1)
data = pd.concat([data, cat1_dums], axis=1)
data = pd.concat([data, cat2_dums], axis=1)
data = pd.concat([data, cat3_dums], axis=1)
data = pd.concat([data, cat4_dums], axis=1)

name_dums = pd.get_dummies(data['brand_name'], prefix='brandname')
data = pd.concat([data, name_dums], axis=1)

data.shape

(10000, 4634)

In [12]:
x = data.select_dtypes(include=[np.number])

In [13]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pl = Pipeline([
    ('pca', PCA()),
    ('xgboost', XGBRegressor())
])

params = [{
    'pca__n_components': [1000],
    'xgboost__n_estimators': [200],
    'xgboost__max_depth': [4],
    'xgboost__reg_lambda': [1.0,1.5]
}]

grid =\
GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=params, scoring='r2')\
.fit(x, y)

model = grid.best_estimator_
print(model)
cv = cross_val_score(model, x, y, cv=4, scoring='r2')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=1000, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('xgboost', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max...,
       reg_alpha=0, reg_lambda=1.0, scale_pos_weight=1, seed=0,
       silent=True, subsample=1))])
Mean score: 0.336957056036
Std Dev:    0.00891280115213


In [17]:
model.fit(x, y)

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=1000, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('xgboost', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max...,
       reg_alpha=0, reg_lambda=1.0, scale_pos_weight=1, seed=0,
       silent=True, subsample=1))])

In [None]:
preds = model.predict(test.select_dtypes(include=[np.number]))