In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, XGBRegressor
import seaborn as sns
# Dark seaborn theme for plots; floats in displayed frames are shown with three decimals.
sns.set_style("dark")
pd.set_option('display.float_format', '{:.3f}'.format)

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
#from warnings import filterwarnings
#filterwarnings('ignore')

data = None  # placeholder; rebound to the listings DataFrame once the file is read


def absolute_correlations(col, df=None):
    """Rank the numeric columns of ``df`` by absolute correlation with ``col``.

    Parameters
    ----------
    col : str
        Label of the column in ``df`` that every numeric column is correlated
        against.
    df : pandas.DataFrame, optional
        Frame to analyse. When omitted, the module-level ``data`` is looked up
        at call time. (A ``df=data`` default would capture whatever ``data``
        held when the function was defined, i.e. ``None``.)

    Returns
    -------
    pandas.DataFrame
        A single ``correlation`` column indexed by column label, ordered by
        descending absolute correlation, without the row for ``col`` itself.

    Raises
    ------
    ValueError
        If no frame is passed and the module-level ``data`` is still ``None``.
    """
    if df is None:
        df = data
    if df is None:
        raise ValueError("no DataFrame given and the global `data` has not been loaded")

    numeric = df.select_dtypes(include=[np.number])
    corrs = numeric.corrwith(df[col]).to_frame('correlation')

    # Sort on a temporary magnitude column so the sign of each correlation is kept.
    ranked = corrs.assign(absol=corrs['correlation'].abs()).sort_values('absol', ascending=False)

    # Remove the reference column by label rather than by position: the top row
    # is not guaranteed to be `col` (ties at |r| = 1, or `col` not being numeric
    # and therefore absent from the result).
    return ranked.drop('absol', axis=1).drop(col, axis=0, errors='ignore')

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [19]:
# Read the tab-separated listings file and keep a random 10,000-row subset.
data = pd.read_csv('train.tsv', sep='\t')

# One seeded draw: `sample` already selects rows at random, so a separate full
# shuffle beforehand adds nothing, and the fixed seed makes reruns pick the
# same rows.
data = data.sample(n=10000, random_state=0)

# Pull the target out of the frame and model it on the log1p scale.
y = np.log1p(data.pop('price'))

data.shape

(10000, 7)

In [20]:
# Break the slash-delimited category path into one column per level
# (category_0, category_1, ...). Passing n=3 to `split` would cap the depth.
# `pop` removes the raw column from `data` as it is read.
cats = data.pop('category_name').str.split('/', expand=True).add_prefix('category_')

# Put the per-level columns back onto the listings.
data = pd.concat([data, cats], axis=1)

In [21]:
# One indicator column per brand, with the target appended as column 'y'
# so each brand can be correlated against it.
brands = pd.get_dummies(data['brand_name']).assign(y=y)

top_brands = absolute_correlations('y', df=brands)

# Keep the brands whose correlation with the target exceeds 0.01 in magnitude.
top_brands_by_corr = top_brands.loc[top_brands['correlation'].abs() > .01].index

print(len(top_brands_by_corr), 'brands meet this criteria. Brands most strongly correlated with price include:')

top_brands.head()

311 brands meet this criteria. Brands most strongly correlated with price include:


Unnamed: 0,correlation
Louis Vuitton,0.121
Michael Kors,0.114
Lululemon,0.104
Kendra Scott,0.092
Apple,0.088


In [22]:
# Number of listings per brand, most frequent first.
common_brands = data['brand_name'].value_counts()

# Keep only brands that account for more than one listing in a thousand.
common_brands = common_brands.loc[common_brands > len(data) / 1000]

# Remember which brands passed the frequency cut.
top_brands_by_commonality = common_brands.index

print(len(common_brands), 'brands meet this criteria. Most common brands include:')

common_brands.head(5)

105 brands meet this criteria. Most common brands include:


Nike                 390
PINK                 341
Victoria's Secret    300
LuLaRoe              217
Apple                124
Name: brand_name, dtype: int64

In [23]:
# A brand is kept when it passed the frequency cut, the correlation cut, or both.
top_brands_all = set(top_brands_by_commonality) | set(top_brands_by_corr)

len(top_brands_all)

339

In [24]:
# Replace every brand outside `top_brands_all` (missing brands included, since
# they are not members of the set) with the single level 'Other'.
brands_old = data.pop('brand_name')

# One vectorised pass is enough. The previous `for i in top_brands:` loop
# iterated over the column labels of the correlation frame, never used its
# loop variable, and recomputed the same expression on each pass.
data['brand_name'] = brands_old.where(brands_old.isin(top_brands_all), 'Other')

In [25]:
# Spot-check ten random listings: the less-noteworthy brands now read "Other".
data['brand_name'].sample(n=10)

390492              Other
1168773             Other
292061              Other
907625              Other
1332725             Other
1050822          Columbia
63172      American Eagle
454615         Kate Spade
1442357             Other
461411              Other
Name: brand_name, dtype: object

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features from the item descriptions. Missing descriptions are replaced
# by a placeholder token before vectorising.
data['item_description'] = data['item_description'].fillna('_BLANK_')
tf = TfidfVectorizer(min_df=10, max_df=.2).fit(data['item_description'])

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back otherwise.
if hasattr(tf, 'get_feature_names_out'):
    vocab = tf.get_feature_names_out()
else:
    vocab = tf.get_feature_names()

# `toarray` yields a plain ndarray (`todense` returns the legacy np.matrix type).
item_grid = pd.DataFrame(
    tf.transform(data['item_description']).toarray(),
    columns=['contains_' + term for term in vocab],
)

# Key each row by the listing's actual train_id. Rows are in the same order as
# `data`, so the values are copied positionally. Using `data.index` here is only
# correct while the index happens to equal train_id.
item_grid['train_id'] = data['train_id'].values

In [27]:
# Attach the description TF-IDF columns to the listings, matching rows on train_id.
data = data.merge(item_grid, on='train_id')

data.shape

(10000, 2374)

In [28]:
# One-hot encode each of the five category levels and the brand name.

cat0_dums, cat1_dums, cat2_dums, cat3_dums, cat4_dums = [
    pd.get_dummies(data['category_%d' % level], prefix='cat%d' % level)
    for level in range(5)
]
name_dums = pd.get_dummies(data['brand_name'], prefix='brandname')

# Append every indicator block in a single concat; column order is the same as
# adding them one at a time (category levels 0-4, then brands).
data = pd.concat(
    [data, cat0_dums, cat1_dums, cat2_dums, cat3_dums, cat4_dums, name_dums],
    axis=1,
)

data.shape

(10000, 3305)

In [29]:
# Default XGBoost regressor on every numeric column, scored by 4-fold R^2.
cv = cross_val_score(
    XGBRegressor(),
    data.select_dtypes(include=[np.number]),
    y,
    scoring='r2',
    cv=4,
)
# .266560686779  (presumably the score of an earlier run — confirm)
print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Mean score: 0.300726012807
Std Dev:    0.00642314300548


In [30]:
# Feature matrix for the experiments below: every numeric column of `data`.
x = data.select_dtypes(include=np.number)

In [31]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# PCA feeding a default XGBoost regressor; the grid search chooses between
# 1000 and 100 principal components by 3-fold R^2.
pl = Pipeline(steps=[
    ('pca', PCA()),
    ('classify', XGBRegressor()),
])

params = [{'pca__n_components': [1000, 100]}]

grid = GridSearchCV(
    pl,
    param_grid=params,
    scoring='r2',
    cv=3,
    n_jobs=-1,
).fit(x, y)

model_binary = grid.best_estimator_
print(model_binary)

# Score the selected pipeline again with 4-fold cross-validation.
cv = cross_val_score(model_binary, x, y, scoring='r2', cv=4)

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])
Mean score: 0.289374246774
Std Dev:    0.00728806061989


In [33]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Same PCA -> XGBoost pipeline, now comparing 1000 components against keeping
# one component per input column.
pl = Pipeline([
    ('pca', PCA()),
    ('classify', XGBRegressor())
])

# The upper candidate used to be the literal 3297, which matches the feature
# count only for one particular sample of the data. Reading it from `x` keeps
# the grid valid when the number of columns changes (PCA rejects n_components
# larger than the number of features).
n_features = x.shape[1]
params = [{
    'pca__n_components': [1000, n_features],
}]

grid = GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=params, scoring='r2').fit(x, y)

model_binary = grid.best_estimator_
print(model_binary)

# Score the selected pipeline again with 4-fold cross-validation.
cv = cross_val_score(model_binary, x, y, cv=4, scoring='r2')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=1000, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       ma...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])
Mean score: 0.303638788257
Std Dev:    0.0115789803086


In [34]:
# Project the numeric features onto 1000 principal components. The input is
# rebuilt from `data` instead of reading the previous `x`, so re-running this
# cell does not apply PCA to its own output.
x = PCA(n_components=1000).fit_transform(data.select_dtypes(include=[np.number]))

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

xg = XGBRegressor()

# Search over the number of trees and the tree depth with 3-fold R^2.
params = [{
    'n_estimators': [50, 200],
    'max_depth': [2, 3, 4, 5],
}]

grid = GridSearchCV(
    xg,
    param_grid=params,
    scoring='r2',
    cv=3,
    n_jobs=-1,
).fit(x, y)

model = grid.best_estimator_
print(model)

# Score the selected model again with 4-fold cross-validation.
cv = cross_val_score(model, x, y, scoring='r2', cv=4)

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

In [189]:
# TF-IDF features from the listing names.
tf = TfidfVectorizer(min_df=5, max_df=.05).fit(data['name'])

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back otherwise.
if hasattr(tf, 'get_feature_names_out'):
    vocab = tf.get_feature_names_out()
else:
    vocab = tf.get_feature_names()

# `toarray` yields a plain ndarray (`todense` returns the legacy np.matrix type).
name_grid = pd.DataFrame(
    tf.transform(data['name']).toarray(),
    columns=['name_' + term for term in vocab],
)

# Copy the ids positionally: rows of `name_grid` follow the row order of `data`,
# whereas assigning the Series itself aligns on index labels and would leave
# NaN wherever `data` is not indexed 0..n-1.
name_grid['train_id'] = data['train_id'].values

# data = pd.merge(data, name_grid, on=['train_id'])

Unnamed: 0,train_id,name,item_condition_id,shipping,item_description,category_0,category_1,category_2,category_3,category_4,...,name_zip,name_zone,cat0_Beauty,cat0_Electronics,cat0_Home,cat0_Kids,cat0_Men,cat0_Other,cat0_Sports & Outdoors,cat0_Women
0,2177,Chanel T SHIRT (L),1,1,Size Large Custom Made :),Women,Tops & Blouses,T-Shirts,,,...,0.0,0.0,0,0,0,0,0,0,0,1
1,4786,Black sandals,3,0,Sam & Libby brand sandals. Worn once for a wed...,Women,Shoes,Sandals,,,...,0.0,0.0,0,0,0,0,0,0,0,1
2,3089,Dr. Drew Beats Extra Cord Wire,1,1,This came with my Dr. Dre Solos but I don't ne...,Electronics,"TV, Audio & Surveillance",Portable Audio & Accessories,,,...,0.0,0.0,0,1,0,0,0,0,0,0
3,3842,Oral B precision clean brush heads,1,1,Both brand new still sealed,Other,Daily & Travel items,Personal Care,,,...,0.0,0.0,0,0,0,0,0,1,0,0
4,2905,Black Supreme New Era Headband,2,0,No description yet,Men,Men's Accessories,Hats,,,...,0.0,0.0,0,0,0,0,1,0,0,0
5,6508,Rae Dunn Shine Mug,1,0,"Rae Dunn Summer Collection, Shine Mug.",Home,Kitchen & Dining,Coffee & Tea Accessories,,,...,0.0,0.0,0,0,1,0,0,0,0,0
6,1810,"NWOT Coach bag & wallet, FINAL SALE[rm]",2,0,NWOT leather pebble two tone set. The colors a...,Women,Women's Handbags,Shoulder Bag,,,...,0.0,0.0,0,0,0,0,0,0,0,1
7,1153,4-pairs New VS Pink boyshort panties XS,1,0,"Please read, before buying :-). Don't ever rat...",Women,Underwear,Panties,,,...,0.0,0.0,0,0,0,0,0,0,0,1
8,2649,Rae Dunn Spoon and Dance Mug,2,0,***RESERVED FOR BARBARAARY**** 1 Brand New Rae...,Home,Kitchen & Dining,Kitchen Utensils & Gadgets,,,...,0.0,0.0,0,0,1,0,0,0,0,0
9,6742,Estée Lauder ampoules set,1,1,New Estee Lauder Advanced Night Repair Ampoule...,Beauty,Makeup,Face,,,...,0.0,0.0,1,0,0,0,0,0,0,0


In [123]:
# Default XGBoost regressor on every numeric column, scored by 4-fold R^2.
cv = cross_val_score(XGBRegressor(), data.select_dtypes(include=[np.number]), y, cv=4, scoring='r2')
# .15  (was a bare literal on its own line, i.e. a no-op expression; presumably
# a score note that lost its comment marker — confirm)
print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

ValueError: Found input variables with inconsistent numbers of samples: [14785, 7413]

In [119]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Fill missing names by assigning the column back to the frame. Calling
# `fillna(inplace=True)` on a selected column only reaches `data` through
# pandas view semantics, which newer pandas versions no longer guarantee.
data['name'] = data['name'].fillna('_BLANK_')
x = data['name']

# TF-IDF on the raw names feeding a default XGBoost regressor.
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', XGBRegressor())
])

# Search the document-frequency cut-offs; n-gram range and norm are held fixed.
params = [{
    'tfidf__max_df': np.arange(.01,.11,.01),
    'tfidf__min_df': np.arange(1,11,1),
    'tfidf__ngram_range': [(1,1)],
    'tfidf__norm': ['l2'],
}]

grid = GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=params, scoring='r2').fit(x, y)

model = grid.best_estimator_
print(model)

# Score the selected pipeline again with 4-fold cross-validation.
cv = cross_val_score(model, x, y, cv=4, scoring='r2')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.050000000000000003, max_features=None,
        min_df=5, ngram_range=(1, 1), norm='l2', preprocessor=None,
 ...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])
Mean score: 0.148614382105
Std Dev:    0.00774860153297


In [62]:
# NOTE(review): no visible cell creates a 'brand' column (the visible cells use
# 'brand_name') — confirm where it comes from before relying on this cell.
brands = pd.get_dummies(data['brand'], prefix='brand')
data = pd.concat((data, brands), axis=1)

# Default XGBoost regressor on every numeric column, scored by 4-fold R^2.
cv = cross_val_score(
    XGBRegressor(),
    data.select_dtypes(include=[np.number]),
    y,
    scoring='r2',
    cv=4,
)

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Mean score: -0.522018033764
Std Dev:    0.00297008563967


In [38]:
# Category indicators only: one-hot encode every column of `cats` and score a
# default XGBoost regressor on them with 4-fold R^2.
x = pd.get_dummies(data=cats)

cv = cross_val_score(XGBRegressor(), x, y, scoring='r2', cv=4)

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())
print('Std Dev:   ', cv.std())

Mean score: 0.161377200846
Std Dev:    0.00702022371952


In [39]:
# NOTE(review): an earlier cell deletes `category_name` from `data`, so this
# cell needs a frame that still carries that column — confirm execution order.
# Split the category path with at most three splits (n=3).
cats = data['category_name'].str.split('/', n=3, expand=True).add_prefix('category_')
data = pd.concat((data, cats), axis=1)

# Category indicators only, scored by 3-fold R^2 with a default XGBoost regressor.
x = pd.get_dummies(data=cats)

cv = cross_val_score(XGBRegressor(), x, y, scoring='r2', cv=3)

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Mean score: 0.159613443153
Std Dev:    0.00645306558285


In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Fill missing descriptions by assigning the column back to the frame. Calling
# `fillna(inplace=True)` on a selected column only reaches `data` through
# pandas view semantics, which newer pandas versions no longer guarantee.
data['item_description'] = data['item_description'].fillna(' ')
x = data['item_description']

# TF-IDF on the descriptions (terms in at least 10 documents and in at most
# 10% of them) feeding a default XGBoost regressor.
pl = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=.1, min_df=10)),
    ('classify', XGBRegressor())
])

cv = cross_val_score(pl, x, y, cv=3, scoring='r2')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Mean score: 0.140731284087
Std Dev:    0.00187840333916


ValueError: Found input variables with inconsistent numbers of samples: [14825, 1482535]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Fill missing descriptions by assigning the column back to the frame. Calling
# `fillna(inplace=True)` on a selected column only reaches `data` through
# pandas view semantics, which newer pandas versions no longer guarantee.
data['item_description'] = data['item_description'].fillna(' ')
x = data['item_description']

# TF-IDF on the descriptions feeding an ordinary least-squares regression.
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', LinearRegression())
])

# Search the document-frequency cut-offs; n-gram range and norm are held fixed.
params = [{
    'tfidf__max_df': np.arange(.01,.36,.05),
    'tfidf__min_df': np.arange(1,51,10),
    'tfidf__ngram_range': [(1,1)],
    'tfidf__norm': ['l2'],
}]

# `normalize` was removed from LinearRegression in scikit-learn 1.2. Search it
# only when the installed estimator still exposes the parameter, so the grid
# is unchanged on older versions and still runs on newer ones.
if 'normalize' in LinearRegression().get_params():
    params[0]['classify__normalize'] = [False, True]

grid = GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=params, scoring='r2').fit(x, y)

model = grid.best_estimator_
print(model)

# Score the selected pipeline again with 4-fold cross-validation.
cv = cross_val_score(model, x, y, cv=4, scoring='r2')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())