In [1]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 20.3MB/s eta 0:00:01[K     |██████▏                         | 20kB 3.3MB/s eta 0:00:01[K     |█████████▎                      | 30kB 4.3MB/s eta 0:00:01[K     |████████████▍                   | 40kB 3.0MB/s eta 0:00:01[K     |███████████████▌                | 51kB 3.3MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 4.0MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 4.2MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 4.5MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 5.0MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 5.0MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 5.0MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
import eli5
from eli5.sklearn import PermutationImportance
from ast import literal_eval
from tqdm import tqdm_notebook

In [3]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [4]:
df = pd.read_csv('data/men_shoes.csv', low_memory=False)
df.shape

(18280, 48)

In [0]:
def run_model(feats, model=DecisionTreeRegressor(max_depth=5)):
  X = df[feats].values
  y = df.prices_amountmin.values

  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [9]:
df['brand_cat'] = df.brand.map(lambda x: str(x).lower()).factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [10]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict

  features = literal_eval(x.replace('\\"', '"'))
  for item in features:
    key, value = item['key'].lower().strip(), item['value'][0].lower().strip()
    output_dict[key] = value
  return output_dict


df['features_parsed'] = df.features.map(parse_features)

In [17]:
df['features_parsed'].head()

0    {'gender': 'men', 'shoe size': 'm', 'shoe cate...
1    {'gender': 'men', 'shoe size': 'm', 'shoe cate...
2    {'gender': 'men', 'color': 'black', 'shipping ...
3    {'gender': 'men', 'color': 'black', 'shipping ...
4    {'gender': 'men', 'color': 'black', 'shipping ...
Name: features_parsed, dtype: object

In [18]:
keys = set()
df['features_parsed'].map(lambda x: keys.update(x.keys()))
len(keys)

476

In [22]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df['features_parsed'].map(lambda x: x[key] if key in x else np.nan)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [23]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_watch shape', 'feat_picture', 'feat_boot height',
       'feat_amazonbestsellersrank', 'feat_has adaptive lenses',
       'feat_accessory type', 'feat_assembled product dimensions (l x w x h)',
       'feat_lens width', 'feat_shoe size', 'feat_is water-resistant'],
      dtype='object', length=1002)

In [25]:
df[False == df.feat_athlete.isnull()].shape[0] / df.shape[0] * 100

0.0437636761487965

In [0]:
keys_stat = {}

for key in keys:
  keys_stat[key] = df[False == df[get_name_feat(key)].isnull()].shape[0] / df.shape[0] * 100

In [30]:
{k:v for k,v in keys_stat.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat'] = df.feat_brand.map(lambda x: str(x).lower()).factorize()[0]
df['feat_color_cat'] = df.feat_color.map(lambda x: str(x).lower()).factorize()[0]
df['feat_gender_cat'] = df.feat_gender.map(lambda x: str(x).lower()).factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].map(lambda x: str(x).lower()).factorize()[0]
df['feat_material_cat'] = df.feat_material.map(lambda x: str(x).lower()).factorize()[0]

df['feat_sport_cat'] = df.feat_sport.map(lambda x: str(x).lower()).factorize()[0]
df['feat_style_cat'] = df.feat_style.map(lambda x: str(x).lower()).factorize()[0]

for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].map(lambda x: str(x).lower()).factorize()[0]

In [42]:
df['brand'] = df['brand'].map(lambda x: str(x).lower())
df[df.brand == df.feat_brand].shape

(8846, 1002)

In [43]:
df[df.brand == df.feat_brand][['brand', 'feat_brand']].head()

Unnamed: 0,brand,feat_brand
0,josmo,josmo
1,josmo,josmo
2,servus by honeywell,servus by honeywell
3,servus by honeywell,servus by honeywell
4,servus by honeywell,servus by honeywell


In [41]:
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [44]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [65]:
feats_cat = [x for x in df.columns if '_cat' in x]
feats_cat

['brand_cat',
 'feat_catalog',
 'feat_brand_cat',
 'feat_color_cat',
 'feat_gender_cat',
 'feat_manufacturer part number_cat',
 'feat_material_cat',
 'feat_sport_cat',
 'feat_style_cat',
 'feat_heel height_cat',
 'feat_is dark sky-compliant_cat',
 'feat_chain length_cat',
 'feat_mechanic_cat',
 'feat_authenticity_cat',
 'feat_type of sole_cat',
 'feat_protects against_cat',
 'feat_manufacturer_cat',
 'feat_gold karat_cat',
 'feat_manufacturer number_cat',
 'feat_guaranteed authentic_cat',
 'feat_chronograph_cat',
 'feat_autographed_cat',
 'feat_colour code_cat',
 'feat_antiscratch lens coating_cat',
 'feat_part type_cat',
 'feat_main color_cat',
 'feat_outer material_cat',
 'feat_waist size_cat',
 'feat_age_cat',
 'feat_size/dimensions_cat',
 'feat_insulated_cat',
 'feat_fishing_cat',
 'feat_fabric care instructions_cat',
 'feat_item color_cat',
 'feat_is wheeled_cat',
 'feat_date_cat',
 'feat_release_cat',
 'feat_case type:_cat',
 'feat_frame style_cat',
 'feat_band material_cat',
 'f

In [72]:
df['weight'].unique()

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.

In [0]:
#df['weight'].map(lambda x: str(x).split(' ')[1] if (str) != 'nan' else 'nan').unique()

In [0]:
def norm_weight(weight):
  if weight is np.nan:
    return 0.0
  data = weight.split(' ')
  if data[1] == 'g':
    return float(data[0])
  if data[1] == 'lbs' or data[1] == 'pounds':
    return float(data[0]) * 453.59237
  if data[1] == 'Kg':
    return float(data[0]) * 1000.0
  if data[1] == 'ounces':
    return float(data[0]) * 28.35
  return 0.0

In [0]:
df['weight_conv'] = df.weight.map(norm_weight)

In [84]:
df[['weight','weight_conv']].sample(20)

Unnamed: 0,weight,weight_conv
4127,,0.0
9913,,0.0
17456,,0.0
12940,,0.0
2815,,0.0
7049,,0.0
17799,,0.0
7118,,0.0
17008,5.0 lbs,2267.96185
17678,,0.0


In [85]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_metal type_cat', 'feat_shape_cat',
         'feat_gender_cat', 'feat_material_cat', 'feat_sport_cat', 'feat_style_cat',
         'weight_conv']
#feats += feats_cat
#feats = list(set(feats))
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats, model)

(-57.26144791976621, 4.212702302495503)

In [86]:
X = df[feats].values
y = df.prices_amountmin.values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X, y)

perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.2595  ± 0.0112,brand_cat
0.1042  ± 0.0136,feat_material_cat
0.0246  ± 0.0021,feat_gender_cat
0.0161  ± 0.0010,feat_brand_cat
0.0127  ± 0.0007,feat_shape_cat
0.0090  ± 0.0016,feat_metal type_cat
0.0046  ± 0.0010,weight_conv
0.0026  ± 0.0004,feat_style_cat
0.0001  ± 0.0000,feat_sport_cat


In [52]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_color_cat', 'feat_gender_cat', 
         'feat_material_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats, model)

(-57.2539454079282, 4.121338845921999)

In [53]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats, model)

(-57.22201938669362, 4.166477756011591)

In [58]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_sport_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats, model)

(-57.212879749570995, 4.151278041440552)

In [59]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_style_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats, model)

(-57.19788018267518, 4.254936581971709)

In [60]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_sport_cat',
         'feat_style_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats, model)

(-57.19151948197191, 4.2530159198220225)

In [55]:
df.brand.value_counts(normalize=True)

nike            0.097210
puma            0.033315
ralph lauren    0.028775
vans            0.021116
new balance     0.020295
                  ...   
raywinter       0.000055
eames           0.000055
givenchy        0.000055
fit & fresh     0.000055
gitzo           0.000055
Name: brand, Length: 1732, dtype: float64

In [56]:
df[df.brand == 'nike'].features_parsed.head().values

array([{'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'style': 'athletic sneakers', 'condition': 'new with box'}, {}],
      dtype=object)