In [1]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 17.9MB/s eta 0:00:01[K     |██████▏                         | 20kB 1.9MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.7MB/s eta 0:00:01[K     |████████████▍                   | 40kB 1.8MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.2MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.6MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 3.0MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 3.5MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 3.9MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 3.0MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 3.0MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [6]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
df = pd.read_csv('data/men_shoes.csv', low_memory=False)

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  X = df[feats].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [16]:
df['brand_cat'] = df['brand'].map(lambda x: str(x).lower()).factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [17]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [22]:
test = {'key': 'value'}
test['key']

str(test)

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [19]:
test = {'key': 'value'}
test['key']

str(test)

"{'key': 'value'}"

In [25]:
str_dict = '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]'

literal_eval(str_dict)[0]['value'][0]

'Men'

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict

  features = literal_eval(x.replace('\\"','"'))
  for item in features:
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()

    output_dict[key] = value

  return output_dict

df['features_parsed'] = df['features'].map(parse_features)

In [32]:
keys = set()


df['features_parsed'].map( lambda x: keys.update(x.keys()) )

len(keys)

476

In [33]:
df.features_parsed.head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [34]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [35]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_clothing type', 'feat_water resistant', 'feat_garment care',
       'feat_recommended use', 'feat_lens socket width', 'feat_safety feature',
       'feat_fabric care instructions',
       'feat_assembled product dimensions (l x w x h)', 'feat_nickel free',
       'feat_is wheeled'],
      dtype='object', length=526)

In [0]:
keys_stat = {}
for key in keys:  
 keys_stat[key] = df [False == df[get_name_feat(key)].isnull()].shape[0] / df.shape[0] *100

In [49]:
{k:v for k,v in keys_stat.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]

df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]

for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]

In [66]:
df['brand'] = df['brand'].map(lambda x: str(x).lower() )
df[ df.brand == df.feat_brand ].shape

(8846, 533)

In [0]:
feats = ['']

In [67]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(['brand_cat'], model)

(-57.3376469441659, 4.174591309447629)

In [79]:
feats_cat = [x for x in df.columns if 'cat' in x]
feats_cat

['categories',
 'brand_cat',
 'feat_location - country',
 'feat_shoe category',
 'feat_recommended location',
 'feat_catalog',
 'feat_multi pack indicator',
 'feat_fabrication',
 'feat_location - city/state',
 'feat_clothing category',
 'feat_certifications and listings',
 'feat_brand_cat',
 'feat_color_cat',
 'feat_gender_cat',
 'feat_manufacturer part number_cat',
 'feat_material_cat',
 'feat_sport_cat',
 'feat_style_cat',
 'feat_design_cat',
 'feat_alarm_cat',
 'feat_athlete_cat',
 'feat_light transmission (vlt)_cat',
 'feat_dial color_cat',
 'feat_ean_cat',
 'feat_has mercury_cat',
 'feat_shoe size_cat',
 'feat_width_cat',
 'feat_main colour_cat',
 'feat_batteries required?_cat',
 'feat_parts_cat',
 'feat_date first available at amazon.co.uk_cat',
 'feat_full product manual_cat',
 'feat_vendor description_cat',
 'feat_reinforced knee_cat',
 'feat_style #_cat',
 'feat_sports league_cat',
 'feat_is weather-resistant_cat',
 'feat_case type:_cat',
 'feat_sub type_cat',
 'feat_polarized

In [88]:
df['weight'].unique()

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.

In [0]:
feats =['brand_cat','feat_metal type_cat','feat_shape_cat', 'feat_brand_cat','feat_gender_cat','feat_material_cat','feat_style_cat', 'feat_sport_cat']

model = RandomForestRegressor(max_depth=5, n_estimators=100)
result = run_model(feats, model)

In [90]:
X = df[feats].values
y = df['prices_amountmin'].values


m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X,y)

print(result)
perm = PermutationImportance(m, random_state=1).fit(X,y);
eli5.show_weights(perm, feature_names=feats)

(-57.29190569391575, 4.239396380045537)


Weight,Feature
0.2557  ± 0.0080,brand_cat
0.1032  ± 0.0075,feat_material_cat
0.0252  ± 0.0026,feat_gender_cat
0.0175  ± 0.0015,feat_brand_cat
0.0127  ± 0.0011,feat_shape_cat
0.0091  ± 0.0015,feat_metal type_cat
0.0033  ± 0.0012,feat_style_cat
0.0002  ± 0.0000,feat_sport_cat


In [87]:
df [ df['brand'] == 'nike'].features_parsed.sample(5).values

array([{'season': 'all-season', 'material': 'synthetic', 'gender': 'men', 'shoe size': '9.5', 'size': '9.5', 'color': 'gray', 'model': '616190 003', 'manufacturer part number': '616190 003', 'brand': 'nike', 'age group': 'adult'},
       {'sport': 'baseball & softball', 'style': 'shorts', 'condition': 'new with tags'},
       {'sport': 'soccer', 'type': 'cleats', 'condition': 'new with box'},
       {'sport': 'football', 'main color': 'purple & green', 'type': 'cleats'},
       {'style': 'cleats', 'pattern': 'solid', 'country/region of manufacture': 'vietnam', 'condition': 'new with box'}],
      dtype=object)

In [77]:
df['feat_age group'].value_counts()

adult               4563
men                  350
child                 77
men's                 33
unisex                 6
mens                   4
infant                 4
toddler                4
boys'                  3
youth                  2
women                  2
women ,�� unisex       2
men||women             2
adult ,�� teen         1
12 up                  1
Name: feat_age group, dtype: int64

In [94]:
!git add matrix_one/

Day4.ipynb  Day5.ipynb  Dzien3.ipynb
