In [0]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 3.0MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [11]:
 cd "drive/My Drive/Colab Notebooks/matrix_transformacja"

[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks/matrix_transformacja'
/content/drive/My Drive/Colab Notebooks/matrix_transformacja


In [12]:
# List the data directory to confirm the CSV files are in place.
ls data

shoe_prices.csv  shoe_prices_filtered.csv


In [0]:
# Load the filtered shoe-price dataset into the notebook-wide frame `df`.
# low_memory=False asks pandas to parse the file in one pass instead of in chunks.
df = pd.read_csv("data/shoe_prices_filtered.csv",low_memory=False)

In [14]:
# Show the column names of the freshly loaded frame.
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension', 'ean', 'features',
       'flavors', 'imageurls', 'isbn', 'keys', 'manufacturer',
       'manufacturernumber', 'merchants', 'name', 'prices_amountmin',
       'prices_amountmax', 'prices_availability', 'prices_color',
       'prices_condition', 'prices_count', 'prices_currency',
       'prices_dateadded', 'prices_dateseen', 'prices_flavor', 'prices_issale',
       'prices_merchant', 'prices_offer', 'prices_returnpolicy',
       'prices_shipping', 'prices_size', 'prices_source', 'prices_sourceurls',
       'prices_warranty', 'quantities', 'reviews', 'sizes', 'skus',
       'sourceurls', 'upc', 'vin', 'websiteids', 'weight'],
      dtype='object')

In [0]:
def run_model(features, model=DecisionTreeRegressor(max_depth=5)):
  """Cross-validate `model` on selected columns of the notebook-wide `df`.

  Args:
    features: list of column names of `df` used as predictors.
    model: estimator handed to `cross_val_score`; defaults to a depth-5
      decision tree regressor.

  Returns:
    (mean, std) of the per-fold 'neg_mean_absolute_error' scores, so values
    closer to zero are better.
  """
  predictors = df[ features ].values
  target = df.prices_amountmin.values

  fold_scores = cross_val_score(
      model, predictors, target, scoring='neg_mean_absolute_error')
  return fold_scores.mean(), fold_scores.std()

In [23]:
# Integer-encode the lower-cased brand (missing brands become the string 'nan'
# via str()), then score the default tree on that single column.
df['brand_fact'] = pd.factorize(df['brand'].map(lambda raw: str(raw).lower()))[0]
run_model(['brand_fact'])

(-59.74991436028463, 5.857301054060102)

In [60]:
# Same single-feature baseline, scored with a seeded random forest.
model = RandomForestRegressor(random_state=16, max_depth=8)
run_model(features=['brand_fact'], model=model)

(-55.12814424858031, 5.781434924718213)

In [0]:
def parse_features(x):
  """Turn one raw `features` cell into a flat {key: value} dict.

  The cell is expected to hold the text of a list of
  {"key": ..., "value": [...]} records whose double quotes may be
  backslash-escaped. Keys and values are lower-cased and stripped; only the
  first entry of each `value` list is kept, and a repeated key overwrites the
  earlier one.

  Args:
    x: raw cell content: a string, or a missing value (NaN / None).

  Returns:
    dict mapping normalized key -> normalized first value. Empty for a
    missing cell; records whose `value` list is empty are skipped.

  Raises:
    ValueError / SyntaxError: propagated from `literal_eval` when the text is
      not a valid Python literal.
  """
  output_dict = {}
  # Missing cells arrive as float NaN (or None); anything that is not text has
  # nothing to parse. The literal string 'nan' is still treated as missing.
  if not isinstance(x, str) or x == 'nan': return output_dict
  features = literal_eval(x.replace('\\"','"'))

  for item in features:
    values = item['value']
    if not values: continue  # no first element to take
    key = item['key'].lower().strip()
    output_dict[key] = values[0].lower().strip()

  return output_dict

# Parse the raw `features` column row by row; each cell becomes a dict.
df['features_parsed'] = df.features.map(parse_features)

In [37]:
# Peek at the first five parsed feature dicts.
df['features_parsed'].head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [38]:
# Union of every key that appears in any parsed feature dict.
keys = set()
for parsed in df.features_parsed:
  keys.update(parsed.keys())
len(keys)

476

In [41]:
def get_feat_name(key):
  """Return the `df` column name used for the parsed feature `key`."""
  prefix = "feat_"
  return prefix + key

# One column per discovered key: the row's value for that key, NaN when absent.
for key in tqdm_notebook(keys):
  column = get_feat_name(key)
  df[column] = df.features_parsed.map(lambda parsed: parsed.get(key, np.nan))

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [43]:
# Check the column index again now that the feat_* columns have been added.
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_recommended location', 'feat_overall material',
       'feat_watch style', 'feat_watch shape', 'feat_lens width',
       'feat_work shoes', 'feat_guaranteed authentic', 'feat_part type',
       'feat_number of items', 'feat_batteries included?'],
      dtype='object', length=526)

In [44]:
# Coverage of each parsed key: percentage of rows with a non-null value.
total_rows = df.shape[0]
key_stats = {}

for key in tqdm_notebook(keys):
  filled_rows = int(df[get_feat_name(key)].notnull().sum())
  key_stats[key] = filled_rows / total_rows * 100

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [49]:
# Keep only the keys filled in for more than 30% of the rows.
dict(item for item in key_stats.items() if item[1] > 30)

{'brand': 48.46254497873733,
 'color': 47.62294188201941,
 'gender': 50.00545196816051,
 'manufacturer part number': 36.130192999672886,
 'material': 34.78900883218842}

In [0]:
# Integer-encode a hand-picked set of parsed feature columns into
# 'feat_<key>_fact' columns.
for feat_key in ('brand', 'color', 'gender', 'material', 'sport', 'style', 'condition'):
  df['feat_' + feat_key + '_fact'] = df['feat_' + feat_key].factorize()[0]

# Integer-encode every parsed feature column into a matching '<column>_fact'
# column. The source column must be looked up by name (get_feat_name(key)),
# not by the helper function itself, and the suffix carries the same
# underscore as the hand-written 'feat_*_fact' columns.
for key in keys:
  feat_col = get_feat_name(key)
  df[feat_col + '_fact'] = df[feat_col].factorize()[0]

In [0]:
#feats_fact = [x for x in df.columns if 'fact' in x]

In [0]:
# Feature subset for this experiment, cross-validated with the current `model`.
# `result` is the (mean, std) tuple returned by run_model; it is printed later.
feats = ['feat_gender_fact', 'feat_material_fact', 'brand_fact','feat_style_fact']
#feats = list(set(feats+feats_fact))
result = run_model(feats,model)



In [58]:
# Lower-case `brand` in place, then list rows where it disagrees with the brand
# parsed out of `features`. NaN never compares equal, so rows with no parsed
# brand show up here as well.
df['brand'] = df['brand'].map(lambda raw: str(raw).lower())
df.loc[df['brand'] != df['feat_brand'], ['feat_brand', 'brand']].head()

Unnamed: 0,feat_brand,brand
21,generic,rubies
22,generic,rubies
23,generic,rubies
24,,unbranded
31,,american fighter


In [0]:
# Predictor matrix and target as plain arrays, for fitting outside run_model.
X = df[feats].values
y = df.prices_amountmin.values

In [85]:
# Fit the model on the full data set so it can be inspected.
model.fit(X,y)

# Cross-validation score computed earlier for this feature set.
print(result)
# Permutation importance of each feature for the fitted model.
# NOTE(review): importances are computed on the same X, y the model was fitted
# on — confirm that is intended, or use a held-out split.
perm = PermutationImportance(model, random_state=16).fit(X,y)
eli5.show_weights(perm, feature_names=feats)

(-55.376853264731665, 5.9571141877184965)


Weight,Feature
0.4138  ± 0.0081,brand_fact
0.1836  ± 0.0140,feat_gender_fact
0.1516  ± 0.0190,feat_material_fact
0.0337  ± 0.0053,feat_style_fact


In [86]:
# Show which notebooks are modified or untracked before committing.
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   matrix#3.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mmatrix#2.ipynb[m
	[31mmatrix#3 (1).ipynb[m
	[31mmatrix#4.ipynb[m
	[31mmatrix#5.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [0]:
# Stage the modified matrix#3 notebook and the untracked matrix#2 notebook.
!git add matrix#3.ipynb matrix#2.ipynb 