In [70]:
!pip install eli5




In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

In [3]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [4]:
ls

[0m[01;34mdata[0m/  HelloGithub.ipynb  LICENSE  [01;34mmatrix_one[0m/  README.md


In [9]:
ls data

men_shoes.csv  shoes_prices.csv


In [0]:
df=pd.read_csv('data/men_shoes.csv', low_memory=False)

In [0]:
def normalize_to_gram(line):
    if str(line) == 'nan': return -1

    value, unit = line.lower().split(' ')
    value = float(value)

    if unit == 'lbs' or unit == 'lb' or unit == 'pounds':
        value *=453.59237
    elif 'kg' == unit:
        value *= 1000
    elif 'ounces' == unit:
        value *= 28.3495231
    
    elif 'g' == unit:
        pass
    else:
        print('ERROR: ', unit)

    return value
df['weight_norm'] = df['weight'].map(normalize_to_gram)

In [188]:
df['weight_norm'].unique()

array([-1.00000000e+00,  1.36077711e+03,  9.00000000e+00,  6.57708937e+02,
        2.04116567e+02,  4.53592370e+02,  1.04326245e+02,  2.26796185e+03,
        2.49475804e+03,  3.37926316e+03,  1.81436948e+03,  1.26865250e+03,
        1.76901024e+03,  2.08652490e+03,  9.52543977e+02,  5.01537084e+02,
        6.80388555e+03,  6.80388554e+01,  4.54000000e+02,  4.76271988e+01,
        2.57980660e+02,  2.17724338e+03,  2.76691346e+03,  2.94835041e+03,
        5.00811336e+02,  1.30000000e+03,  9.10000000e+01,  9.07184740e+03,
        2.72155422e+03,  3.86000000e+02,  3.67409820e+02,  2.04116566e+03,
        1.41747615e+01,  9.07184740e+02,  1.41974412e+03,  2.67619498e+03,
        2.78959308e+03,  8.84505121e+02,  9.75223595e+02,  1.40000000e+04,
        2.17180027e+02,  4.53592370e+03,  1.72365101e+02,  1.13398093e+03,
        3.12579574e+04,  4.50000000e+01,  5.93752412e+03,  9.52543977e+01,
        7.59767220e+03,  2.85763193e+03,  2.72000000e+02,  1.80000000e+03,
        1.27005864e+03,  

In [189]:
df['weight_norm'].describe()

count    18280.000000
mean        61.990607
std        852.667090
min         -1.000000
25%         -1.000000
50%         -1.000000
75%         -1.000000
max      34509.307510
Name: weight_norm, dtype: float64

In [0]:
def run_model(feats, model= DecisionTreeRegressor(max_depth=5)):
    X=df[feats].values
    y=df['prices_amountmin'].values

    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
    return np.mean(scores), np.std(scores)

In [0]:
df['brand_cat'] = df['brand'].map(lambda x: str(x).lower()).factorize()[0]

In [192]:
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [193]:
model=RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [194]:
df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [0]:
str_dict='[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]'

In [196]:
literal_eval(str_dict)

[{'key': 'Gender', 'value': ['Men']},
 {'key': 'Shoe Size', 'value': ['M']},
 {'key': 'Shoe Category', 'value': ["Men's Shoes"]},
 {'key': 'Color', 'value': ['Multicolor']},
 {'key': 'Manufacturer Part Number', 'value': ['8190-W-NAVY-7.5']},
 {'key': 'Brand', 'value': ['Josmo']}]

In [197]:
literal_eval(str_dict)[0]['value']

['Men']

In [0]:
def parse_features(x):
    output_dict = {}
    if str(x) == 'nan': return output_dict
    
    features = literal_eval(x.replace('\\"','"'))
    for item in features:
        key = item['key'].lower().strip()
        value = item['value'][0].lower().strip()

        output_dict[key]=value
    return output_dict

df['features_parse'] = df['features'].map(parse_features)

In [199]:
keys=set()


df['features_parse'].map( lambda x: keys.update(x.keys()) )

len(keys)

476

In [200]:
df['features_parse'].head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [201]:
def get_name_feat(key):
    return 'feat_' + key
for key in tqdm_notebook(keys): 
    df[get_name_feat(key)] = df.features_parse.map( lambda feats: feats[key] if key in feats else np.nan)


HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [264]:
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2300.000
cache size	: 46080 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs
bogomips	: 4600.00
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 bits virtual
power management:

processor	:

In [205]:
df['feat_athlete'].isnull().shape[0]

18280

In [0]:
keys_stat = {}
for key in keys:
    keys_stat[key]=df[False == df[get_name_feat(key)].isnull()].shape[0] / df.shape[0] * 100

In [207]:
{k:v for k,v in keys_stat.items() if v>30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
#df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
#df['feat_color_cat'] = df['feat_color'].factorize()[0]
#df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
#df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
#df['feat_material_cat'] = df['feat_material'].factorize()[0]

#df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
#df['feat_style_cat'] = df['feat_style'].factorize()[0]

for key in keys:
    df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]

In [212]:
df['weight_norm']

0       -1.0
1       -1.0
2       -1.0
3       -1.0
4       -1.0
        ... 
18275   -1.0
18276   -1.0
18277   -1.0
18278   -1.0
18279   -1.0
Name: weight_norm, Length: 18280, dtype: float64

In [213]:
df['brand'] = df['brand'].map(lambda x : str(x).lower())
df[df.brand == df.feat_brand].shape
#df[df.brand == df.feat_brand][ ['brand','feat_brand'] ].head()

(8846, 1003)

In [0]:
feats = ['']

In [215]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(['brand_cat'], model)

(-57.264003593674545, 4.230482289195837)

In [0]:
feats_cat =[x for x in df.columns if x.endswith('_cat')]
#feats_cat - tymczasowo wyłączone


In [0]:
######
#lic=0
#kasuj = [0,2,3,4,5,6,7,8,9,10]
#for kas in kasuj:
 #   del feats_cat[kas-lic]
#    lic+=1

In [229]:
len(feats_cat)

477

In [0]:
del feats_cat[1]

In [0]:
feats_cat

In [0]:
#@title
#feats_test=['brand_cat', 'feat_material_cat', 'feat_brand_cat', 'feat_fabric content_cat', 'feat_adjustable_cat', 'feat_shoe category_cat', 
#            'feat_color_cat', 'feat_case diameter_cat', 'feat_style_cat']
feats=['brand_cat', 'feat_material_cat', 'feat_brand_cat', 'feat_fabric content_cat', 'feat_adjustable_cat', 
       'feat_color_cat', 'feat_case diameter_cat', 'feat_style_cat','feat_resizable_cat', 'weight_norm']
#feats = feats_cat
#feats += feats_cat
#feats = list(set(feats))
model = RandomForestRegressor(max_depth=5, n_estimators=100)
result = run_model(feats, model)

In [270]:
feats_test=['brand_cat', 'feat_material_cat', 'weight_norm', 'feat_brand_cat', 'feat_fabric content_cat', 'feat_adjustable_cat', 'feat_resizable_cat', 'feat_shoe category_cat', 
            'feat_color_cat', 'feat_fabric material_cat', 'feat_jacket length_cat', 'feat_case thickness_cat', 'feat_gender_cat', 'feat_case diameter_cat', 'feat_metal type_cat',
            'feat_item package quantity_cat', 'feat_style_cat']

#feats_test=['brand_cat', 'feat_material_cat', 'feat_brand_cat', 'feat_color_cat', 'feat_case thickness_cat']
#feats_test=feats_cat
#feats = feats_cat
#feats += feats_cat
#feats = list(set(feats))
test=[]
ramka=pd.DataFrame(columns=['nazwa', 'mean', 'std'])
licznik=0
for f in feats_test:
    test.append(f)
    model_test = RandomForestRegressor(max_depth=5, n_estimators=100)
    result_test = run_model(test, model_test)
    #print(f, result_test)
    wsad=[f,result_test[0],result_test[1]]
    ramka.loc[licznik]=wsad
    licznik+=1
ramka

Unnamed: 0,nazwa,mean,std
0,brand_cat,-57.349977,4.203289
1,feat_material_cat,-57.190554,4.09918
2,weight_norm,-57.180277,3.967561
3,feat_brand_cat,-57.114008,3.961564
4,feat_fabric content_cat,-57.155331,3.78913
5,feat_adjustable_cat,-57.336617,4.297345
6,feat_resizable_cat,-57.30209,4.229422
7,feat_shoe category_cat,-57.415646,4.4372
8,feat_color_cat,-57.348808,4.353311
9,feat_fabric material_cat,-57.444903,4.226708


In [0]:
plt.plot(ramka['nazwa'], ramka['mean'], '--')

In [250]:
result_test[0]

-56.99828541377301

In [286]:
X = df[ feats ].values
y = df['prices_amountmin'].values
#y = df.pop(['prices_amountmin']).values
#X = df[ feats ].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X,y)

print(result)
perm = PermutationImportance(m, random_state=1).fit(X,y);
eli5.show_weights(perm, feature_names = feats, top=30)

(-57.0376539008, 4.320244729069656)


Weight,Feature
0.2563  ± 0.0080,brand_cat
0.1106  ± 0.0104,feat_material_cat
0.0237  ± 0.0006,feat_brand_cat
0.0213  ± 0.0012,feat_case diameter_cat
0.0161  ± 0.0016,feat_fabric content_cat
0.0095  ± 0.0023,feat_adjustable_cat
0.0091  ± 0.0013,feat_color_cat
0.0089  ± 0.0026,feat_resizable_cat
0.0050  ± 0.0009,weight_norm
0.0033  ± 0.0009,feat_style_cat


In [0]:
eli5.show_weights(perm, feature_names = feats, top=30)

In [0]:
df['brand'].value_counts(normalize=True)

nike                       0.097210
puma                       0.033315
ralph lauren               0.028775
vans                       0.021116
new balance                0.020295
                             ...   
andanines                  0.000055
cabela's                   0.000055
polo sport ralph lauren    0.000055
chalktalksports            0.000055
inmonarch                  0.000055
Name: brand, Length: 1732, dtype: float64

In [0]:
df[ df['brand']=='nike'].features_parse.sample(5).values

array([{'season': 'all-season', 'occasion': 'athletic', 'material': 'suede', 'gender': 'men', 'shoe size': '11.5', 'size': '11.5', 'color': 'black', 'model': '599277 006', 'manufacturer part number': '599277 006', 'brand': 'nike', 'age group': 'adult'},
       {'condition': 'new with box', 'type': 'cleats'},
       {'sport': 'running shoes', 'occasion': 'running shoes', 'material': 'mesh', 'gender': 'men', 'shoe size': '9 us men', 'shoe category': "men's shoes", 'assembled product dimensions (l x w x h)': '12.00 x 8.00 x 6.00 inches', 'fabric content': 'mesh', 'color': 'lt retro/white/volt', 'model': '644425 402', 'shoe closure': 'lace-up', 'casual & dress shoe style': 'running shoes', 'manufacturer part number': '644425 402', 'brand': 'nike', 'shoe width': 'd(m)'},
       {'material': 'acylic', 'type': 'holiday set (beanie & stocking)'},
       {'sport': 'skate shoes', 'occasion': 'skate shoes', 'material': 'suede', 'gender': 'men', 'shoe size': '10.5 us men', 'shoe category': "men's 

In [0]:
!git add matrix_one/day4.ipynb

In [0]:
#!git config --global user.email "kozlowski.k@gmail.com"
#!git config --global user.name "Karol"

In [0]:
!git commit -m "Simple model"

In [0]:
!git push -u origin master