In [1]:
import pickle

import pandas as pd

## Heuristics 

Find discount percentage by looking at past transaction data.

In [2]:
df = pd.read_csv('./data/processed_source_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11040 entries, 0 to 11039
Data columns (total 13 columns):
brand     11040 non-null object
date      11040 non-null object
diff      11040 non-null int64
image     11040 non-null object
length    11040 non-null int64
link      11040 non-null object
nwt       11040 non-null bool
price     11040 non-null int64
size      10068 non-null float64
sold      11040 non-null bool
status    1542 non-null object
stock     3410 non-null object
title     11040 non-null object
dtypes: bool(2), float64(1), int64(3), object(7)
memory usage: 970.4+ KB


In [3]:
df2 = df[['brand', 'price', 'size', 'diff', 'length', 'nwt', 'sold']]
df2.head()

Unnamed: 0,brand,price,size,diff,length,nwt,sold
0,big_star,25,30.0,2,20,False,False
1,big_star,35,33.0,2,25,False,False
2,big_star,50,38.0,2,25,True,False
3,big_star,15,33.0,2,20,False,False
4,big_star,80,32.0,2,22,True,False


In [4]:
listed_df = df[df['sold'] == False]
listed_agg = listed_df.groupby('brand')['price', 'diff', 'length'].median().reset_index().rename(
    columns={'brand':'Brand', 'price':'Listed Price', 'diff':'Days Listed', 'length':'Title Length'})
listed_agg

Unnamed: 0,Brand,Listed Price,Days Listed,Title Length
0,big_star,35.0,59.0,34.0
1,diesel,55.0,20.0,34.0
2,gap,21.5,21.0,27.0
3,hugo_boss,45.0,94.0,34.0
4,j_crew,28.0,39.0,32.0
5,levi_s,25.0,3.0,31.0
6,lucky_brand,30.0,10.0,37.0
7,mavi,35.0,114.0,33.0
8,naked_famous_denim,75.0,131.0,44.0
9,true_religion,65.0,10.0,31.0


In [5]:
sold_df = df[df['sold'] == True]
sold_agg = sold_df.groupby('brand')['price', 'diff', 'length'].median().reset_index().rename(
    columns={'brand':'Brand', 'price':'Sold Price', 'diff':'Days Listed', 'length':'Title Length'})
sold_agg

Unnamed: 0,Brand,Sold Price,Days Listed,Title Length
0,big_star,25.0,81.0,37.0
1,diesel,30.0,30.0,31.0
2,gap,14.0,32.0,29.5
3,hugo_boss,25.0,165.5,35.0
4,j_crew,15.0,52.0,36.0
5,levi_s,17.5,4.0,26.0
6,lucky_brand,20.0,13.0,38.0
7,mavi,24.0,188.5,35.0
8,naked_famous_denim,40.0,260.0,44.0
9,true_religion,40.0,16.0,35.0


In [6]:
merged_inner = pd.merge(left=listed_agg[['Brand', 'Listed Price']], 
                        right=sold_agg[['Brand', 'Sold Price']], 
                        left_on='Brand', right_on='Brand')

In [7]:
merged_inner['Discount'] = round(1 - (merged_inner['Sold Price'] / merged_inner['Listed Price']), 2) * 100

In [8]:
merged_inner

Unnamed: 0,Brand,Listed Price,Sold Price,Discount
0,big_star,35.0,25.0,29.0
1,diesel,55.0,30.0,45.0
2,gap,21.5,14.0,35.0
3,hugo_boss,45.0,25.0,44.0
4,j_crew,28.0,15.0,46.0
5,levi_s,25.0,17.5,30.0
6,lucky_brand,30.0,20.0,33.0
7,mavi,35.0,24.0,31.0
8,naked_famous_denim,75.0,40.0,47.0
9,true_religion,65.0,40.0,38.0


In [9]:
records = merged_inner[['Brand', 'Discount']].to_dict('records')

In [10]:
discount_obj = {}

for item in records:
    brand = item['Brand']
    discount = item['Discount']
    discount_obj[brand] = discount

In [11]:
discount_obj

{'big_star': 28.999999999999996,
 'diesel': 45.0,
 'gap': 35.0,
 'hugo_boss': 44.0,
 'j_crew': 46.0,
 'levi_s': 30.0,
 'lucky_brand': 33.0,
 'mavi': 31.0,
 'naked_famous_denim': 47.0,
 'true_religion': 38.0,
 'uniqlo': 32.0,
 'wrangler': 25.0}

In [12]:
pickle.dump(discount_obj, open("models/heuristic_model.p", "wb"))