In [1]:
import pandas as pd
import os
import datetime
import re
import statsmodels.api as sm
from math import sqrt
from itertools import permutations
from collections import defaultdict, Counter
from statistics import mode

In [2]:
brand = 'Louis Vuitton'
directory = '/Users/yulia/Desktop/chanel/LV_Alma_Pm'
os.chdir(directory)

In [3]:
fabrics = {
    'Chanel': [['lambskin', 'lamb', 'lamskin'], ['patent', 'vernis'], ['caviar', 'cavier'], ['calfskin', 'calf'], ['tweed'],
               ['canvas'], ['velvet'], ['denim'], ['jersey'], ['suede'], ['goat skin', 'goatskin'], ['fabric'], ['python'],
               ['cotton'], ['satin'], ['nylon'], ['ostrich'], ['snakeskin']],
    'Fendi': [['calfskin', 'calf leather'], ['velvet'], ['python'], ['patent']],
    'Gucci': [['calfskin', 'calf', 'calfakin'], ['velvet'], ['canvas'], ['jacquard'], ['velure']],
    'Saint Laurent': [['calfskin', 'calf', 'calf-skin leather'], ['grained leather'], ['suede', 'suded'], ['crocodile', 'croc leather'],
                     ['smooth'], ['patent'], ['lambskin']],
    'Louis Vuitton': [['canvas'], ['epi leather', 'epi', 'epic'], ['patent leather', 'patent'], ['canvas leather'],
                      ['vernis leather', 'vernis']]
}

sizes = {
    'Chanel': {
        'Medium': (10, 3, 6.3, 3)
    },
    'Louis Vuitton': {
        'Pm': (12.6, 9.4, 6, 5)
    },
    'Saint Laurent': {
        'Small': (12.5, 9.8, 6.4, 5)
    },
    'Gucci' : {
        'Handle': (12.5, 8.5, 4.25, 4),
        'Mini': (7.5, 5.5, 3, 4),
        'Shoulder': (10, 6.75, 3, 4)
    },
    'Fendi': {
        'Small': (5.2, 7.5, 3.5, 3),
        'Medium': (7.3, 9.8, 4.3, 3)
    }
}

In [4]:
df = pd.DataFrame()
for file in [file for file in os.listdir() if file.endswith('.csv')]:
    MONTH, DAY = int(re.search('\_(\d+)\_(\d+)', file)[1]), int(re.search('\_(\d+)\_(\d+)', file)[2])
    df1 = pd.read_csv(file)
    df1['sc_date'] = datetime.date(year=2019, month=MONTH, day=DAY)
    df = pd.DataFrame(df.append(df1))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [5]:
df = df.replace({pd.np.nan: None})

# clean dataset

In [6]:
df = df[df['bags_brand'] == brand]
size_before = df.shape[0]

In [7]:
def material(string, brand):
    list_of_materials = []
    string_refined = str(string).lower()
    for material_list in fabrics[brand]:
        for material in material_list:
            r = r'\b%s\b'%material
            if re.search(r, string_refined):
                list_of_materials.append(material_list[0])
    if not list_of_materials:
        if 'leather' in string_refined:
            list_of_materials.append('leather')
    if list_of_materials:
        return str(list_of_materials)
    return

In [8]:
def material_refine(row, brand):
    for string in [row['bags_name'], row['bags_description'], row['bags_fabric']]:
        found_materials = material(string, brand)
        if found_materials:
            return found_materials
    return None

In [9]:
if brand in fabrics:
    df['materials_list'] = df.apply(lambda x: material_refine(x, brand), axis = 1)

In [10]:
def extract_price(text_string):
    if type(text_string) == str:
        try:
            digits = re.search('\$(.*)\.\d{2}$', text_string)[1]
            digits = ''.join(digits.split(','))
            return int(digits)
        except:
            return 'Error'
    return

In [11]:
df['bags_price_refined'] = df['bags_price'].map(lambda x: extract_price(x))
df['sold_price_refined'] = df['bags_sold_for'].map(lambda x: extract_price(x))
df['retail_price_refined'] = df['bags_retail_price'].map(lambda x: extract_price(x))

In [12]:
def bag_id(bag_dict):
    if type(bag_dict['bags_on_sale']) == str:
        res = re.search('\/(\d{5,})\/', bag_dict['bags_on_sale'])
        if res:
            return res[1]
    elif type(bag_dict['bags_sold']) == str:
        res = re.search('\/(\d{5,})\/', bag_dict['bags_sold'])
        if res:
            return res[1]
    else:
        return

In [13]:
df['id'] = df.apply(lambda x: bag_id(x), axis = 1)

In [14]:
def popular(text):
    number = str(text).split(' ')[0]
    if number.isdigit():
        return number
    else:
        return 0

In [15]:
df['likes'] = df['bags_popular'].map(popular)

In [16]:
def extract_size(text):
    try:
        l = float(re.search('(\d*\.*\d*)"L', text)[1])
        w = float(re.search('(\d*\.*\d*)"W', text)[1])
        h = float(re.search('(\d*\.*\d*)"H', text)[1])
        return l, w, h
    except:
        return (100, 100, 100)

In [17]:
def vector_distance(x, y):
    if len(x) == len(y):
        return sqrt(sum((x[i] - y[i])**2 for i in range(len(x))))
    else:
        raise Exception('Different dimension of vectors')

In [18]:
def correct_size(text):
    min_diff = 1000
    correct_size = None
    for size in sizes[brand]:
        l, w, h, diff = sizes[brand][size]
        L, W, H = extract_size(text)
        cur_diff = min([vector_distance((L,W,H), c) for c in permutations([l, w, h])])
        if cur_diff < diff and cur_diff < min_diff:
            min_diff = cur_diff
            correct_size = size
    return correct_size

In [19]:
#filter out incorrect sizes
df['size'] = df.apply(lambda x: 
                 correct_size(x['bags_size']) or correct_size(x['bags_fabric']) or None, axis = 1)

In [20]:
# add missing values if the same id has this feature in other row

for feature in ['size', 'bags_color', 'materials_list']:
    feature_dict = defaultdict(list)
    for i, row in df.iterrows():
        if row[feature] is not None:
            feature_dict[row['id']].append(row[feature])
    for item in feature_dict.keys():
        value = Counter(feature_dict[item]).most_common()[0][0]
        if value != []
            feature_dict[item] = value
        else:
            feature_dict[item] = None
    df[feature] = df['id'].map(feature_dict)

In [22]:
df['id'] = df['id'].astype(int)
df['likes'] = df['likes'].astype(int)

In [23]:
print("{0:.0%}".format(df.shape[0] / size_before))

100%


In [24]:
export_columns = ['bags_brand', 'id', 'sc_date', 'bags_color', 'bags_condition', 'materials_list', 'bags_price_refined',
                 'sold_price_refined', 'retail_price_refined', 'likes', 'size']

In [25]:
if brand == 'Gucci':
    df = df[df['sc_date'].map(lambda x: datetime.date(2019,2,13)<=x<=datetime.date(2019,5,10))]


In [26]:
df.reset_index()[export_columns].to_pickle('%s.pkl' % brand)

In [27]:
# df.reset_index()[export_columns].to_csv('refined_data_example.csv')

In [29]:
df.reset_index()[export_columns].isnull().sum()

bags_brand                   0
id                           0
sc_date                      0
bags_color                   0
bags_condition               0
materials_list               0
bags_price_refined      145456
sold_price_refined       19197
retail_price_refined     50508
likes                        0
size                         0
dtype: int64

In [35]:
df[df['bags_on_sale'].isnull()]

Unnamed: 0,bags_brand,bags_collection,bags_color,bags_condition,bags_description,bags_fabric,bags_name,bags_on_sale,bags_original_listing_price,bags_popular,...,bags_type,bags_type_url,sc_date,materials_list,bags_price_refined,sold_price_refined,retail_price_refined,id,likes,size
219,Louis Vuitton,,Damier-ebene,Gently used,Louis Vuitton Alma PM\nDamier Ebene\nCoated ca...,Coated Canvas/Leather,Alma Pm Damier Ebene Coated Canvas/Leather Sat...,,,278 people saved this item,...,Satchels,,2019-02-11,['canvas'],,550.0,,5214937,278,Pm
220,Louis Vuitton,,Damier-ebene,Gently used,Louis Vuitton Damier Ebene canvas Alma PM hand...,Coated Canvas,Alma Pm Damier Ebene Coated Canvas Satchel,,,254 people saved this item,...,Satchels,,2019-02-11,['canvas'],,572.0,1500.0,10931881,254,Pm
221,Louis Vuitton,,Purple,Gently used,"Comes with lock, keys and dust bag. LOUIS VUIT...",leather,Alma Vernis Pm In Amarante Purple Leather Satchel,,,196 people saved this item,...,Satchels,,2019-02-11,['vernis leather'],,1585.0,2600.0,11115349,196,Pm
222,Louis Vuitton,,Brown,Gently used,Condition - Very Good\nSKU - 1469\nOriginal Re...,Canvas,Alma Monogram Pm Brown Canvas Satchel,,,104 people saved this item,...,,,2019-02-11,['canvas'],,695.0,1500.0,22091523,104,Pm
223,Louis Vuitton,,Red,New with tags,Make this beauty yours!!! Authentic Louis Vuit...,NEW,Alma Epi Pm Red New Satchel,,,200 people saved this item,...,,,2019-02-11,['epi leather'],,1720.0,2120.0,6856225,200,Pm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1772,Louis Vuitton,,Monogram-lv,Like new,Zipper closure shows signs of wear and inside ...,Canvas,Alma Pm Monogram Lv Canvas Satchel,,,3 people saved this item,...,,,2019-02-21,['canvas'],,975.0,1595.0,24018182,3,Pm
1773,Louis Vuitton,,Brown,Gently used,Beautiful authentic Louis Vuitton Bag in good ...,Leather,Alma Pm Monogram Canvas Brown Leather Satchel,,,,...,,,2019-02-21,['canvas'],,400.0,1200.0,24478159,0,Pm
1774,Louis Vuitton,,Brown,Gently used,-12.8 x 9.5 x 6 inches\n- Leather key tag\n- D...,Leather,Alma Pm Damier Ebene N53151 Brown Leather Satchel,,,9 people saved this item,...,,,2019-02-21,['leather'],,1110.0,1500.0,22883663,9,Pm
1775,Louis Vuitton,,[],Like new,,"Comes with original Dustbag ,lock and key.Alma...",Alma Comes with Dustbag Lock and Key.alma Pm M...,,,,...,Louis Vuitton Satchels,,2019-02-21,[],,699.0,2000.0,22019498,0,[]
