# Chapter 3: Eniac Data Cleaning (Categories)
## Import

In [1]:
import os
# Make the sibling `data` directory the working directory, so that every later
# read and write can use bare file names.
os.chdir(os.path.join('..', 'data'))
# Names of all CSV files in that directory, in directory-listing order.
tables_csv = list(filter(lambda entry: entry.endswith('.csv'), os.listdir()))

In [2]:
import pandas as pd
# Load every CSV named in `tables_csv` into a DataFrame, keyed by the file
# name without its `.csv` suffix.  Each table is stored twice: a working copy
# (`<name>`) and an independent snapshot of the freshly read data
# (`<name>_orig`).
tables = {}
for table_csv in tables_csv:
    varname = table_csv.removesuffix('.csv')
    print(f'Reading {table_csv} into {varname} and {varname}_orig')
    tables[varname] = pd.read_csv(table_csv)
    tables[f'{varname}_orig'] = tables[varname].copy()

# Publish every table as a notebook-level variable.  `globals()` is the
# namespace that is meant to be written to; writes through the mapping
# returned by `locals()` are only guaranteed to take effect at module level.
globals().update(tables)

Reading brands.csv into brands and brands_orig
Reading orderlines.csv into orderlines and orderlines_orig
Reading orderlines_cl.csv into orderlines_cl and orderlines_cl_orig
Reading orders.csv into orders and orders_orig
Reading orders_cl.csv into orders_cl and orders_cl_orig
Reading products.csv into products and products_orig
Reading products_cl.csv into products_cl and products_cl_orig
Reading type_to_name.csv into type_to_name and type_to_name_orig
Reading type_to_name_empty.csv into type_to_name_empty and type_to_name_empty_orig


## Config

In [3]:
# Display settings for the whole notebook:
# never truncate the number of rows shown,
pd.options.display.max_rows = None
# render floats with exactly two decimals,
pd.options.display.float_format = '{:.2f}'.format
# and show up to 250 characters of each text cell.
pd.options.display.max_colwidth = 250

## Make `products.price` useful

In [4]:
# Pattern for price strings that end in a dot followed by three digits (this
# also covers strings with several dots such as "1.234.567").  Such values are
# presumably corrupted and are treated as missing below.
THREEDIGITS=r'.*\.\d\d\d'
# Always start from the snapshot `products_orig` rather than from `products`
# itself: once `price` has been converted to numbers the `.str` steps below no
# longer apply, so rebuilding `products` from its own previous value would
# make this cell fail on a second run.
products = (
    products_orig
    # missing prices become empty strings so that `.str` sees only text
    # (assumes `price` was read as text — TODO confirm)
    .assign(price=lambda x: x.price.fillna(''))
    # blank out every entry that fully matches THREEDIGITS
    .assign(price=lambda x: x.price.where(
        ~x.price.str.fullmatch(THREEDIGITS),
        ''))
    # convert to numbers; the blanked entries end up as NaN
    .assign(price=lambda x: pd.to_numeric(x.price))
)

## Word statistics by `products.type`

In [5]:
# Build one row per product `type` with the number of products, their mean
# price and the (at most) ten most frequent words found in the products'
# names and descriptions.  The result is indexed by `type` and sorted by
# `count`, largest first.
products_word_statistics = (
    products
    # join `name` and `desc` into one lower-cased text per product
    .assign(words=lambda x: (x.name + ' ' + x.desc).str.lower())
    # make sure "for" stays with the next word
    # NOTE(review): this replaces the substring 'for ' wherever it occurs, so
    # a word that merely ends in "for" is glued to its successor as well —
    # confirm that this is acceptable
    .assign(words=lambda x: x.words.str.replace('for ', 'for_'))
    # keep fixed phrase together
    .assign(words=lambda x: x.words.str.replace(
        'including parts and labor',
        'including_parts_and_labor'))
    # fix up misformatting: a quote mark that follows a space (e.g. `27 "core`)
    # is moved back in front of the space (`27" core`)
    .assign(words=lambda x: x.words.str.replace(' "', '" '))
    # split the words into a list (on whitespace)
    .assign(words=lambda x: x.words.str.split())
    # collect all words for each type into (huge) lists and count products;
    # `sum` concatenates the per-product lists, `avg_price` is the mean price
    .groupby('type')
    .agg(count = ('type','count'), avg_price=('price', 'mean'), words = ('words', 'sum'))
    # now make separate rows for each word, so we can make statistics
    .explode('words')
    # remove words that are not useful in statistics
    [lambda x: ~x.words.isin([
        'and',
        'with',
        'of',
        'including',
        'for',
        'your',
        'to',
        '-',
        '/',
        '+',
        '|',
      ])]
    # canonicalize words (only whole words equal to a key are replaced)
    .assign(words=lambda x:x.words.replace({
        'wi-fi': 'wifi'
    }))
    # count statistics of each word within each type
    # also group by `count` and `avg_price`: they are unique for each type,
    # and grouping by them keeps them out of the aggregation
    # NOTE(review): groupby drops NaN keys by default, so a type whose
    # `avg_price` is NaN would presumably disappear here — confirm
    .groupby(['type', 'count', 'avg_price'])
    .value_counts()
    # name the counts `freq`
    .rename('freq')
    # make the new `words` index an ordinary column again
    .reset_index('words')
    # select at most top-ten most frequent words; this relies on
    # `value_counts` listing the most frequent words of each group first
    # (its default sort order — TODO confirm for the pandas version in use)
    .groupby(['type', 'count', 'avg_price'])
    .head(10)
    # combine words and frequencies into a single column for display
    .assign(word_freqs=lambda x: x.words.map(str)+':'+x.freq.map(str))
    # collect all words/statistics into a single row per type
    .groupby(['type', 'count', 'avg_price'])
    .agg({'words': list, 'word_freqs': list})
    # make the `count` and `avg_price` index levels ordinary columns again,
    # leaving `type` as the index
    .reset_index(['count', 'avg_price'])
    # biggest types first
    .sort_values('count',ascending=False)
)

In [6]:
# Add `sales_count` per product type: the summed `product_quantity` of all
# order lines that belong to an order in state "Completed".
products_word_statistics = (
    products_word_statistics
    # Discard the column left behind by an earlier run of this cell; joining a
    # second `sales_count` would otherwise raise because the column names
    # overlap and no suffix is given.  On the first run this is a no-op.
    .drop(columns='sales_count', errors='ignore')
    .join(
        orderlines
        # keep only the lines of completed orders
        [lambda x: x.id_order.isin(
            orders.query('state=="Completed"').order_id
        )]
        # look up each line's product to learn its `type`
        # NOTE(review): assumes `sku` is unique in `products`; duplicated SKUs
        # would duplicate order lines and inflate the sum — confirm
        .join(products.set_index('sku'), on='sku')
        .groupby('type')
        .agg(sales_count=('product_quantity','sum')),
        # `type` is the index of the statistics table and of the aggregate
        on='type'
    )
)

In [7]:
# Add an empty `name` column as the third column (presumably to be filled in
# with a category name per type later).  `insert` changes the frame in place
# and refuses to add a column that already exists, so only insert when the
# column is missing; this keeps the cell re-runnable.
if 'name' not in products_word_statistics.columns:
    products_word_statistics.insert(2,'name','')
# Show a random selection of 20 types.
products_word_statistics.sample(20)

Unnamed: 0_level_0,count,avg_price,name,words,word_freqs,sales_count
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5384,182,148.07,,"[headphones, ipad, headset, wireless, ipod, for_iphone, bluetooth, black, beats, iphone]","[headphones:183, ipad:137, headset:123, wireless:122, ipod:116, for_iphone:114, bluetooth:113, black:57, beats:56, iphone:46]",2644.0
9094,58,272.29,,"[camera, arlo, surveillance, hd, netgear, 2, ipad, pro, night, vision]","[camera:85, arlo:45, surveillance:41, hd:39, netgear:37, 2:33, ipad:33, pro:29, night:27, vision:26]",201.0
12085400,6,27.99,,"[cleaner, ipad, spray, ipod, spray., screens, mac, iphone, display, cleaning]","[cleaner:3, ipad:3, spray:3, ipod:2, spray.:2, screens:2, mac:2, iphone:2, display:2, cleaning:2]",46.0
12655397,107,193.92,,"[hard, drive, pc, sata, 35"", mac, nas, wd, digital, western]","[hard:163, drive:138, pc:94, sata:85, 35"":65, mac:63, nas:57, wd:56, digital:53, western:53]",2985.0
85641716,33,797.54,,"[iphone, 7, apple, free, new, gold, black, 128gb, 256gb, 32gb]","[iphone:66, 7:66, apple:64, free:32, new:31, gold:24, black:24, 128gb:24, 256gb:21, 32gb:20]",328.0
1375,9,116.1,,"[microphone, usb, blue, microphones, for_mac, snowball, professional, connection, yeti, pattern]","[microphone:18, usb:14, blue:10, microphones:9, for_mac:9, snowball:5, professional:5, connection:5, yeti:4, pattern:4]",47.0
1416,24,353.39,,"[adobe, for_mac, cc, software, pc., license, mac, update, office, home]","[adobe:21, for_mac:19, cc:18, software:15, pc.:11, license:11, mac:9, update:8, office:8, home:8]",56.0
24885185,68,540.79,,"[apple, watch, 38mm, strap, case, sport, series, aluminum, gps, steel]","[apple:136, watch:136, 38mm:93, strap:65, case:62, sport:59, series:56, aluminum:47, gps:47, steel:34]",170.0
11859,4,68.99,,"[sensor, homekit, fibaro, windows, window, multi-sensor, multi-function, movement, for_doors, door]","[sensor:7, homekit:7, fibaro:5, windows:2, window:2, multi-sensor:2, multi-function:2, movement:2, for_doors:2, door:2]",25.0
24215399,41,46.19,,"[watch, charging, apple, for_apple, dock, stand, support, base, watch., cable]","[watch:66, charging:49, apple:48, for_apple:34, dock:23, stand:23, support:17, base:16, watch.:15, cable:14]",288.0


In [8]:
# Save the per-type table, with its `name` column still blank.  The relative
# path resolves against the working directory, i.e. the data directory that
# is also scanned for CSV files at the top of the notebook.
products_word_statistics.to_csv(path_or_buf='type_to_name_empty.csv')

## Specific types

In [9]:
# Look at ten randomly chosen products whose `type` is the string "5,74E+15".
products.query('type == "5,74E+15"').sample(n=10)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
5226,PAC1048,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 16GB | 1TB Flash",IMac desktop computer 27 inch 5K Retina i5 3.3GHz 16GB Flash RAM 1TB (MK482Y / A).,3589.0,32.739.902,0,"5,74E+15"
6443,PAC1069,"Apple iMac 27 ""Core i7 Retina 5K 4Hz | 16GB | 1TB Flash | R9 M395X 4GB",IMac desktop computer 27 inch 5K Retina i5 3.3GHz RAM 16GB 1TB Flash R9 M395X 4GB (MK482Y / A).,4189.0,38.449.904,0,"5,74E+15"
3877,APP1383,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 8GB | 2TB Fusion | R9 M395X 4GB",IMac desktop computer 27 inch 8GB RAM 2TB 5K Retina Fusion (MK482Y / A).,2929.0,27.895.848,0,"5,74E+15"
6856,PAC1608,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 16GB RAM | 1TB SSD",Desktop computer iMac 27-inch 3.3GHz Core i5 5K Retina | 16GB RAM | 1TB SSD | R9 M395X 4GB (MK482Y / A),3589.0,29.859.896,0,"5,74E+15"
7519,PAC1620,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 16GB RAM | 1TB SSD | Radeon R9 M395X 4GB",Desktop computer iMac 27-inch 3.3GHz Core i5 5K Retina | 16GB RAM | 1TB SSD | R9 M395X 4GB (MK482Y / A),3889.0,32.959.904,0,"5,74E+15"
4362,PAC1033,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 16GB | 2TB Fusion",IMac desktop computer 27 inch 5K Retina 4GHz i7 RAM 16GB 2TB Fusion (MK482Y / A).,3169.0,28.739.896,0,"5,74E+15"
3590,APP1378,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 8GB | 256GB Flash",IMac desktop computer 27 inch 8GB RAM 256GB Retina 5K Flash (MK482Y / A).,2929.0,27.895.848,0,"5,74E+15"
9868,PAC0974,"Apple iMac 27 ""Core i5 3.2GHz Retina 5K | 32GB | 256GB Flash",IMac desktop computer 27 inch 5K Retina i5 3.2GHz 256GB Flash RAM 32GB (MK472Y / A).,3169.0,26.309.901,0,"5,74E+15"
3445,APP1376,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 8GB | 3TB Fusion",IMac desktop computer 27 inch 5K Retina 8GB RAM 3TB Fusion (MK482Y / A).,3049.0,2.903.585,0,"5,74E+15"
5676,PAC1056,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 16GB | 3TB Fusion | R9 M395X 4GB",IMac desktop computer 27 inch 5K Retina i5 3.3GHz RAM 16GB 3TB Fusion R9 M395X 4GB (MK482Y / A).,3289.0,29.889.904,0,"5,74E+15"


## Assigned category names

In [10]:
# Display the type -> category-name table that was loaded from type_to_name.csv.
type_to_name

Unnamed: 0,type,name
0,"5,74E+15",imac_5k
1,"1,02E+12",macbook_i5
2,1282,imac
3,11935397,ext_hard_drive
4,2158,macbook_i7
5,12175397,nas
6,"2,17E+11",macbook_13
7,12215397,ssd_kit_air
8,1296,monitor
9,1405,graphics_tablet
