# Brand analysis notebook

In [4]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import json
import seaborn as sns
import bisect
from IPython.display import IFrame
%matplotlib inline

In [5]:
import findspark
findspark.init()

import pyspark
conf = pyspark.SparkConf()\
    .setMaster('local[*]')\
    .set('spark.executor.memory', '1g')\
    .set('spark.driver.memory', '1g')\
    .set('spark.executor.instances', '4')
    
sc = pyspark.SparkContext(conf=conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-2-9dcf605673d1>:7 

### Test and example

##### @todo: why do I need to run it twice?

In [7]:
# New dataset format. salesRankCat and salesRankPos are always included together
sc.textFile('data/reduced.json')\
    .map(lambda x: json.loads(x))\
    .filter(lambda x: 'price' in x and 'related' in x and 'salesRankCat' in x)\
    .take(1)

[{'asin': '9868238854',
  'average_rating': 5.0,
  'category': ['Musical Instruments',
   'Band & Orchestra',
   'Wind & Woodwinds',
   'Recorders'],
  'description': 'Woodnote Tenor Recorder  was made of High performance ABS Plastic Resin and are highly recommended by many buyers and collectors.',
  'helpful_fraction': 1.0,
  'imUrl': 'http://ecx.images-amazon.com/images/I/51h8cIN5FaL._SY300_.jpg',
  'num_reviews': 2,
  'price': 61.41,
  'related': {'also_bought': ['078661790X',
    'B001LQW2KI',
    '0882848143',
    'B001EA71R0',
    '0769219020',
    'B001LQU5W0',
    'B008DYGKEU'],
   'buy_after_viewing': ['B0002MS7O2',
    'B0002D00QE',
    'B000RW20Q8',
    '0882848143']},
  'salesRankCat': 'Musical Instruments',
  'salesRankPos': 60253,
  'title': 'Woodnote Wood Grain Tenor Recorder-Baroque fingering/3 pieces construction'}]

### Load dataset, select category, filter entries

In [8]:
# Collect the products in the "Headphones" category
#cat = ['Electronics', 'Accessories & Supplies', 'Audio & Video Accessories', 'Headphones']
cat = ['Electronics']
records = sc.textFile('data/reduced.json')\
    .map(lambda x: json.loads(x))\
    .filter(lambda x: 'salesRankCat' in x and 'price' in x and 'brand' in x and 'related' in x and 'average_rating' in x)\
    .filter(lambda x: len(x['category']) >= len(cat) and x['category'][:len(cat)] == cat)\
    .collect()

### Create graph and cliques

In [9]:
# Convert string IDs to numeric IDs
name_id_mapping = {}
for i, record in enumerate(records):
    name_id_mapping[record['asin']] = i
    
adj_list = []
adj_set = []
for record in records:
    # Buy after viewing: add edge
    if 'related' in record and 'buy_after_viewing' in record['related']:
        av = record['related']['buy_after_viewing']
    else:
        av = []
    av = list(filter(lambda x: x in name_id_mapping, av))
    adj_list.append(sorted(set([name_id_mapping[x] for x in av])))
    
    # Bought together: remove edge (if added earlier)
    if 'related' in record and 'bought_together' in record['related']:
        bt = record['related']['bought_together']
    else:
        bt = []
    bt = list(filter(lambda x: x in name_id_mapping, bt))
    bt = [name_id_mapping[x] for x in bt]
    for o in bt:
        if o in adj_list[-1]:
            adj_list[-1].remove(o)
    adj_set.append(frozenset(adj_list[-1]))

# Transposed adjacency list (represents the incoming edges in every node)
adj_list_incoming = []
for adj in adj_list:
    adj_list_incoming.append([])
    
for i, adj in enumerate(adj_list):
    for node in adj:
        adj_list_incoming[node].append(i)

In [10]:
cliques = []
def extract_all_cliques(nodes, prev_list):
    count = 0
    m = 0 if len(prev_list) == 0 else bisect.bisect_right(nodes, prev_list[-1])
    for a in nodes[m:]:
        connected = True
        for test in prev_list:
            if test not in adj_set[a] or a not in adj_set[test]:
                connected = False
                break
        
        if connected:
            prev_list.append(a)
            if extract_all_cliques(adj_list[a], prev_list) == 0:
                cliques.append(prev_list.copy())
                count += 1
            prev_list.pop()
    return count

In [11]:
cliques = []
print(extract_all_cliques(range(len(adj_list)), []), 'cliques found')

25276 cliques found


In [12]:
for c in cliques:
    if len(c) < 3:
        continue
    for i in c:
        print('*',records[i]['title'], ' - $', records[i]['price'])
    print('------')

* 9 Cell,11.10V,6600mAh,Li-ion,Replacement Laptop Battery for Dell Latitude E5400, Latitude E5410, Latitude E5500, Latitude E5510,Compatible part number of Dell:0RM668, 312-0762, 312-0769, 312-0902, 451-10616, 451-10617, KM742, KM769, KM771, WU841,  - $ 16.1
* Replacement Laptop Battery for Dell Latitude E5500, 7200mAh 9-Cell  - $ 36.1
* 11.10V,4800mAh,Li-ion,Hi-quality Replacement Laptop Battery for Dell Latitude E5400, Latitude E5410, Latitude E5500, Latitude E5510, This laptop battery can replace the following part numbers of Dell: 312-0762, 312-0769, 451-10616, KM742, KM769  - $ 14.87
------
* Laptop/Notebook Battery for IBM lbitt30l LIB-45 T30 2366 2367 02K7034 02K7033 02K7037 02K7038 02K7050 02K7051 02K7072 02K7073  - $ 28.14
* New Replacement Li-Ion Battery for IBM ThinkPad T30 Series Laptops / Replacem...  - $ 32.76
* Laptop Battery for IBM 02k7033 nib033 ThinkPad T30 02K7072 02K7034  - $ 31.77
------
* Genuine Dell PA-3E 90 Watt 3-Prong Slim AC Adapter with 6.56 ft  Power Cord

* HP PAVILION G7-1260US Laptop Screen 17.3 LED BOTTOM LEFT WXGA++ 1600x900  - $ 65.01
* HP Pavilion G7 17.3&quot; HD (1600 x 900) Glossy Replacement LED LCD Screen Bottom Left Connection fits G7-1260US, G7-1310US, G7-2220US, G7-2022US, G7-2240US, G7-2243US, G7-2270US  - $ 65.01
* HP PAVILION G7-1070US LAPTOP LCD SCREEN 17.3&quot; WXGA++ LED DIODE (SUBSTITUTE REPLACEMENT LCD SCREEN ONLY. NOT A LAPTOP )  - $ 65.01
------
* HP PAVILION G7-1260US Laptop Screen 17.3 LED BOTTOM LEFT WXGA++ 1600x900  - $ 65.01
* HP PAVILION G7-1219WM Laptop Screen 17.3 LED BOTTOM LEFT WXGA++ 1600x900  - $ 65.01
* HP PAVILION G7-1070US LAPTOP LCD SCREEN 17.3&quot; WXGA++ LED DIODE (SUBSTITUTE REPLACEMENT LCD SCREEN ONLY. NOT A LAPTOP )  - $ 65.01
------
* Celestron COSMOS FirstScope Telescope  - $ 54.95
* Orion 10033 FunScope 76mm TableTop Reflector Telescope Moon Kit (Blue)  - $ 59.99
* Celestron 21024 FirstScope Telescope  - $ 49.95
------
* ATC 10.8v, 6 Cells, 5200mAh/56whr, High Capacity Battery for ASUS E

* NEW DELL N5040 15.6&quot; LAPTOP LCD LED SCREEN (LED Replacement Screen Only. Not A Laptop )  - $ 37.44
* New 15.6&quot; LED Screen for Dell Inspiron N5040 Laptop HD Glossy LCD  - $ 49.17
------
* HP 593572-001 Laptop Battery - Premium Superb Choice&reg; 6-cell Li-ion battery  - $ 16.99
* Original Battery For HP ProBook 4321s 4320s 4325s 4520s 4525s HSTNN-DB1A HSTNN-CB1A  - $ 38.68
* HP ProBook 4321s 4320t 4325s 4326s 4420s 4421s 4425s 4525s 4520 520s Laptop Battery - Premium Superb Choice&reg; 6-cell Li-ion battery  - $ 20.99
------
* Acer Aspire 5250-BZ873 Laptop LCD Screen Replacement 15.6&quot; WXGA HD LED  - $ 40.86
* ACER ASPIRE 5250-BZ455 LAPTOP LCD SCREEN 15.6&quot; WXGA HD LED DIODE (SUBSTITUTE REPLACEMENT LCD SCREEN ONLY. NOT A LAPTOP )  - $ 37.8
* NEW ACER AS5250-BZ873 P5WE6 15.6 WXGA 1366X768 LED Screen (LED Replacement Screen Only. Not A Laptop )  - $ 54.5
------
* AC Adapter Power Supply Charger+Cord for Acer Aspire 1412LCi 3610WLCi Extensa 4420 4630z 5420 5620 5620-1A1

### Compute fan-in and fan-out

In [13]:
fan_in = list(map(lambda x: len(x), adj_list_incoming))
fan_out = list(map(lambda x: len(x), adj_list))
df = pd.DataFrame()
df['fan_in'] = fan_in
df['fan_out'] = fan_out
df.index = pd.DataFrame(records)['asin']
df.sort_values('fan_in', ascending=False).head()

Unnamed: 0_level_0,fan_in,fan_out
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
B003U8CRGY,186,1
B007W1QBO4,165,2
B0051B4FP4,141,4
B005C31HC0,140,3
B005J963W6,117,4


### Brand analysis

#### Find brands in cliques

In [16]:
brands = []
dimension_min = 2

for c in cliques:
    if len(c) < dimension_min:
        continue
    for i in c:
        if 'brand' in records[i]:
            brands.append(records[i]['brand'])
            
print(str(len(brands)) + " products are associated to a brand.")
brands_set= sorted(set(brands))
print("There are " + str(len(brands_set)) + " brands in cliques.")
#print(brands_set)

7935 products are associated to a brand.
There are 523 brands in cliques.


In [17]:
brand_data = pd.DataFrame(brands_set, columns = ['brand'])
#brand_data = brand_data.replace("", "Empty")
brand_data = brand_data.set_index('brand')
#brand_data = brand_data.loc[brand_data['most sold'] > 10]

brand_data.sort_index().head()

(CHARGER4U)
3CLeader
6Ave
ADC


#### Find most sold brand in each clique according to sale rank

In [18]:
brand_data['most sold'] = 0

for c in cliques:
    # check cliques have min dimension
    if len(c) < dimension_min:
        continue
    # check brand is different for each product
    brand_check = []
    for i in c:
        brand_check.append(records[i]['brand'])
    if len(list(set(brand_check))) < len(c):
        continue
    # check sale rank category is the same for every article
    checkCat = []
    for i in c:
        checkCat.append(records[i]['salesRankCat'])
    if len(set(checkCat)) != 1:
        continue
    # min sale rank
    sr_min = min([records[i]['salesRankPos'] for i in c])
    # most sold products (some products may have the same rate)
    most_sold = [i for i in c if records[i]['salesRankPos'] == sr_min]
    # check that not every product is most sold
    if len(most_sold) < len(c):
    # increment most sold brands
        for i in most_sold:
            brand_data.loc[records[i]['brand'], 'most sold'] += 1
            

#### Find most rated brand in each clique according to avg rate

In [20]:
brand_data['most rated'] = 0

for c in cliques:
    # check cliques have min dimension
    if len(c) < dimension_min:
        continue
    # check brand is different for each product
    brand_check = []
    for i in c:
        brand_check.append(records[i]['brand'])
    if len(list(set(brand_check))) < len(c):
        continue
    # max rating
    r_max = max([records[i]['average_rating'] for i in c])
    # most rated products (some products may have the same rate)
    most_rate = [i for i in c if records[i]['average_rating'] == r_max]
    # check that there is at least a product less rated than the others
    if len(most_rate) < len(c):
    # increment most rated brands
        for i in most_rate:
            brand_data.loc[records[i]['brand'], 'most rated'] += 1


#### Find  most expensive and cheapest brand in each clique 

In [22]:
brand_data['cheapest'] = 0
brand_data['most expensive'] = 0

for c in cliques:
    # check cliques have min dimension
    if len(c) < dimension_min:
        continue
    # check brand is different for each product
    brand_check = []
    for i in c:
        brand_check.append(records[i]['brand'])
    if len(list(set(brand_check))) < len(c):
        continue
    # max and min price
    p_max = max([records[i]['price'] for i in c])
    p_min = min([records[i]['price'] for i in c])
    # most expensive products (some products may have the same price)
    most_exp = [i for i in c if records[i]['price'] == p_max]
    # check that there is at least a product cheaper than the others
    if len(most_exp) < len(c):
    # increment most expensive brands
        for i in most_exp:
            brand_data.loc[records[i]['brand'], 'most expensive'] += 1
    # cheapest products (some products may have the same price)
    cheap = [i for i in c if records[i]['price'] == p_min]
    # check that there is at least a product more expensive than the others
    if len(cheap) < len(c):
    # increment cheapest brands
        for i in cheap:
            brand_data.loc[records[i]['brand'], 'cheapest'] += 1
            

#### Find most sold brand in each clique according to fan in

In [24]:
brand_data['most bought'] = 0

for c in cliques:
    # check cliques have min dimension
    if len(c) < dimension_min:
        continue
    # check brand is different
    brand_check = []
    for i in c:
        brand_check.append(records[i]['brand'])
    if len(list(set(brand_check))) < len(c):
        continue
    v_max = max([fan_in[i] for i in c])
    most_bought = [i for i in c if fan_in[i] == v_max]
    # the products cannot be compared because they ALL have the same fan in
    if len(most_bought) == len(c):
        continue
    # however, if multiple products have same fan in and there are other products, the each of the former is incremented
    for i in most_bought:
        brand_data.loc[records[i]['brand'], 'most bought'] += 1
    

In [25]:
# filter dataframe
#brand_data = brand_data.rename(index={'': 'empty'})
#brand_data = brand_data.loc[brand_data['most sold'] > 10]

brand_data.sort_index()

Unnamed: 0_level_0,most sold,most rated,cheapest,most expensive,most bought
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,25,24,33,28,34
(CHARGER4U),2,0,2,0,2
3CLeader,0,2,1,3,2
6Ave,1,1,1,0,0
ADC,0,0,0,0,0
AGPtek,8,15,6,19,17
AKG Pro Audio,0,0,0,0,0
AMA,1,0,0,1,0
AOC,2,2,4,1,5
AP,1,0,1,0,1


In [26]:
print(brand_data.loc['Samsung'])
print(brand_data.loc['LG'])

most sold         12
most rated        20
cheapest          22
most expensive    17
most bought       16
Name: Samsung, dtype: int64
most sold         13
most rated        19
cheapest          15
most expensive    17
most bought       11
Name: LG, dtype: int64


#### "Co occurence" matrix

For each bran row, the brand column indicates how many time it lost in terms of fan in

In [27]:
print("Fan-in analysis (most bought).")
bought_occ = pd.DataFrame(brands_set, columns = ['brand'])
bought_occ = bought_occ.set_index('brand')
for b in brands_set:
    bought_occ[b] = 0
bought_occ.index.rename('most bought', inplace = True)
    
# auxiliary statistics
clique_count = 0
valid_clique_count = 0
for c in cliques:
    # check cliques have min dimension
    if len(c) < dimension_min:
        continue
    # check all different brands
    clique_count += 1
    brand_check = []
    for i in c:
        brand_check.append(records[i]['brand'])
    if len(list(set(brand_check))) < len(c):
        continue
    #print(brand_check)
    valid_clique_count += 1
    v_max = max([fan_in[i] for i in c])
    # if same fan in there is no best: eg (samsung, 4) (HP, 4) none of the two is incremented
    most_bought = [i for i in c if fan_in[i] == v_max]
    others = [i for i in c if fan_in[i] != v_max]
    for i in most_bought:
        for j in others:
            bought_occ.loc[records[i]['brand'], records[j]['brand']] += 1
    
    
#brand_data
print("Percentage of valid cliques: ", valid_clique_count/clique_count)
print("Table dimension: ", bought_occ.shape)
#bought_occ

Fan-in analysis (most bought).
Percentage of valid cliques:  0.46147672552166935
Table dimension:  (523, 523)


In [28]:
# filter table
bought_occ = bought_occ.loc[(bought_occ.sum(axis=1) != 0), (bought_occ.sum(axis=0) != 0)]
#bought_occ.drop(["", "Unknown"], inplace=True, axis=1)
#bought_occ.drop(["", "Unknown"], inplace=True, axis=0)
#bought_occ.loc['Samsung']
bought_occ

Unnamed: 0_level_0,Unnamed: 1_level_0,6Ave,AGPtek,ASA,ASR,AU OPTRONIC,AU Optronics,AUO,Acer,Achi,...,fds,generic,iLive,iRulu,laptop-computer-batteries,leegoal,sunvalleytek,tMate Powers,upbright,yallstore
most bought,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(CHARGER4U),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3CLeader,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AGPtek,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AOC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ASA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AU Optronics,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
AUO,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Accessory Power,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
for b in bought_occ.index:
    #print('Brand: ', b, ' , other choices: ', bought_occ.loc[b, bought_occ.loc[b] != 0])
    print('Brand: ', b)
    print(bought_occ.loc[b, bought_occ.loc[b] != 0])
    print('------')


Brand:  
AU Optronics                         1
Apple                                1
Asus                                 8
Bay Valley Parts                     1
GATEWAY                              2
GEP                                  1
Gateway                              4
Generic                              3
LG                                   2
Laptopmate                           1
MLC Distribution                     2
SIB                                  3
Superb Choice                        2
Tech Rover                           4
[Ships from and sold by power198]    1
Name: , dtype: int64
------
Brand:  (CHARGER4U)
Bay Valley Parts    1
battery_king        1
Name: (CHARGER4U), dtype: int64
------
Brand:  3CLeader
Apple      1
Generic    1
Name: 3CLeader, dtype: int64
------
Brand:  AGPtek
Atc                     1
Battery1inc             3
BrainyTrade             2
Etree                   1
Fusion                  1
IBM                     2
LB1 High Performance    1

Brand:  Dahua
HIKVISION USA INC    1
Name: Dahua, dtype: int64
------
Brand:  Dayton Audio
Theater Solutions    1
Name: Dayton Audio, dtype: int64
------
Brand:  Deer Park Distributors
Crosley    1
Name: Deer Park Distributors, dtype: int64
------
Brand:  DekCell
Atc                  3
Bay Valley Parts     4
Best Compu           2
Electricking         2
Laptop Battery HQ    3
MPower               2
PowerSmart           1
SIB                  5
SIB-CORP             2
Superb Choice        5
Unknown              5
battery_king         2
Name: DekCell, dtype: int64
------
Brand:  Dekcell
Atc           1
Brainydeal    1
IBM           2
SIB           3
Unknown       1
Xtend         1
Name: Dekcell, dtype: int64
------
Brand:  Dell
AU OPTRONIC               3
AUO                       2
Acer                      1
Amsahr                    1
Bay Valley Parts          3
BetterStuff LowerPrice    2
Buy-Batteries             1
CBD&amp;reg;              2
CHIMEI                    1
Delta Electro

In [30]:
b_dict = {}
for b in bought_occ.index:
    row = bought_occ.loc[b, bought_occ.loc[b] != 0]
    l = [(i, row[i]) for i in row.index]
    #print('Brand: ', b, ', alternatives: ', l)
    b_dict[b] = l

brand = 'Samsung'
print('Alternatives to: ', brand)
print(b_dict[brand], sum(list(map((lambda x: x[1]), b_dict[brand]))))

Alternatives to:  Samsung
[('AU Optronics', 2), ('Asus', 1), ('Chargerbuy', 1), ('Compaq', 1), ('Dell', 2), ('HP', 2), ('LG', 1), ('Lenovo', 1), ('MLC Distribution', 2), ('Sony', 1), ('Toshiba', 2)] 16
