#### Creating a matrix of sentiment-aspect pairs.
#### Scaling the frequencies down: using IDF to discount pairs that occur too often, then normalizing each product vector.

In [10]:
import pickle
# Load the two pickled inputs produced by earlier pipeline stages.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this project generated itself.
# final_pool is iterated below as {cluster key -> iterable of pair strings}.
with open("aspect_freq/pair_clusters_50.pickle", "rb") as f:
    final_pool = pickle.load(f)
# products is iterated below as {product key -> {pair -> count}}.
with open("aspect_freq/final_prods_aspects.pickle", "rb") as f:
    products = pickle.load(f)


In [11]:
# Invert the cluster pool: map every member pair back to the key of the
# cluster that contains it. If a pair appears in several clusters, the
# cluster visited last wins (same as successive dict assignment).
revmap = {
    member: cluster_key
    for cluster_key, members in final_pool.items()
    for member in members
}

In [12]:
# Collapse each product's per-pair counts onto cluster keys: counts of all
# pairs that map to the same cluster are summed, and pairs that are absent
# from revmap are dropped.
reduced_product = {}
for key, val_dict in products.items():
    # "\r" rewrites the same output line so progress does not flood the cell.
    print(f"\r[*] Reducing dimension of product {key}",end="")
    merged = reduced_product[key] = {}
    for pair, count in val_dict.items():
        # Single lookup; the bound value is reused instead of re-indexing revmap.
        if (cluster := revmap.get(pair)) is not None:
            merged[cluster] = merged.get(cluster, 0) + count
# Release the raw per-pair counts; only the reduced mapping is used from here on.
# Note that this makes the cell fail on a second run unless the load cell is re-run.
del products

[*] Reducing dimension of product 59193

In [None]:
import pandas as pd
# Build a product x cluster frame: with orient='index' every outer key of
# reduced_product becomes a row and every inner key becomes a column.
# Clusters a product never mentions show up as NaN.
df = pd.DataFrame.from_dict(reduced_product, orient='index')
df

In [None]:
# Order rows by their index label (the product key). Rebinding the name
# instead of sorting in place avoids silently mutating the frame that the
# previous cell already displayed.
df = df.sort_index()
df

Unnamed: 0,small head,dark beer,good beer,good carbonation,spotty lacing,light chocolate,dark malt,roasted malt,artificial sweetener,medium body,...,royersford brewpub,longshot pack,longshot beer,saranac line,coastal extreme,regular pannepot,adirondack lager,regular aventinus,regular guardian,youngs beer
0,6.0,3.0,6.0,3.0,2.0,2.0,2.0,5.0,2.0,2.0,...,,,,,,,,,,
1,31.0,,22.0,9.0,4.0,,,3.0,,24.0,...,,,,,,,,,,
2,14.0,1.0,4.0,,,,,,,2.0,...,,,,,,,,,,
3,1.0,,1.0,,,,,,,2.0,...,,,,,,,,,,
4,14.0,5.0,12.0,1.0,,2.0,5.0,10.0,,4.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59189,60.0,,48.0,16.0,6.0,,,4.0,,24.0,...,,,,,,,,,,
59190,4.0,,2.0,2.0,1.0,2.0,3.0,4.0,,,...,,,,,,,,,,
59191,75.0,,75.0,21.0,4.0,,,4.0,,33.0,...,,,,,,,,,,
59192,,,,1.0,,,,,,1.0,...,,,,,,,,,,


#### Dropping columns that are more than 99.5% empty (pairs present in fewer than 0.5% of products). This is done because of hardware constraints.
#### We are now left with 2571 pairs.

In [None]:

# A column survives only if it has a non-null count in at least 0.5% of the rows.
min_non_null = int(len(df) * 0.005)
ndf = df.dropna(axis="columns", thresh=min_non_null)
ndf

Unnamed: 0,small head,dark beer,good beer,good carbonation,spotty lacing,light chocolate,dark malt,roasted malt,artificial sweetener,medium body,...,oily pour,nice smokiness,simple stout,stale corn,good quaffer,pure malt,old system,fine hop,secondary hop,shy note
0,6.0,3.0,6.0,3.0,2.0,2.0,2.0,5.0,2.0,2.0,...,,,,,,,,,,
1,31.0,,22.0,9.0,4.0,,,3.0,,24.0,...,,,,,,,,,,
2,14.0,1.0,4.0,,,,,,,2.0,...,,,,,,,,,,
3,1.0,,1.0,,,,,,,2.0,...,,,,,,,,,,
4,14.0,5.0,12.0,1.0,,2.0,5.0,10.0,,4.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59189,60.0,,48.0,16.0,6.0,,,4.0,,24.0,...,,,,,,,,,,1.0
59190,4.0,,2.0,2.0,1.0,2.0,3.0,4.0,,,...,,,,,,,,,,
59191,75.0,,75.0,21.0,4.0,,,4.0,,33.0,...,,,,,,,,,,
59192,,,,1.0,,,,,,1.0,...,,,,,,,,,,


### TF-IDF

In [None]:

#! TF-IDF

import numpy as np

# Term frequency: divide every row by its own total so each product's
# non-null entries sum to 1. NaN cells stay NaN at this point.
row_totals = ndf.sum(axis=1)
ndf = ndf.div(row_totals, axis=0)

# Document frequency must be taken BEFORE the NaNs are filled: count()
# skips NaN, so it yields the number of rows in which each column is present.
docs_with_pair = ndf.count()
# Inverse document frequency per column, with +1 added to the denominator.
idf_col = np.log10(len(ndf) / (docs_with_pair + 1))

# Absent pairs get zero weight; then scale every column by its IDF.
ndf.fillna(value=0, inplace=True)
ndf = ndf.mul(idf_col)

### Normalization

In [None]:

#! Normalizing each document

# Euclidean (L2) norm of every row.
row_norms = np.linalg.norm(ndf.values, axis=1)
# A row whose entries are all zero has norm 0, and 0/0 would turn the whole
# row into NaN. Dividing by 1 instead leaves such rows as zero vectors.
row_norms[row_norms == 0] = 1.0
# Scale each row to unit length (rows with zero norm are left unchanged).
ndf = ndf.div(row_norms, axis=0)
ndf


Unnamed: 0,small head,dark beer,good beer,good carbonation,spotty lacing,light chocolate,dark malt,roasted malt,artificial sweetener,medium body,...,oily pour,nice smokiness,simple stout,stale corn,good quaffer,pure malt,old system,fine hop,secondary hop,shy note
0,0.016710,0.183863,0.086504,0.122237,0.121565,0.116051,0.122588,0.156657,0.369789,0.022888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.014085,0.000000,0.051745,0.059825,0.039664,0.000000,0.000000,0.015334,0.000000,0.044807,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.022396,0.035205,0.033126,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.013147,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.002851,0.000000,0.014759,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.023430,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.012149,0.095483,0.053907,0.012696,0.000000,0.036160,0.095492,0.097625,0.000000,0.014263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59189,0.018098,0.000000,0.074952,0.070608,0.039499,0.000000,0.000000,0.013574,0.000000,0.029747,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021536
59190,0.008203,0.000000,0.021231,0.060003,0.044755,0.085450,0.135395,0.092279,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
59191,0.014444,0.000000,0.074771,0.059168,0.016812,0.000000,0.000000,0.008666,0.000000,0.026114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
59192,0.000000,0.000000,0.000000,0.042074,0.000000,0.000000,0.000000,0.000000,0.000000,0.011817,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [9]:
# Write the current ndf frame to a parquet file using the fastparquet engine.
# NOTE(review): this cell's saved execution count is lower than that of the
# cells above — confirm it was re-run after the normalization cell.
ndf.to_parquet("product_map.parquet", engine="fastparquet")