# Deep-MINE Framework

## 1. Importing libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np

import json

## 2. Loading data

### 2.1 Reviews

In [2]:
# load json
reviews = []
with open('/content/drive/MyDrive/Mestrado/Prime_Pantry.json') as f:
    for line in f:
        reviews.append(json.loads(line))

# create dataframe
df_reviews = pd.DataFrame(reviews)
print(df_reviews.shape)
df_reviews.head()

(471614, 12)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style
0,5.0,True,"12 14, 2014",A1NKJW0TNRVS7O,B0000DIWNZ,Tamara M.,Good clinging,Clings well,1418515200,,,
1,4.0,True,"11 20, 2014",A2L6X37E8TFTCC,B0000DIWNZ,Amazon Customer,Fantastic buy and a good plastic wrap. Even t...,Saran could use more Plus to Cling better.,1416441600,,,
2,4.0,True,"10 11, 2014",A2WPR4W6V48121,B0000DIWNZ,noname,ok,Four Stars,1412985600,,,
3,3.0,False,"09 1, 2014",A27EE7X7L29UMU,B0000DIWNZ,ZapNZs,Saran Cling Plus is kind of like most of the C...,"The wrap is fantastic, but the dispensing, cut...",1409529600,4.0,,
4,4.0,True,"08 10, 2014",A1OWT4YZGB5GV9,B0000DIWNZ,Amy Rogers,This is my go to plastic wrap so there isn't m...,has been doing it's job for years,1407628800,,,


In [3]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471614 entries, 0 to 471613
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         471614 non-null  float64
 1   verified        471614 non-null  bool   
 2   reviewTime      471614 non-null  object 
 3   reviewerID      471614 non-null  object 
 4   asin            471614 non-null  object 
 5   reviewerName    471578 non-null  object 
 6   reviewText      471233 non-null  object 
 7   summary         471473 non-null  object 
 8   unixReviewTime  471614 non-null  int64  
 9   vote            41716 non-null   object 
 10  image           3568 non-null    object 
 11  style           6933 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 40.0+ MB


In [6]:
# data cleaning
df_reviews = df_reviews.dropna(subset=['reviewerID', 'asin', 'reviewText'])
df_reviews = df_reviews[['reviewerID', 'asin', 'reviewText']]
print(df_reviews.shape)
df_reviews.head()

(471233, 3)


Unnamed: 0,reviewerID,asin,reviewText
0,A1NKJW0TNRVS7O,B0000DIWNZ,Good clinging
1,A2L6X37E8TFTCC,B0000DIWNZ,Fantastic buy and a good plastic wrap. Even t...
2,A2WPR4W6V48121,B0000DIWNZ,ok
3,A27EE7X7L29UMU,B0000DIWNZ,Saran Cling Plus is kind of like most of the C...
4,A1OWT4YZGB5GV9,B0000DIWNZ,This is my go to plastic wrap so there isn't m...


### 2.2 Products

In [7]:
# load json
products = []
with open('/content/drive/MyDrive/Mestrado/meta_Prime_Pantry.json') as f:
    for line in f:
        products.append(json.loads(line))

# create dataframe
df_products = pd.DataFrame(products)
print(df_products.shape)
df_products.head(2)

(10813, 19)


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Sink your sweet tooth into MILK DUDS Candya d...,,"HERSHEY'S Milk Duds Candy, 5 Ounce(Halloween C...","[B019KE37WO, B007NQSWEU]",,Milk Duds,[],[],[],"{'ASIN: ': 'B00005BPJO', 'Item model number:':...","<img src=""https://m.media-amazon.com/images/G/...",,,$5.00,B00005BPJO,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
1,[],,[Sink your sweet tooth into MILK DUDS Candya d...,,"HERSHEY'S Milk Duds Candy, 5 Ounce(Halloween C...","[B019KE37WO, B007NQSWEU]",,Milk Duds,[],[],[],"{'ASIN: ': 'B00005BPJO', 'Item model number:':...","<img src=""https://m.media-amazon.com/images/G/...",,,$5.00,B00005BPJO,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [8]:
# replace empty lists with null values
df_products = df_products.map(lambda x: np.nan if len(x) == 0 else x)
df_products.head(2)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,,,[Sink your sweet tooth into MILK DUDS Candya d...,,"HERSHEY'S Milk Duds Candy, 5 Ounce(Halloween C...","[B019KE37WO, B007NQSWEU]",,Milk Duds,,,,"{'ASIN: ': 'B00005BPJO', 'Item model number:':...","<img src=""https://m.media-amazon.com/images/G/...",,,$5.00,B00005BPJO,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
1,,,[Sink your sweet tooth into MILK DUDS Candya d...,,"HERSHEY'S Milk Duds Candy, 5 Ounce(Halloween C...","[B019KE37WO, B007NQSWEU]",,Milk Duds,,,,"{'ASIN: ': 'B00005BPJO', 'Item model number:':...","<img src=""https://m.media-amazon.com/images/G/...",,,$5.00,B00005BPJO,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [9]:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10813 entries, 0 to 10812
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category         0 non-null      float64
 1   tech1            0 non-null      float64
 2   description      10715 non-null  object 
 3   fit              0 non-null      float64
 4   title            10813 non-null  object 
 5   also_buy         4059 non-null   object 
 6   tech2            0 non-null      float64
 7   brand            10810 non-null  object 
 8   feature          1036 non-null   object 
 9   rank             4876 non-null   object 
 10  also_view        5978 non-null   object 
 11  details          10789 non-null  object 
 12  main_cat         10813 non-null  object 
 13  similar_item     0 non-null      float64
 14  date             0 non-null      float64
 15  price            6750 non-null   object 
 16  asin             10813 non-null  object 
 17  imageURL    

In [10]:
# data cleaning
df_products = df_products.dropna(subset=['description', 'asin', 'imageURL', 'imageURLHighRes'])
df_products = df_products[['description', 'asin', 'imageURL', 'imageURLHighRes']]
print(df_products.shape)
df_products.head(2)

(8942, 4)


Unnamed: 0,description,asin,imageURL,imageURLHighRes
0,[Sink your sweet tooth into MILK DUDS Candya d...,B00005BPJO,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
1,[Sink your sweet tooth into MILK DUDS Candya d...,B00005BPJO,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [11]:
# get number of unique products
df_products['asin'].nunique()

8941

In [14]:
# remove the duplicate one
df_products = df_products.drop_duplicates(subset=['asin'])
df_products.shape

(8941, 4)

In [15]:
df_reviews.merge(df_products, on='asin')['reviewerID'].value_counts()

Unnamed: 0_level_0,count
reviewerID,Unnamed: 1_level_1
AMMNGUJK4HQJ5,195
A35Q0RBM3YNQNF,175
AKPG8VQBS0MWR,143
A13J2PGKNMJG1K,143
AXK37UZY8UPYP,139
...,...
A3CZPYMZUVA4RD,1
A3QM69AUBKPFVN,1
A39BG3UYXESFOF,1
A2M0Q5315N6R7V,1


Matrix -> reviewerID (236,741) x asin (8,941)

## 3. Models

In [None]:
# https://discuss.pytorch.org/t/how-to-share-weights-between-two-layers/55541/2
# https://www.kaggle.com/code/ignazio/autoencoder-for-text-in-pytorch


def init_weights():
    pass


class ImageAutoEncoder(nn.Module):

    def __init__(self, tie_weights=True):
        super().__init__()

        # encoder
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.fc3 = nn.Linear(25600, 100, bias=True)

        # decoder
        self.fc4 = nn.Linear(100, 25600, bias=True)
        self.conv5 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1)

        # utils
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        self.relu = nn.ReLU()

        # share encoder decoder weight matrices
        if tie_weights:
            self._tie_weights()

    def _tie_weights(self):
        self.fc4.weight.data = self.fc3.weight.data.transpose(0,1)
        self.conv5.weight.data = self.conv2.weight.data.transpose(0,1)
        self.conv6.weight.data = self.conv1.weight.data.transpose(0,1)

    def forward(self, x):
        # encoder
        h = self.relu(self.conv1(x))
        h = self.relu(self.conv2(h))
        h = self.pool(h)
        h = self.fc3(h.reshape(-1, 25600))
        print(h.shape)

        # decoder
        h = self.fc4(h).T
        h = h.reshape(-1, 64, 20, 20)
        h = self.upsample(h)
        h = self.conv5(h)
        x_hat = self.conv6(h)
        return x_hat


class TextAutoEncoder(nn.Module):

    def __init__(self, tie_weights=True):
        super().__init__()

        # encoder
        self.fc1 = nn.Linear(32, 400, bias=True)
        self.fc2 = nn.Linear(400, 100, bias=True)

        # decoder
        self.fc3 = nn.Linear(100, 400, bias=True)
        self.fc4 = nn.Linear(400, 32, bias=True)

        # utils
        self.relu = nn.ReLU()

        # share encoder decoder weight matrices
        if tie_weights:
            self._tie_weights()

    def _tie_weights(self):
        self.fc3.weight.data = self.fc2.weight.data.transpose(0,1)
        self.fc4.weight.data = self.fc1.weight.data.transpose(0,1)

    def forward(self, x):
        # encoder
        h = self.relu(self.fc1(x))
        h = self.fc2(h)
        print(h.shape)

        # decoder
        h = self.relu(self.fc3(h))
        x_hat = self.fc4(h)
        return x_hat

model = TextAutoEncoder()
print(sum(p.numel() for p in model.parameters()))
print(model)

x = torch.randn(1, 32)
print(x.shape)
model(x).shape

106532
TextAutoEncoder(
  (fc1): Linear(in_features=32, out_features=400, bias=True)
  (fc2): Linear(in_features=400, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=400, bias=True)
  (fc4): Linear(in_features=400, out_features=32, bias=True)
  (relu): ReLU()
)
torch.Size([1, 32])
torch.Size([1, 100])


torch.Size([1, 32])

In [None]:
model = ImageAutoEncoder()
print(sum(p.numel() for p in model.parameters()))
print(model)

x = torch.randn(1, 3, 40, 40)
print(x.shape)
model(x).shape

5223079
ImageAutoEncoder(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc3): Linear(in_features=25600, out_features=100, bias=True)
  (fc4): Linear(in_features=100, out_features=25600, bias=True)
  (conv5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (upsample): Upsample(scale_factor=2.0, mode='bilinear')
  (relu): ReLU()
)
torch.Size([1, 3, 40, 40])
torch.Size([1, 100])


torch.Size([1, 3, 40, 40])

In [None]:
# hyparameters
lambda_m = 1/40
lambda_d = 1/32
lambda_r = 1/32

loss = lambda_m/2*mse(x_hat-x) + lambda_w/2

Perhaps check these references:

- https://github.com/sh0416/bpr/blob/master/train.py
- https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_collaborative_filtering/cornac_bpr_deep_dive.ipynb