In [None]:
# Mount Google Drive so files under /content/gdrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')
# One-time setup, kept for reference: download and install the spaCy vectors
# package that get_glove_embedding() loads when no cached embedding exists.
# !wget https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz -O en_vectors_web_lg-2.1.0.tar.gz
# !pip install en_vectors_web_lg-2.1.0.tar.gz
# Folder that is joined with the dataset CSV name and the embedding cache file names.
data_dir = '/content/gdrive/Shareddrives/520_Project'
# Alternative install from a copy of the archive stored in the shared drive.
# !pip install /content/gdrive/Shareddrives/520_Project/en_vectors_web_lg-2.1.0.tar.gz

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import os
import torch
import re
import time
import copy
import math
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.optim as optimizer
import torch.nn.functional as F
from torch import nn
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
# import en_vectors_web_lg

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# Plot styling.
# sns.set() re-applies seaborn's default theme (style and font settings), so it
# has to run FIRST; when it ran last it silently discarded the whitegrid style
# and the Times New Roman font configured by the lines below.
sns.set(font_scale=1.2)
plt.rcParams['font.family'] = "sans-serif"
plt.rcParams['font.sans-serif'] = ['Times New Roman']
sns.set_style("whitegrid")
sns.set_style({'font.family':'serif', 'font.serif':'Times New Roman'})

In [None]:
# Read the raw review table from the project folder and preview the first rows.
reviews_csv = os.path.join(data_dir, "Womens_Clothing_E-Commerce_Reviews.csv")
data = pd.read_csv(reviews_csv)
data.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [None]:
def clean_text(w):
    """Normalise a piece of text before it is split into tokens.

    The text is lower-cased, the punctuation characters ``. , ' ! ? " ( ) * # : ;``
    are removed, and ``-`` and ``/`` are each replaced by a single space.
    """
    lowered = w.lower()
    without_punct = re.sub(r"([.,'!?\"()*#:;])", '', lowered)
    dash_split = without_punct.replace('-', ' ')
    return dash_split.replace('/', ' ')

def get_glove_embedding(reviews, data_dir):
  """Build, or load from cache, the vocabulary and its pretrained vectors.

  Args:
    reviews: iterable of review strings used to build the vocabulary.
    data_dir: directory holding the cache files ``token_to_ix.pkl`` and
      ``train_glove.npy``.

  Returns:
    ``(token_to_ix, pretrained_emb)``. When both cache files exist they are
    returned as stored. Otherwise a fresh vocabulary is built in which index 0
    is ``'PAD'`` (all-zero vector), index 1 is ``'UNK'`` and row ``i`` of
    ``pretrained_emb`` is the vector of the token whose index is ``i``.

  Note:
    Previously the first real word was assigned index 1, the same index as
    ``'UNK'``, so the two shared a row. Cache files written by that version
    are still loaded unchanged; delete them to regenerate an aligned
    vocabulary. A freshly built vocabulary is a plain ``dict`` so that looking
    up a missing word raises instead of silently growing the mapping.
  """
  token_file = os.path.join(data_dir, 'token_to_ix.pkl')
  glove_file = os.path.join(data_dir, 'train_glove.npy')
  if os.path.exists(glove_file) and os.path.exists(token_file):
    print("Loading saved embedding")
    # NOTE(security): unpickling can execute arbitrary code; only load cache
    # files that this notebook wrote itself.
    with open(token_file, "rb") as cache_in:
      cached_tokens = pickle.load(cache_in)
    return cached_tokens, np.load(glove_file)

  # Imported lazily: the vectors package is only needed on a cache miss and
  # its top-of-notebook import is commented out.
  import en_vectors_web_lg

  spacy_tool = en_vectors_web_lg.load()
  unk_vector = spacy_tool('UNK').vector

  # Reserved tokens come first so indices and embedding rows stay aligned.
  token_to_ix = {'PAD': 0, 'UNK': 1}
  pretrained_emb = [np.zeros_like(unk_vector), unk_vector]

  for review in reviews:
    for word in clean_text(review).split():
      if word not in token_to_ix:
        token_to_ix[word] = len(token_to_ix)
        pretrained_emb.append(spacy_tool(word).vector)

  pretrained_emb = np.array(pretrained_emb)
  np.save(glove_file, pretrained_emb)
  with open(token_file, "wb") as cache_out:
    pickle.dump(token_to_ix, cache_out)
  return token_to_ix, pretrained_emb

def embed_text(x, max_len, token2ix):
  """Encode a review as a fixed-length vector of token indices.

  Args:
    x: raw review text; it is normalised with ``clean_text`` and split on
      whitespace.
    max_len: length of the returned vector. Longer texts are truncated and
      shorter ones are right-padded with 0.
    token2ix: mapping from token to integer index.

  Returns:
    ``np.int64`` array of shape ``(max_len,)``. Tokens missing from
    ``token2ix`` are encoded as 1.
  """
  unk_ix = 1
  ques_ix = np.zeros(max_len, np.int64)
  # Slicing up front also covers max_len == 0, which used to raise IndexError
  # on the first write for any non-empty text.
  words = clean_text(x).split()[:max_len]
  for ix, word in enumerate(words):
    # .get() never inserts keys, which matters when token2ix is a defaultdict.
    ques_ix[ix] = token2ix.get(word, unk_ix)
  return ques_ix
def tokenize(reviews):
  """Build a token -> index vocabulary from an iterable of review strings.

  The mapping starts with the reserved entries ``'PAD'`` (0), ``'UNK'`` (1)
  and ``'SS'`` (2); every new word produced by ``clean_text(...).split()`` is
  given the next free index in order of first appearance.
  """
  token2ix = {'PAD': 0, 'UNK': 1, 'SS' : 2,}
  for review in reviews:
    for word in clean_text(review).split():
      # setdefault leaves already-known words untouched.
      token2ix.setdefault(word, len(token2ix))
  return token2ix
def category_from_output(output):
  """Return, as a Python int, the index of the largest value in ``output[0]``."""
  _, best_index = output[0].topk(1)
  return best_index[0].item()

In [None]:
class cloth_dataset(Dataset):
  """Dataset that pairs each encoded review with its label and metadata row."""

  def __init__(self, encodings, labels, metadata):
    # All three inputs are converted to NumPy arrays, in this order.
    self.embedded, self.label, self.meta = (
        np.array(source) for source in (encodings, labels, metadata)
    )

  def __getitem__(self, index):
    """Return the ``(encoding, label, metadata)`` triple stored at ``index``."""
    return (self.embedded[index], self.label[index], self.meta[index])

  def __len__(self):
    """Number of samples, taken from the label array."""
    return len(self.label)

In [None]:
# Device and training hyper-parameters.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
EMBEDDING_DIM = 300
BATCH_SIZE = 8
LEARNING_RATE = 0.005
EPOCH = 11

# Re-read the raw table so this cell always starts from unmodified data and
# can be re-run without stacking duplicate feature columns.
data = pd.read_csv(os.path.join(data_dir, "Womens_Clothing_E-Commerce_Reviews.csv"))
data['Review Text'] = data['Review Text'].fillna(' ')
token2ix, pretrained_emb = get_glove_embedding(data['Review Text'],data_dir)
print(pretrained_emb.shape) # (len(vocab), embedding_dim)

# Sequence length = 90th percentile of whitespace-token counts of the raw text.
lengths = [len(x.split()) for x in data['Review Text']]
max_len = int(np.percentile(lengths,90))
data['embedded'] = data['Review Text'].apply(lambda x : embed_text(x,max_len,token2ix))

meta_cols = ['Division Name', 'Department Name', 'Class Name']
def col2ix(x,col_cnt):
  """Map a category value to its id; values not in ``col_cnt`` get ``len(col_cnt)``."""
  if x in col_cnt:
    return col_cnt[x]
  return len(col_cnt)
dummy_names = []
for c in meta_cols:
  # Sorted non-null categories give the same ids in every kernel session.
  # Iterating a set() does not (string hashing is randomised per process), so
  # the meaning of each dummy column used to change between runs. Missing
  # values fall through to the extra id len(col_cnt) inside col2ix.
  col_cnt = {value: idx for idx, value in enumerate(sorted(data[c].dropna().unique()))}
  data[c+'_'] = data[c].apply(lambda x: col2ix(x, col_cnt))
  # Explicit integer dtype: newer pandas defaults to bool, which combined with
  # the integer 'Age' column would yield an object-dtype metadata matrix.
  dummies = pd.get_dummies(data[c+'_'], prefix=c.split()[0], dtype=np.uint8)
  names = list(dummies.columns)
  data = pd.concat((data,dummies),axis = 1)
  dummy_names += names
all_cols = dummy_names+['Age']

cuda:0
Loading saved embedding
(16335, 300)


In [None]:

# 80/10/10 train/dev/test split; the 'embedded' column travels with the
# metadata columns so the rows stay aligned through splitting and resampling.
X_train_meta, X_dev_meta, y_train, y_dev = train_test_split(
    data[all_cols+['embedded']], data['Recommended IND'],
    test_size=0.2, random_state=42)
X_dev_meta, X_test_meta, y_dev, y_test = train_test_split(
    X_dev_meta, y_dev, test_size=0.5, random_state=42)

# Balance the classes of the training split only; dev and test keep their
# original distribution.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_meta, y_train = ros.fit_resample(X_train_meta, y_train)

# Separate the token-index sequences from the metadata features.
X_train, X_dev, X_test = np.array(X_train_meta['embedded']), X_dev_meta['embedded'], X_test_meta['embedded']
X_train_meta, X_dev_meta, X_test_meta = X_train_meta.drop(columns=['embedded']), X_dev_meta.drop(columns=['embedded']), X_test_meta.drop(columns=['embedded'])

train_dataset = cloth_dataset(X_train, y_train, X_train_meta)
train_data_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation loaders are not shuffled: batch order does not affect the metrics
# and a fixed order keeps evaluation output reproducible.
dev_dataset = cloth_dataset(X_dev, y_dev, X_dev_meta)
dev_data_iter = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataset = cloth_dataset(X_test, y_test, X_test_meta)
test_data_iter = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Sanity check: count the training labels per class after oversampling.
from collections import Counter
Counter(y_train)

Counter({0: 15467, 1: 15467})

In [None]:
# 824-80x
class CNN_model(nn.Module):
    """Two-layer 1-D CNN over word embeddings, combined with metadata features.

    Token indices are embedded, passed through two ``Conv1d`` layers
    (kernel size 2, 16 channels, ReLU; dropout after the first), reduced to
    one vector per review by max-over-time pooling, concatenated with the
    metadata vector and mapped to a single logit.

    Args:
        token_size: number of rows of the embedding table (vocabulary size).
        pretrained_emb: array of shape ``(token_size, embedding_dim)`` used to
            initialise the embedding table; its second dimension sets the
            embedding width.
        meta_dim: number of metadata features concatenated before the final
            linear layer. Defaults to 33, the value previously hard-coded.

    Note:
        Inputs must contain at least 3 tokens, because two kernel-size-2
        convolutions shorten the sequence by 2.
    """

    def __init__(self, token_size, pretrained_emb, meta_dim=33):
        super(CNN_model, self).__init__()
        dropout_rate = 0.8
        emb_weights = torch.as_tensor(pretrained_emb)
        embedding_dim = emb_weights.shape[1]
        self.embedding = nn.Embedding(
            num_embeddings=token_size,
            embedding_dim=embedding_dim
        )
        self.embedding.weight.data.copy_(emb_weights)

        hidden_size = 16
        k = 2
        print('hidden_size', hidden_size)
        self.conv1d = torch.nn.Conv1d(in_channels=embedding_dim, out_channels=hidden_size, kernel_size=k)
        self.conv1d1 = torch.nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size, kernel_size=k)
        # Module order inside the Sequential is unchanged so the parameter
        # names in saved state dicts still match.
        self.conv_unit = nn.Sequential(self.conv1d, nn.ReLU(), nn.Dropout(dropout_rate),
                                       self.conv1d1, nn.ReLU(),
                                       )

        # Not applied in forward(); kept so the attribute remains available.
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size + meta_dim, 1)

    def forward(self, x, metadata):
        """Return logits of shape ``[bsz, 1]``.

        Args:
            x: integer token indices, ``[bsz, len]``.
            metadata: metadata features, ``[bsz, meta_dim]``; any numeric dtype.
        """
        x = self.embedding(x)          # [bsz, len, embedding_dim]
        x = torch.transpose(x, 1, 2)   # [bsz, embedding_dim, len]
        x = self.conv_unit(x)          # [bsz, hidden_size, len - 2]
        # Max-over-time pooling. Reading only the last position (as before)
        # exposes just the final 3 tokens, which are padding for every review
        # shorter than the padded length.
        embedding_v = torch.max(x, dim=2)[0]   # [bsz, hidden_size]

        # Align dtypes so integer or float64 metadata cannot change the dtype
        # fed to the linear layer.
        metadata = metadata.to(embedding_v.dtype)
        embedding_v = torch.cat((embedding_v, metadata), dim=1)

        logit = self.fc(embedding_v)   # [bsz, 1]
        return logit
    
# Smoke test: build the model and loss, then push a single training batch
# through the forward pass and the loss computation (the loop stops after the
# first batch; no backward pass or optimizer step is performed).
net = CNN_model(len(token2ix), pretrained_emb).to(device)
criteon = nn.BCEWithLogitsLoss().to(device)
for batch_idx, (text, label, metadata) in enumerate(train_data_iter):
      text, label, metadata = text.to(device), label.to(device), metadata.to(device)
      output = net(text,metadata)
      # Drop the trailing dimension so the logits line up with the label vector.
      output = output.squeeze(-1)
      # print(output.shape)
      # print(label.shape)
      # BCEWithLogitsLoss is given float targets.
      loss = criteon(output,label.float())
      break

hidden_size 16


In [None]:
def train(epoch, train_data_iter, dev_data_iter, opt, criteon, net, device,
          checkpoint_path='cloth_CNN.pth'):
  """Train ``net`` and evaluate it on the dev set every second epoch.

  Args:
    epoch: number of passes over ``train_data_iter``.
    train_data_iter: sized iterable of ``(text, label, metadata)`` tensor batches.
    dev_data_iter: sized iterable of dev batches with the same layout.
    opt: optimizer that updates the parameters of ``net``.
    criteon: loss taking ``(logits, float_targets)``.
    net: model called as ``net(text, metadata)``.
    device: device the batches are moved to.
    checkpoint_path: file that receives the ``state_dict`` each time dev
      accuracy improves; pass ``None`` to skip writing.

  Returns:
    ``(train_losses, dev_losses, dev_acc_list, best_model, net)``.
    ``train_losses`` holds the mean batch loss since the previous log point,
    ``dev_losses`` and ``dev_acc_list`` hold one entry per evaluation, and
    ``best_model`` is a deep copy of ``net`` at its best dev accuracy
    (``None`` if no evaluation produced any batch).
  """
  def timeSince(since):
      now = time.time()
      s = now - since
      m = math.floor(s / 60)
      s -= m * 60
      return '%dm %ds' % (m, s)

  train_losses, dev_losses, dev_acc_list = [], [], []
  best_model, best_val_acc = None, float('-inf')
  cnt_step = 0
  current_loss = 0.0
  plot_every = 2
  dev_every = 2
  print('train len:',len(train_data_iter),'dev len:',len(dev_data_iter))
  # Report what is actually being run, read from the arguments, instead of
  # notebook globals and a hard-coded optimizer name.
  print('learning_rate', opt.param_groups[0]['lr'], 'n_iters', epoch,
        'optim', type(opt).__name__,
        'batch size ', getattr(train_data_iter, 'batch_size', None),
        'lr_scheduler', None, 'device', device)
  start = time.time()
  for e in range(epoch):
    print('Epoch', e)
    net.train()
    for text, label, metadata in train_data_iter:
      text, label, metadata = text.to(device), label.to(device), metadata.to(device)
      opt.zero_grad()
      output = net(text, metadata).squeeze(-1)
      loss = criteon(output, label.float())
      loss.backward()
      opt.step()
      # Accumulate a Python float so no autograd history is kept alive.
      current_loss += loss.item()
      cnt_step += 1
    if e == 0:
      print(time.time()-start)
    if e % plot_every == 0 and cnt_step > 0:
      tmp_loss = current_loss / cnt_step
      train_losses.append(tmp_loss)
      current_loss, cnt_step = 0.0, 0
      # Progress is relative to the `epoch` argument of this call.
      print('%d %d%% (%s) loss: %.4f ' % (e, e / epoch * 100, timeSince(start), tmp_loss))
    if e % dev_every == 0:
      net.eval()
      eval_loss, cnt_eval_step = 0.0, 0
      y_pred, y_true = [], []
      sample_pred, sample_true = [], []
      with torch.no_grad():  # no gradients are needed for evaluation
        for text, label, metadata in dev_data_iter:
          text, label, metadata = text.to(device), label.to(device), metadata.to(device)
          output = net(text, metadata).squeeze(-1)
          eval_loss += criteon(output, label.float()).item()
          result = torch.gt(torch.sigmoid(output), 0.5).int()
          cnt_eval_step += 1
          sample_pred, sample_true = result.tolist(), label.tolist()
          y_pred += sample_pred
          y_true += sample_true
      if cnt_eval_step == 0:
        # Nothing to score; avoid dividing by zero.
        continue
      mean_eval_loss = eval_loss / cnt_eval_step
      dev_losses.append(mean_eval_loss)
      acc = accuracy_score(y_true, y_pred)
      dev_acc_list.append(acc)
      if acc > best_val_acc:
        best_val_acc = acc
        best_model = copy.deepcopy(net)
        if checkpoint_path is not None:
          torch.save(net.state_dict(), checkpoint_path)
      # The two lists show predictions and labels of the last dev batch.
      print('%d %d%% (%s) loss:%.4f %s %s acc:%.4f' % (e, e / epoch * 100, timeSince(start), mean_eval_loss, sample_pred[:4], sample_true[:4], acc))
  print('best acc', best_val_acc)
  return train_losses, dev_losses, dev_acc_list, best_model, net

In [None]:
# Override the learning rate set in the configuration cell (0.005) before the
# optimizer for the training run is created.
LEARNING_RATE = 0.001

In [None]:
# Build a fresh model, optimizer and loss, then run training.
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = CNN_model(len(token2ix), pretrained_emb).to(device)
# Plain SGD is the optimizer in use; the Adam line is a disabled alternative.
# opt = optimizer.Adam(net.parameters(), lr=LEARNING_RATE)
opt = optimizer.SGD(net.parameters(), lr=LEARNING_RATE)
# The model emits one logit per sample, so the binary loss on logits is used.
# criteon = nn.CrossEntropyLoss().to(device)
criteon = nn.BCEWithLogitsLoss().to(device)
# NOTE(review): the epoch count is passed as the literal 10 although the
# configuration cell defines EPOCH = 11 — confirm which value is intended.
train_losses, dev_losses, dev_acc_list, best_model, net = train(10,train_data_iter, dev_data_iter ,opt,criteon,net,device)

In [None]:
# Score the best model returned by train() on the dev set.
best_model.eval()  # make sure dropout is disabled while scoring
y_pred, y_true = [], []
with torch.no_grad():  # inference only, no autograd graph needed
  for batch_idx, (text, label, metadata) in enumerate(dev_data_iter):
    text, label, metadata = text.to(device), label.to(device), metadata.to(device)
    output = best_model(text,metadata)
    output = output.squeeze(-1)
    # A sigmoid probability above 0.5 counts as the positive class.
    result = torch.gt(torch.sigmoid(output),0.5).int()
    y_pred += result.tolist()
    y_true += label.tolist()
# scikit-learn metrics take (y_true, y_pred).
acc = accuracy_score(y_true, y_pred)
acc

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
def evaluate_p_r_f1_acc(y_pred, y_true):
  """Compute precision, recall, F1 and accuracy for binary predictions.

  Args:
    y_pred: predicted labels.
    y_true: ground-truth labels.

  Returns:
    ``(precision, recall, fscore, acc)``.

  Note:
    The scikit-learn metric functions take ``(y_true, y_pred)``. Forwarding
    the arguments in this function's own ``(y_pred, y_true)`` order, as was
    done before, swaps precision and recall.
  """
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  fscore = f1_score(y_true, y_pred)
  acc = accuracy_score(y_true, y_pred)
  return precision, recall, fscore, acc
def evaluate_cnn(model, data_iter):
  """Print precision, recall, F-score and accuracy of ``model`` on ``data_iter``.

  Args:
    model: network called as ``model(text, metadata)`` that returns one logit
      per sample; it is switched to eval mode.
    data_iter: iterable of ``(text, label, metadata)`` tensor batches.

  The model passed in is the one evaluated. The previous version ignored this
  argument and always scored the notebook-level ``net``.
  """
  model.eval()
  y_pred, y_true = [], []
  with torch.no_grad():  # inference only
    for text, label, metadata in data_iter:
      text, metadata = text.to(device), metadata.to(device)
      output = model(text, metadata).squeeze(-1)
      # A sigmoid probability above 0.5 counts as the positive class.
      result = torch.gt(torch.sigmoid(output), 0.5).int()
      y_pred += result.tolist()
      y_true += label.tolist()
  p,r,fscore, acc = evaluate_p_r_f1_acc(y_pred, y_true)
  print('Precision: ',p, '\tRecall: ',r,'\tF-score: ',fscore,'\tacc: ', acc)