In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

import numpy as np
import torch

import gzip
import os
import pickle

import importlib
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
pwd


'/Users/willtong/Documents/AI_models/wine_review_model/wine_libraries/wine_recommender/notebooks'

In [4]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
os.chdir('../../')

# These are my personally tailored stop words that include foreign articles such as 'la', 'el', 'les', 'il'
from functions.featurization import stop_words

# About 150 typical English stopwords
nltk_words = list(stopwords.words('english'))
# Remove negative words from stop word list.  In other words, I want to keep them in the text.
# I believe those words are important in conveying negative sentiments.
nltk_words = [word for word in nltk_words if 'not' not in word and word[-3:] != "n't" and word[-2:] != "n'"][:141]

# Add the nltk words to my original stop word list
stop_words.extend(nltk_words)


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


## These are the stop words I had used in training the price model.

In [5]:
print(stop_words)

['&', 'la', 'le', 'les', 'the', 'da', 'dal', 'dalla', 'della', 'del', 'de', 'al', 'alla', 'gli', 'i', 'dos', 'das', 'di', 'du', 'do', 'lo', 'of', 'and', 'with', 'to', 'the', 'a', 'an', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 

## Remove all negative words since negative words are useful (e.g. not dry is opposite of dry).

In [6]:
print([word for word in stop_words if 'not' not in word and word[-3:] != "n't" and word[-2:] != "n'"])

['&', 'la', 'le', 'les', 'the', 'da', 'dal', 'dalla', 'della', 'del', 'de', 'al', 'alla', 'gli', 'i', 'dos', 'das', 'di', 'du', 'do', 'lo', 'of', 'and', 'with', 'to', 'the', 'a', 'an', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 

In [7]:
# negative words can be useful information and should be kept
stop_words.remove('nor')
stop_words.remove('no')
stop_words.remove('don')
stop_words.remove('t')

In [8]:
print(sorted(stop_words))

['&', 'a', 'a', 'about', 'above', 'after', 'again', 'against', 'al', 'all', 'alla', 'am', 'an', 'an', 'and', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'd', 'da', 'dal', 'dalla', 'das', 'de', 'del', 'della', 'di', 'did', 'do', 'do', 'does', 'doing', 'dos', 'down', 'du', 'during', 'each', 'few', 'for', 'from', 'further', 'gli', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'i', 'if', 'in', 'into', 'is', 'it', "it's", 'its', 'itself', 'just', 'la', 'le', 'les', 'll', 'lo', 'm', 'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'she', "she's", 'should', "should've", 'so', 'some', 'such', 'than', 'that', "that'll", 'the', 'the', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those'

In [9]:
from sentence_transformers import SentenceTransformer 


  from tqdm.autonotebook import tqdm, trange


## Load data set

In [10]:
df = pd.read_json("data/deduped_mod_winemag-data-130k-v2.json")
df.shape

(111593, 15)

In [10]:
info_features = df.columns
info_features = ['title',  'designation', 'variety', 'region-1',
       'region-2', 'province', 'country', 'winery']
df.fillna(" ", inplace = True)
# This step is not strictly necessary.  It just reduces the total number of words by removing redundant words.
# This is done because the transformer has a word limit.
df["merged_info_text"] = df[info_features].apply(lambda info_features: ' '.join(list(set(info_features))), axis = 1)
df["merged_info_text"] = df["merged_info_text"].apply(lambda merged_info_text: " ".join(list(set(merged_info_text.split(" ")))))

#  Augment info_features to text to enrich it.
df["merged_info_text"] = df[['description', "merged_info_text"]].apply(lambda info_features: ' '.join(list(info_features)), axis = 1)

# Remove stop words
df["merged_info_text"] = df["merged_info_text"].apply(lambda text: " ".join([w for w in text.split() if not w in stop_words]))
df["len"] = df["merged_info_text"].apply(len)


df.drop(info_features, axis = 1, inplace = True)
# df["merged_info_features"] = df[info_features].apply(lambda info_features: info_features)

In [25]:
# The removes the ceiling for memory usage for application on my laptop.  
# It would run out of money if I don't remove it.
# Exercise extreme caution.   
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '-0.0'

In [26]:
# This is the sentence transformer that can take the longest text (512 words).   
# Our description can be as long as > 900 words (about 100 or so).  
# Wheir embeddings will not be perfect, still well over half their words are captured.
# It is the best we can do.
model_names = ['msmarco-distilbert-dot-v5']
trust_remote_code = False

for model_name in model_names:
    model = SentenceTransformer(model_name, trust_remote_code = trust_remote_code)
    df["desc_embed"] = df["merged_info_text"].apply(lambda text: model.encode(text, convert_to_tensor=True))
    desc_embeds = model.encode(df["merged_info_text"].values, convert_to_tensor=False)
    df_embeds = pd.DataFrame(columns = ['embed_'+ str(num).zfill(3) for num in range(desc_embeds.shape[1])],
                             data = desc_embeds,
                             index = df.index
                            )

In [28]:
with gzip.open("data/embedded_desc_newstopwds"+model_name.split("/")[-1]+".pckl", "wb") as f:
    pickle.dump(df_embeds, f)