In [0]:
%tensorflow_version 1.x
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt


In [0]:
tf.test.is_gpu_available()

True

In [0]:
!pip install contractions



In [0]:
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re

In [0]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text


def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


def expand_contractions(text):
    return contractions.fix(text)



def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text


def pre_process_document(document):
    
    # strip HTML
    document = strip_html_tags(document)
    
    # lower case
    document = document.lower()
    
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    
    # remove accented characters
    document = remove_accented_chars(document)
    
    # expand contractions    
    document = expand_contractions(document)
               
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  
        
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document


pre_process_corpus = np.vectorize(pre_process_document)

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
local_data_path = '/content/drive/My Drive/ColabNotebooks/winereview/' # change as needed
reviews = pd.read_csv(local_data_path+"winemag-data-130k-v2.csv")
def point_assign(x):
  if x <= 85:
    return 0
  elif x <= 90:
    return 1
  elif x <=95:
    return 2
  else:
    return 3
reviews["points_cat"] = reviews["points"].apply(point_assign)

In [0]:
# drop NA
reviews.dropna(subset=['description','price'], inplace=True)
reviews.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,points_cat
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,1
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,1
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,1
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,1
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,1


In [0]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(np.array(reviews['price'].values).reshape(-1,1))

MinMaxScaler(copy=True, feature_range=(0, 1))

In [0]:
wine_price = scaler.transform(np.array(reviews['price'].values).reshape(-1,1))

In [0]:
wine_price.shape

(120975, 1)

In [0]:
pre_reviews = reviews['description'].values

In [0]:
post_reviews = pre_process_corpus(pre_reviews)

In [0]:
post_reviews[14343]

'a sourness of bitter black tea overtakes a shyness of fruit in this lightbodied wine that holds on to its tannin with tenacity'

In [0]:
X = list(zip(post_reviews, wine_price.tolist()))
y = reviews['points_cat'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [0]:
print(X[9779])
print(y[9779])

('a zippy spicy nose offering minerality earthiness and red berries starts this pert zinfandel the wine offers balanced tannins fruit and a fresh acidity friendly and fun', [0.0018203883495145627])
1


In [0]:
X_train_text = np.array([i for i,j in X_train])
X_test_text = np.array([i for i,j in X_test])
X_train_price = np.array([j for i,j in X_train])
X_test_price = np.array([j for i,j in X_test])

In [0]:
# Training input on the whole training set with no limit on training epochs.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': X_train_text,'price': X_train_price}, y = y_train,
    batch_size=32, num_epochs=None, shuffle=True)

In [0]:
# Prediction on the whole training set.
predict_train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': X_train_text,'price': X_train_price}, y = y_train, shuffle=False)

In [0]:
# Prediction on the test set.
predict_test_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': X_test_text,'price': X_test_price}, y = y_test, shuffle=False)

In [0]:
embedding_feature = hub.text_embedding_column(
    key='sentence', 
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2", #https://tfhub.dev/google/nnlm-en-dim128/2
    trainable=True) # can be changed to True

In [0]:
print(embedding_feature)

_TextEmbeddingColumn(key='sentence', module_spec_path='https://tfhub.dev/google/universal-sentence-encoder/2', trainable=True)


In [0]:
combined_feature = [embedding_feature,tf.feature_column.numeric_column("price")]

In [0]:
dnn = tf.estimator.DNNClassifier(
          hidden_units=[256, 64],
          feature_columns= combined_feature,
          n_classes=4,
          activation_fn=tf.nn.relu,
          dropout=0.3,
          optimizer=tf.train.AdamOptimizer(learning_rate=0.001))

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpbtdb6pvi', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3719b99e80>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpbtdb6pvi', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3719b99e80>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [0]:
tf.logging.set_verbosity(tf.logging.ERROR)
import time

TOTAL_STEPS = 10000
STEP_SIZE = 1000
for step in range(0, TOTAL_STEPS+1, STEP_SIZE):
    print()
    print('-'*100)
    print('Training for step =', step)
    start_time = time.time()
    dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
    elapsed_time = time.time() - start_time
    print('Train Time (s):', elapsed_time)
    print('Eval Metrics (Train):', dnn.evaluate(input_fn=predict_train_input_fn))
    print('Eval Metrics (Testing):', dnn.evaluate(input_fn=predict_test_input_fn))


----------------------------------------------------------------------------------------------------
Training for step = 0
Train Time (s): 233.87584853172302
Eval Metrics (Train): {'accuracy': 0.81083536, 'average_loss': 0.49632812, 'loss': 63.47019, 'global_step': 1000}
Eval Metrics (Testing): {'accuracy': 0.7180763, 'average_loss': 0.65809447, 'loss': 84.15817, 'global_step': 1000}

----------------------------------------------------------------------------------------------------
Training for step = 1000
Train Time (s): 229.28874516487122
Eval Metrics (Train): {'accuracy': 0.9094847, 'average_loss': 0.25892824, 'loss': 33.11161, 'global_step': 2000}
Eval Metrics (Testing): {'accuracy': 0.7456024, 'average_loss': 0.6304423, 'loss': 80.62198, 'global_step': 2000}

----------------------------------------------------------------------------------------------------
Training for step = 2000
Train Time (s): 231.24197936058044
Eval Metrics (Train): {'accuracy': 0.9405823, 'average_loss':

In [0]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

In [0]:
confusion_matrix(y_test, y_test_predict)

In [0]:
precision_recall_fscore_support(y_test, y_test_predict, average='weighted')