In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import gensim 

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models import Doc2Vec
from sklearn.cluster import KMeans
from gensim import corpora, models, similarities

%matplotlib inline

In [2]:
os.chdir('/home/fykos/Documents/workspace/Kaggle/Wine Reviews/')

In [3]:
beer = pd.read_json('data/raw/beers.json')
wine = pd.read_csv('data/raw/winemag-data-130k-v2.csv')

In [4]:
beer = beer.drop(['designation', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'variety', 'winery'], axis = 1)
beer['type'] = 'beer'
wine = wine.drop(['Unnamed: 0', 'designation', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'variety', 'winery'], axis = 1)
wine['type'] = 'wine'

In [5]:
def normalize(review):
    review_letters = re.sub('[^a-zA-Z]', ' ', str(review))
    review_letters = review_letters.lower()
    return (" ".join(review_letters.split()))

In [9]:
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    ls = [word for word in review.split() if word not in stop_words]
    txt = " ".join(ls)
    return (txt)

In [10]:
processed_reviews = []
documents = []
reviews = wine['description']
for review in reviews:
    processed_reviews.append(word_tokenize(remove_stopwords(normalize(review))))

In [12]:
dictionary = corpora.Dictionary(processed_reviews)
dictionary.save('wines.dict')

In [17]:
corpus = [dictionary.doc2bow(review) for review in processed_reviews]
corpora.mmcorpus.serialize('wines.mm', corpus)

AttributeError: module 'gensim.corpora.mmcorpus' has no attribute 'serialize'

In [20]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [21]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

In [14]:
new_doc = 'A hazy, light brown color in the glass. Cloudy, strong head with good retention. Hints of apple, wood-grilled peaches, oatmeal, ginger and other sweet spices. Medium bodied and crisp in the mouth, with green apple backed by banana, pink grapefruit, nectarine pit, yeasty dough and accents of coriander and woody spice. Well balanced, integrated and delicious.'
new_doc = dictionary.doc2bow(word_tokenize(remove_stopwords(normalize(new_doc))))

In [25]:
vec_lsi = lsi[new_doc]

In [26]:
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it

In [27]:
sims = index[vec_lsi] # perform a similarity query against the corpus

In [28]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims) # print sorted (document number, similarity score) 2-tuples

[(117574, 0.59978622), (24082, 0.56237805), (87392, 0.55659729), (1190, 0.55073303), (55400, 0.55073303), (51113, 0.5327574), (41870, 0.52946931), (63997, 0.52946931), (79611, 0.52775544), (4216, 0.52647758), (75165, 0.52639526), (5567, 0.52480191), (88028, 0.524764), (86350, 0.52456897), (47731, 0.51842684), (121470, 0.51723295), (96421, 0.51607841), (96264, 0.51073009), (694, 0.50776118), (104296, 0.50635552), (116590, 0.505463), (37276, 0.5003252), (66755, 0.49893752), (35117, 0.49778345), (128069, 0.49649081), (82543, 0.49556777), (79808, 0.49503154), (12099, 0.49441853), (107885, 0.49344942), (89673, 0.49196321), (126876, 0.49158382), (21584, 0.49090981), (69617, 0.49073815), (87978, 0.49037996), (97294, 0.48723334), (31563, 0.4865292), (100804, 0.48433146), (57465, 0.48408645), (109811, 0.48255566), (105884, 0.48051226), (42654, 0.47964546), (118884, 0.47808912), (103516, 0.47752255), (107950, 0.47705063), (38495, 0.47635576), (114553, 0.47445071), (84303, 0.47366774), (116572, 0

In [33]:
'A hazy, light brown color in the glass. Cloudy, strong head with good retention. Hints of apple, wood-grilled peaches, oatmeal, ginger and other sweet spices. Medium bodied and crisp in the mouth, with green apple backed by banana, pink grapefruit, nectarine pit, yeasty dough and accents of coriander and woody spice. Well balanced, integrated and delicious.'

'A hazy, light brown color in the glass. Cloudy, strong head with good retention. Hints of apple, wood-grilled peaches, oatmeal, ginger and other sweet spices. Medium bodied and crisp in the mouth, with green apple backed by banana, pink grapefruit, nectarine pit, yeasty dough and accents of coriander and woody spice. Well balanced, integrated and delicious.'

In [29]:
wine['description'][117574]

'Very dry, crisp, grapefruit and green apple flavors pile through this wine. There is some spice, hints of kiwi and a layer of apple skin tannins. All these intense flavors explode from the glass.'

In [30]:
wine['description'][24082]

'Crisp and stony, this lovely wine is medium-bodied and balanced, with green apple fruit, Asian spice and a voluptuous texture and finish.'

In [31]:
wine['description'][87392]

'Freshness and refreshment are the strong points of this well-balanced, tangy, medium-bodied wine. It is light in fruit aroma and subtle in flavor, offering some crisp apple inspiration on the palate and finish.'