In [None]:
import pandas as pd
import numpy
import nltk
import re
from nltk.corpus import stopwords
from collections import defaultdict 
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import date
import textstat
import readability
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from sklearn.decomposition import LatentDirichletAllocation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Load the raw review data; the cells below read and rewrite its 'text' column.
df = pd.read_csv(filepath_or_buffer='AIT722-data.csv')

In [None]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def dataPreprocess(t):
    """Lower-case a review, strip non-letter characters and drop English stop words.

    Args:
        t: the raw review text (a string).

    Returns:
        A list of the remaining lower-case word tokens, in their original order.
    """
    # Fetch the stop words once per call (and from the corpus only once overall)
    # instead of re-reading the list for every single token; a frozenset also
    # makes each membership test O(1).
    stop_words = _english_stopwords()
    # Remove punctuations (and digits / any other non-ASCII-letter character)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', t.lower())
    return [w for w in letters_only.split() if w not in stop_words]

In [None]:
# Replace each raw review with its list of cleaned tokens.
df['text'] = df['text'].apply(dataPreprocess)

In [None]:
# Lemmatization
def lemmatize_token(tokens):
    """Lemmatize a list of tokens with WordNet, guided by their POS tags.

    The tokens are tagged with ``pos_tag``; the first letter of each tag picks
    the WordNet part of speech passed to the lemmatizer (J -> adjective,
    V -> verb, R -> adverb, anything else -> noun).

    Args:
        tokens: list of word strings.

    Returns:
        A list with one lemma per tagged token, in the same order.
    """
    pos_by_initial = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV}
    wordnet_lemmatizer = WordNetLemmatizer()
    return [
        wordnet_lemmatizer.lemmatize(word, pos_by_initial.get(tag[0], wn.NOUN))
        for word, tag in pos_tag(tokens)
    ]

In [None]:
# Swap every token list for its lemmatized counterpart.
df['text'] = df['text'].apply(lemmatize_token)

In [None]:
# Extracting top 1000 words used in the reviews:
# first glue each review's tokens back into one space-separated string.
df['text'] = df['text'].apply(' '.join)

In [None]:
# Word Frequency Distribution:
# `newText` was not defined by any earlier cell, so a fresh "Restart & Run All"
# failed here with a NameError. Build it from the cleaned reviews instead.
# NOTE(review): assumes the counts are meant to cover every row of df['text'] — confirm.
newText = ' '.join(df['text'])
freq_dist = nltk.FreqDist(word_tokenize(newText))
# top 1000 frequent words
print(freq_dist.most_common(1000))

In [None]:
# Food-related keywords.
# 'parmesan' was previously misspelled as 'paemesan', so it was unlikely to
# ever match a review token.
food = ['chicken','sauce','drink','cheese','burger','salad','meat','rice','dessert','pizza','beef','steak','sushi','bread',
       'soup','pork','egg','shrimp','sandwich','potato','buffet','beer','cake','appetizer','chocolate','ramen','green',
       'coffee','crab','onion','taco','garlic','thai','bacon','rib','wine','chip','bbq','tomato','bean','salmon',
       'mushroom','butter','lobster','seafood','corn','pepper','pancake','pasta','veggie','tuna','korean','mac','n','cocktail',
       'sausage','waffle','salt','asian','oyster','salsa','ingredient','eye','truffle','lemon','pie','chinese','vegan',
       'strawberry','shake','chili','tofu','duck','avocado','banana','lettuce','vegetable','calamari','bake','spinach','crepe',
       'pastry','filet','scallop','bone','fruit','meatball','juice','tempura','cheesecake','apple','mango','orange','coconut',
       'pickle','gravy','hawaiian','brisket','caesar','vegetarian','tortilla','sashimi','pudding','margarita','cucumber',
       'vanilla','tacos','basil','patty','sprout','mayo','soda','guacamole','peanut','latte','lime','syrup','parmesan','vietnamese',
       'ranch','ribeye','octopus','ginger','pineapple','cinnamon','katsu']

In [None]:
# Seed words that are later fed to the Word2Vec model to find similar terms.
features = [
    'chicken', 'drink', 'cheese', 'burger', 'pizza', 'coffee',
    'chocolate', 'wine', 'veggie', 'vegetarian', 'cake', 'rice',
    'meat', 'steak', 'bread', 'pork', 'appetizer',
]

In [None]:
# Word2Vec model

In [None]:
# df['text'] already holds space-joined strings at this point; calling
# ' '.join on a string would put a space between every character.
# Only join rows that are still token lists, so the cell is also safe to re-run.
x = df['text'].apply(lambda x: x if isinstance(x, str) else ' '.join(x))

In [None]:
# One-column frame holding the cleaned review text under the name 'review'.
df1 = df[['text']].rename(columns={'text': 'review'})

In [None]:
# Phrases takes a list of token lists as input, so split every review on whitespace.
reviews = list(map(str.split, df1['review']))

In [None]:
# Collect phrase (bigram) statistics over the tokenized reviews.
# min_count=30 ignores words/bigrams counted fewer than 30 times;
# progress_per=500 controls how often gensim logs progress.
phrases = Phrases(reviews, min_count=30, progress_per=500)

In [None]:
# Wrap the collected phrase statistics in a Phraser and apply it to the reviews.
# `sentences` is the corpus passed to build_vocab() and train() further down.
bigram = Phraser(phrases)
sentences = bigram[reviews]

In [None]:
# word2vec model implementation
# Parameters to word2vec model. No corpus is passed here, so this only
# configures the model; the vocabulary is built and the model trained in the
# following cells.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm against the installed gensim version.
# NOTE(review): no `seed` is set and workers=7 is hard-coded, so results are
# not reproducible run-to-run — confirm this is acceptable.
w2v_model = Word2Vec(min_count=20,      # ignore words with total frequency below 20
                     window=2,          # max distance between current and predicted word
                     size=300,          # dimensionality of the word vectors
                     sample=6e-5,       # downsampling threshold for frequent words
                     alpha=0.03,        # initial learning rate
                     min_alpha=0.0007,  # learning rate decays linearly to this value
                     negative=20,       # number of negative-sampling "noise" words
                     workers=7)         # number of worker threads

In [None]:
# Build vocabulary from the phrase-merged reviews
# (progress_per=500 controls how often progress is logged).
w2v_model.build_vocab(sentences, progress_per=500)

In [None]:
# Train the word2vec model for 30 epochs over `sentences`.
# total_examples reuses the corpus_count recorded on the model;
# report_delay=1 is the number of seconds between progress reports.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
# Precompute L2-normalized vectors. With replace=True the original vectors are
# discarded, so the model cannot be trained further after this call.
# NOTE(review): init_sims is deprecated in gensim 4 — confirm the installed version.
w2v_model.init_sims(replace=True)

In [None]:
# Generate features from the features selected from top 1000 words.
# Each row of featuresGenerated is:
#   [1-based position of the seed word, seed word, similar_word_1, similar_word_2, ...]
featuresGenerated = []
for feature_no, seed_word in enumerate(features, start=1):
    try:
        # similar words generation: a list of (word, similarity) pairs
        neighbours = w2v_model.wv.most_similar(positive=[seed_word])
    except KeyError:
        # The seed word never made it into the model vocabulary (for example
        # it occurs fewer than min_count times). Keep the row so positions
        # still line up with `features`, but without any similar words,
        # instead of aborting the whole loop.
        print("%r is not in the Word2Vec vocabulary; no similar words generated" % seed_word)
        neighbours = []
    featuresGenerated.append([feature_no, seed_word] + [pair[0] for pair in neighbours])


In [None]:
# Additional food-related keywords appended to `food`.
# The literal repeats 'beverage' and re-lists 'mushroom' and 'garlic', which are
# already in `food`; only words not yet present are appended, so no keyword is
# duplicated and re-running this cell does not keep growing the list.
extra_food = ['beverage','lemonade','mimosa','beverage','manchego','bleu','hamburger','cheeseburger','pepperoni',
              'espresso','cappucino','caramel','marshmallow','buterscotch','toffee','riesling','champagne','cabernet',
              'mushroom','buttercream','mousse','fillet','sourdough','baguette','garlic','chasu']
already_listed = set(food)
# dict.fromkeys drops repeats inside extra_food while preserving order.
food.extend(word for word in dict.fromkeys(extra_food) if word not in already_listed)
len(food)