Here we used the csv file created from DataCollection.ipynb to extract features. 
1. Data Pre-processing
2. Lemmatization
3. Extract top 1000 frequently used words
4. Implement word2vec model
5. Binarize the words in the reviews based on features selected
6. Aggregate the features for each restaurant
7. Apply Tf-IDF
8. Save the csv file

In [1]:
import pandas as pd
import numpy
import nltk
import re
from nltk.corpus import stopwords
from collections import defaultdict 
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv('C:/Users/varal/OneDrive - George Mason University/Desktop/MS/AIT 590/Project/AIT590-data.csv')

##### Understanding Data

In [3]:
data.columns

Index(['business_id', 'name', 'city', 'postal_code', 'latitude', 'longitude',
       'Restaurant Rating', 'review_count', 'user_id', 'review_id',
       'User-RestaurantRating', 'text', 'date', 'user-reviewCount'],
      dtype='object')

In [4]:
data.shape

(30199, 14)

In [5]:
data.head()

Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,Restaurant Rating,review_count,user_id,review_id,User-RestaurantRating,text,date,user-reviewCount
0,utIA0LyQmwP-9DRyxUe6qQ,"Snooze, An A.M. Eatery",Phoenix,85016,33.508204,-112.037033,4.0,3515,h0fffFM3GcXll6FsgUGC8g,F6MW-SXeUw4P_NpSRufCpQ,5,Great breakfast joint! \n\nHave been to Snooze...,1/4/2015 4:11,30
1,utIA0LyQmwP-9DRyxUe6qQ,"Snooze, An A.M. Eatery",Phoenix,85016,33.508204,-112.037033,4.0,3515,3cC726zwgerKNNasnidAww,71Je4Eb7kX9vhvbrXsLgRw,4,I should have had the pancakes.\n\nDon't get m...,4/19/2014 21:39,25
2,utIA0LyQmwP-9DRyxUe6qQ,"Snooze, An A.M. Eatery",Phoenix,85016,33.508204,-112.037033,4.0,3515,dEk7mXM4npuDPmxrwPcEgQ,Bx8IxSpyzeZcMQlmVa5hCQ,3,95 minute wait for breakfast? Really? So here ...,11/30/2014 23:45,48
3,utIA0LyQmwP-9DRyxUe6qQ,"Snooze, An A.M. Eatery",Phoenix,85016,33.508204,-112.037033,4.0,3515,771OWzbzelsEeSlx8QsfsQ,egxw1AeFUURKuCZKjBP9XA,2,I so wanted to fall in love with the cuteness ...,12/7/2013 20:25,26
4,utIA0LyQmwP-9DRyxUe6qQ,"Snooze, An A.M. Eatery",Phoenix,85016,33.508204,-112.037033,4.0,3515,_jYEC7fvqTxu5R2jhk_NDQ,6ZgMgiayplF_kA-gBKqj2A,4,"OMG, this place is so so so so so yummy...and ...",4/19/2014 18:05,23


**Data Pre-processing**    
Pre-processing the reviews
1. Converting to lower case
2. Removing stop words, punctuations, digits
3. Lemmatizing the data

In [9]:
def Preprocess(text):
  # Remove Numbers
  text = re.sub(r'[0-9]+', '', text)
  # convert to lower case
  text = text.lower()
  # Remove punctuations and empty spaces
  text = re.sub(r'[^a-zA-Z]',' ', text) 
  STOP_WORDS = stopwords.words('english')
  # word tokenization and remove stop words
  text = [w for w in text.split() if w not in STOP_WORDS]
  return text


In [10]:
#Reviews data
text = data['text']
text = text.apply(lambda x: Preprocess(x))

In [26]:
# Lemmatization
def lemmatize_token(tokens):
    tags = defaultdict(lambda : wn.NOUN)
    tags['J'] = wn.ADJ
    tags['V'] = wn.VERB
    tags['R'] = wn.ADV

    lemmitizer = WordNetLemmatizer()
    new_tokens = []
    for token, tag in pos_tag(tokens):
        lemma = lemmitizer.lemmatize(token, tags[tag[0]])
        new_tokens.append(lemma)
    return new_tokens

In [27]:
text = text.apply(lambda x: lemmatize_token(x))

In [29]:
# Extracting top 1000 words used in the reviews
newText = text.apply(lambda x: ' '.join(x))

In [30]:
newText = ' '.join(newText)

In [31]:
# Word Frequency Distribution:
freq_dist = nltk.FreqDist(word_tokenize(newText))
# top 1000 frequentwords
print(freq_dist.most_common(1000))

[('good', 31558), ('get', 28432), ('food', 24150), ('place', 24004), ('go', 23362), ('order', 21681), ('like', 21601), ('come', 20058), ('time', 18821), ('great', 18263), ('one', 17814), ('really', 15383), ('make', 15303), ('try', 15146), ('well', 13998), ('also', 13237), ('would', 13118), ('service', 12850), ('back', 12586), ('love', 12127), ('restaurant', 12020), ('menu', 11825), ('wait', 10348), ('say', 10228), ('table', 9872), ('drink', 9640), ('think', 9385), ('cheese', 9374), ('fry', 9318), ('take', 9295), ('chicken', 9225), ('even', 9097), ('burger', 9089), ('u', 9042), ('little', 8979), ('delicious', 8925), ('nice', 8914), ('bar', 8602), ('sauce', 8544), ('want', 8112), ('eat', 8103), ('best', 7984), ('pretty', 7966), ('much', 7884), ('look', 7814), ('flavor', 7738), ('taste', 7721), ('dish', 7596), ('side', 7447), ('thing', 7445), ('give', 7304), ('meal', 7207), ('definitely', 7174), ('know', 7088), ('first', 7081), ('night', 7043), ('friend', 6901), ('dinner', 6724), ('could'

In [32]:
# List of selected features 
features = ['cheese', 'chicken', 'burger',  'sauce', 'salad', 'pizza', 'dessert', 'pork', 'egg', 'meat', 'steak', 'beef', 'shrimp', 'bacon', 'fish', 'bbq', 'pasta', 'thai', 'mexican', 'seafood', 'tea', 'italian', 'vegetarian', 'asian', 'goat', 'vegan', 'american', 'chinese','beer', 'wine', 'cocktail', 'shake', 'juice', 'coffee','fry', 'tasty', 'hot', 'spicy', 'crispy', 'grill', 'toast', 'roast', 'yummy', 'salt', 'spice', 'creamy', 'juicy', 'fried', 'crisp', 'bake', 'crunchy', 'sour', 'greasy', 'chewy','friendly', 'fantastic',  'classic', 'incredible', 'authentic', 'overly']

WORD 2 vec Model

In [33]:
text = text.apply(lambda x: ' '.join(x))

In [16]:
df = pd.DataFrame({'review': text})

In [17]:
# phrases take list of list as input
reviews = [row.split() for row in df['review']]

In [18]:
phrases = Phrases(reviews, min_count=30, progress_per=500)

In [19]:
bigram = Phraser(phrases)
sentences = bigram[reviews]

In [20]:
# word2vec model implementation
# Parameters to word2vec model
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=7)

In [21]:
# Build vocabulary
w2v_model.build_vocab(sentences, progress_per=500)

In [22]:
#Train the word2vec Model
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(37363694, 78770070)

In [23]:
w2v_model.init_sims(replace=True)

In [25]:
# Generate features from the features selected from top 1000 words
featuresGenerated = []
i = 0
for j in features:
    # similar words generation
    x = w2v_model.wv.most_similar(positive=[j])
    y = []
    i +=1
    y.append(i)
    y.append(j)
    for k in x:
        t = k[0]
        y.append(t) 
    featuresGenerated.append(y)


In [36]:
finalFeatures = ['chorizo', 'rib' , 'brisket', 'hanger_steak', 'filet', 'spencer', 'ribeye', 'skirt', 'tomahawk', 'filet','mignon',
                'duck',  'meatball', 'lamb', 'turkey', 'calamari', 'rotisserie', 'breast', 'loin', 'angus', 'spare', 'shank', 'sheep',
                'cream', 'chocolate', 'cake', 'waffle', 'biscuit', 'truffle', 'pudding', 'honey', 'crepe', 'caramel', 'jam', 'sugar', 'vanilla', 'pastry', 'cheesecake', 'gelato', 'cinnamon', 'cookie', 'chocolate_mousse', 'tiramisu', 'brownie', 'oreo', 'whipped', 'muffin', 'croissant',
                'potato', 'onion', 'tomato', 'corn', 'bean', 'mushroom', 'garlic', 'lemon', 'pickle', 'vegetable', 'chili', 'spinach', 'lettuce', 'peanut', 'olive', 'basil', 'cucumber', 'jalapeno', 'cilantro', 'carrot', 'endive', 'green', 'portobello',
                'avocado', 'banana', 'fruit', 'strawberry', 'apple', 'orange', 'pineapple', 'coconut', 'mango', 'papaya', 'kiwi', 'guava',
                'margarita', 'ipas', 'hefeweizen', 'chianti', 'vino', 'malbec', 'pinot', 'sauvignon', 'negroni', 'gimlet', 'mojito', 'gin',
                'salsa', 'pepper', 'guacamole', 'cheddar', 'aioli', 'parmesan', 'mozzarella', 'mayo', 'mayonnaise', 'marinara', 'fraiche',
                'bread', 'pancake', 'pie', 'burrito', 'donut', 'flatbread', 'omelet', 'scramble', 'cereal',
                'taco', 'sandwich', 'roll', 'rice', 'bun', 'dog', 'sausage', 'tortilla', 'savory', 'caesar', 'chowder', 'etouffee',
                'pasta', 'risotto', 'dumpling', 'biancoverde', 'tagliatelle', 'carbonara', 'pappardelle', 'spaghetti', 'penne', 'bucatini', 'lasagna', 'fettuccine','linguini', 'spaghetti', 'spaetzle', 'focaccia',
                'chip', 'pretzel', 'patty', 'cheeseburger', 'gourmet', 'delux', 'ronin', 'hamburger', 'pepperoni', 'french', 'parmesan', 'rotisserie', 'hashbrowns', 'croquette', 'tots',
                'shrimp', 'crab', 'salmon', 'lobster', 'tuna', 'oyster', 'scallop', 'mussel', 'squid', 'tilapia', 'hamachi', 'sear', 'halibut', 'albacore', 'tilapia',
                'sushi', 'noodle', 'ramen', 'tofu', 'kalbi', 'congee', 'dim',
                'mimosa', 'lime', 'shot', 'milkshake',
                'provolone', 'gouda', 'manchego', 'asiago', 'white', 'gruyere', 'cheddar', 'havarti', 'jack', 'mancheg',
                'tinga', 'bean',
                'cobb', 'argula', 'caesar',
                'tom', 'panang','curry', 'massaman', 'kee', 'pad',
                'milk', 'iced', 'lattes','french', 'matcha', 'cappuccino',
                'taiwanese']

In [38]:
finalFeatures.extend(features)
len(finalFeatures)

272

In [39]:
# Total 274 features are collected

In [40]:
# Binarize the number of words selected from finalFeatures where 1 word exist 0 not exists
def binarize(finalFeatures, text):
    tokens = word_tokenize(text)
    binaryText = []
    for item in finalFeatures:
        if item in tokens:
            binaryText.append(1)
        else:
            binaryText.append(0)
    return binaryText

In [41]:
binaryText = text.apply(lambda x: binarize(finalFeatures, x))

In [50]:
len(sum(list(binaryText[2:3]),[]))

272

In [33]:
# Now each record is coded in terms of these 274 features


In [34]:
dfBinaryText = pd.DataFrame({'review': binaryText})
dfBinaryText.shape

(30199, 1)

In [35]:
dfBinaryText['business'] = data['business_id'] 
dfBinaryText['latitude'] = data['latitude']
dfBinaryText['longitude'] = data['longitude']
dfBinaryText['stars'] = data['Restaurant Rating']
dfBinaryText['city'] = data['city']

In [36]:
#Aggregating the records(Reviews) of every restaurant, creating a row for each restaurant
dfAggregate = (dfBinaryText['review'].groupby([dfBinaryText.business,dfBinaryText.latitude, dfBinaryText.longitude,dfBinaryText.stars,dfBinaryText.city]).apply(list)).to_frame().reset_index()
dfAggregate[5:10]

Unnamed: 0,business,latitude,longitude,stars,city,review
5,#NAME?,36.168783,-115.139913,4.0,Las Vegas,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
6,-050d_XIor1NpCuWkbIVaQ,33.456696,-112.072327,4.0,Phoenix,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
7,-7H-oXvCxJzuT42ky6Db0g,40.470783,-79.96025,3.5,Pittsburgh,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
8,01fuY2NNscttoTxOYbuZXw,35.229128,-80.867464,4.0,Charlotte,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
9,042IHd5KjHiMuBtGtugO_g,33.584063,-111.97976,4.0,Phoenix,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [37]:
dfAggregate.shape

(500, 6)

In [39]:
# Each record contains list of list, we are converting that into 1 list. That is combining all the vectors into one vector
a = dfAggregate['review'].apply(lambda x: (sum(x,[])))
dfAggregate1 = pd.DataFrame({'review':a})
dfAggregate1['business'] = dfAggregate['business']
dfAggregate1['stars'] = dfAggregate['stars']
dfAggregate1['city'] = dfAggregate['city']
dfAggregate1['latitude'] = dfAggregate['latitude']
dfAggregate1['longitude'] = dfAggregate['longitude']
dfAggregate = dfAggregate1

In [40]:
# Pre-process data to Construct TF-IDF 
def convertToWord(text, finalFeatures):
    wordForm = []
    n = len(finalFeatures)
    index = 0
    for item in text:
        if item == 1:
            wordForm.append(finalFeatures[index])   
        index +=1
        if index == n:
                index = 0
    return wordForm

In [41]:
dfWord = dfAggregate['review'].apply(lambda x: convertToWord(x,finalFeatures))

In [42]:
dfWord = pd.DataFrame({'review':dfWord})
dfWord['business'] = dfAggregate['business']

In [43]:
dfWord[5:10]
df = dfWord

In [44]:
import numpy as np
x = dfWord['review'].apply(lambda x: ' '.join(x))
x = ' '.join(x)
x = word_tokenize(x)
x
len(set(x))

258

In [45]:
# TF-IDF 
temp = dfWord['review'].apply(lambda x: ' '.join(x))
v = TfidfVectorizer()
x = v.fit_transform(temp)
# converting the data to dataframe
df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
resultTfIDF = pd.concat([dfWord, df1], axis=1)
resultTfIDF[5:10]

Unnamed: 0,review,business,aioli,albacore,american,angus,apple,argula,asiago,asian,...,vanilla,vegan,vegetable,vegetarian,vino,waffle,whipped,white,wine,yummy
5,"[rib, potato, onion, tomato, garlic, vegetable...",#NAME?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020644,...,0.0,0.010494,0.016088,0.0,0.0,0.0,0.0,0.031826,0.014518,0.012627
6,"[waffle, egg, bacon, american, fry, waffle, ba...",-050d_XIor1NpCuWkbIVaQ,0.0,0.0,0.015693,0.0,0.016032,0.0,0.0,0.0,...,0.017481,0.018934,0.0,0.013976,0.0,0.311309,0.0,0.02297,0.0,0.045567
7,"[potato, ipas, bread, flatbread, sandwich, chi...",-7H-oXvCxJzuT42ky6Db0g,0.086986,0.0,0.027238,0.0,0.027826,0.038104,0.0,0.0,...,0.0,0.016432,0.0,0.012129,0.0,0.0,0.0,0.009967,0.0341,0.049431
8,"[potato, chili, mayo, cheese, salad, vegan, be...",01fuY2NNscttoTxOYbuZXw,0.0,0.0,0.008753,0.0,0.0,0.0,0.0,0.020777,...,0.009751,0.063366,0.0,0.109139,0.0,0.310072,0.017208,0.153748,0.021916,0.025416
9,"[skirt, garlic, lemon, green, avocado, aioli, ...",042IHd5KjHiMuBtGtugO_g,0.100976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065972,0.0


In [46]:
resultTfIDF['stars'] = dfAggregate['stars']
resultTfIDF['city'] = dfAggregate['city']
resultTfIDF['latitude'] = dfAggregate['latitude']
resultTfIDF['longitude'] = dfAggregate['longitude']

In [47]:
resultTfIDF.shape

(500, 264)

In [48]:
resultTfIDF.to_csv('FinalFeatures1.csv')