# Shakespeare Word Frequency 

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
with open('Macbeth Shakespeare.txt') as f:
    x = f.read()

### Tokenization

In [None]:
sent = sent_tokenize(x)
print(sent)

In [None]:
print(word_tokenize(sent[1]))

In [None]:
words = []
for s in sent:
    for w in word_tokenize(s):
        words.append(w)

In [None]:
print(words)

### Stopword removal

In [None]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

In [None]:
print(punctuation)

In [None]:
print(stopwords.words('english'))

In [None]:
print(list(punctuation))

In [None]:
myStopWords = list(punctuation) + stopwords.words('english')

In [None]:
wordsNoStop = []
for i in words:
    if i not in myStopWords:
        wordsNoStop.append(i)
print(wordsNoStop)

In [None]:
wordsNoStopComp = [w for w in words if w not in myStopWords]
print(wordsNoStopComp)

### 20 Most frequent words in Macbeth

In [None]:
from nltk.corpus import wordnet

In [None]:
words = word_tokenize(x.lower())

In [None]:
words

In [None]:
wordsNoStopWords = [w for w in words if w not in myStopWords]

In [None]:
wordsNoStopWords

In [None]:
from nltk.probability import FreqDist

In [None]:
freq = FreqDist(wordsNoStopWords)

In [None]:
freq

In [None]:
for i in sorted(freq, key=freq.get, reverse=True)[:20]:
    print(i,freq[i])

#### With exception of "though," "enter," "us" and other similar words, others give a fair impression of words that a reader would encounter in the play Macbeth. 

# Yelp sentiments (Howlin' Ray's in Chinatown)

In [None]:
from nltk.sentiment import vader
nltk.download('vader_lexicon')

In [None]:
sia = vader.SentimentIntensityAnalyzer()

In [None]:
review_1 = '''
5 stars: It's been a first world problem dealing with hot "sandies" in LA. 
But nothing - ever - compares to Howlin' Ray's. 
My wife and I have tried them all and nothing comes close. 
Some things are a pale comparison but alas, no they don't compare.
'''

In [None]:
sia.polarity_scores(review_1)

It appears that the review itself is more positive than what the sentiment analysis shows

In [None]:
review_2 = '''
4 stars: Omg! They're not playing when it comes to spice...
Be careful... be VERY careful. But be ready for something delicious.
'''

In [None]:
sia.polarity_scores(review_2)

The sentiment analysis is a little generous in its assessment of this review that warns 
potential customers about spice levels of the sandos

In [None]:
review_3 = '''
5 stars: Overall, I found the sandwich very enjoyable. 
The chicken was fresh and juicy, and the buns were noteworthy. 
Would I wait over an hour for one of these sandwiches? Definitely not. 
That said, I'm happy to have finally tried this ultimate foodie destination 
that has been on my list forever. It was delicious!
'''

In [None]:
sia.polarity_scores(review_3)

A neutral review indeed.

In [None]:
review_4 = '''
5 stars: The Nashville original fried chicken sandwich here 
still tops any chicken sandwich I've ever tried.
'''

In [None]:
sia.polarity_scores(review_4)

This is a more positive than a neutral one

In [None]:
review_5 = '''
5 stars: We order the Sandos with mild spicy level. 
I am a little bit of a chicken to go any higher. 
There is quite a jump in slickness from mild to medium. YOU ARE WARNED! 
I cannot imagine how spicy Hot or Howlin is. I be dead, Forreals. 
The Sando is ABSOLUTELY AMAZING! Probably the best chicken sandwich I had... ever! 
The chicken is super juicy, their coleslaw mix is delish, as well as their pickles! 
You will not not like their Sando, how can you?
'''

In [None]:
sia.polarity_scores(review_5)

Certainly a very positive review rather than just neutral

In [None]:
review_6 = '''
4 stars: We ordered the sando in hot and the wings in medium. There is a distinct 
heat from medium to hot with hot having more of the hot oil taste. 
The sando tasted delicious though! Juicy chicken with a nice ratio of less bread, 
equal coleslaw and big chicken. The wings were a nice crisp level to the skin but 
less and medium was much more manageable to eat without any side to slice the heat.
'''

In [None]:
sia.polarity_scores(review_6)

Agree with the rating here

In [None]:
review_7 = '''
5 stars: This is the standard that all fried chicken should be measured by.  
Juicy, flavorful, fresh, and just absolutely amazing!  But be careful: 
even their medium will make you sweat! Their fries are also absolutely amazing!  
There are some other competition around towb but Howlin' Ray's is hands down the best.
'''

In [None]:
sia.polarity_scores(review_7)

A more positive review than neutral in my opinion

In [None]:
review_8 = '''
4 stars: The chicken is cooked perfectly. It's moist, juicy and tender. 
It's piping hot and flavorful.  The only thing that I would say is that the wings were 
a little too salty from all the powdered seasoning. 
Fries were perfectly crisp outside and fluffy inside.  
The sandwich is so well balanced.
'''

In [None]:
sia.polarity_scores(review_8)

There is a negative note to the review regarding salt level in the wings but Vader didn't seem to catch on

In [None]:
review_9 = '''
5 stars: We got the chicken sandwich, tenders and fries in both mild and medium plus. 
The mild didn't taste spicy to me at all which is fine since i can't handle it anyways. 
My friends said the medium plus was spicy but still very tasty. The chicken was super juicy and crispy. 
Literally dripping. Loved the slaw and pickles. Fries were lightly seasoned and went well with 
extra sauce we ordered. This place definitely lives up to the hype for their Nashville hot chicken. 
Highly recommend!
'''

In [None]:
sia.polarity_scores(review_9)

The review is very positive

In [None]:
review_10 = '''
4 stars: We enjoy spicy food, but medium was a little spicier than I would have wanted. 
It seems to be the kind that's spicy just to be spicy, not to add flavor. 
I'd go down a level next time. Other than that, the sandwich was good.
'''

In [None]:
sia.polarity_scores(review_10)

This is a more neutral review, perhaps even with negative undertones about the medium spice level

In [None]:
review_11 = '''
5 stars: The Sando is Howlin Ray's classic hot chicken sandwich, and it is my favorite item on the menu. 
The thick boneless chicken breast is always so juicy, crispy, and flavorful. 
The cool, tangy slaw cuts the heat and adds crunch and texture to every bite. 
All of this sits between two buttered brioche buns that are slathered with their creamy comeback sauce. 
I like ordering the x-hot level. I'm seriously drooling just typing this out.
'''

In [None]:
sia.polarity_scores(review_11)

There isn't much negativity in the review above, it is very positive

In [None]:
review_12 = '''
3 stars: Is the chicken here good? Yes. 
Is it worth the 2 hour wait + price? No. With the time I spent waiting, 
I could've driven back to San Diego and bought a 10-piece dark meat  
fried chicken place from my favorite spot 
(it's glorious and as good as Howlin' Rays) for $10.99.
'''

In [None]:
sia.polarity_scores(review_12)

Mostly negative review

In [None]:
review_13 = '''
4 stars for the wings: Unfortunately I have less to say about the wings cause 
I only got to take a nibble. I mainly bought it to take home to my parents and 
I definitely still really enjoyed them. I went for a slightly spicier version and 
my tolerance has weakened so the medium was perfect for me. The wings also don't have 
those other elements to help cut the spice so it was tasty but not as tasty as the sandwich. 
It does come with pickles and bread but it's not quite the same. 
Still delicious but I'll definitely always go for the sandwich.
'''

In [None]:
sia.polarity_scores(review_13)

A more negative than positive review

In [None]:
review_14 = '''
5 stars: FIERY GOOD! We ordered pick-up from this restaurant and took it back home (maybe 10 min walk). 
We got an order of The Sando as well as the Mario Style Fries and let me tell you - explosion of flavor. 
Hot flavor! Be careful with what spice level you choose because medium had me sweating a little. 
The chicken is seasoned so well and you get so much crunch in each bite. 
The comeback sauce had me wanting more and the fries were such a good pair. 
I totally recommend this kicking and buttery sandwich!
'''

In [None]:
sia.polarity_scores(review_14)

A very positive review

In [None]:
review_15 = '''
5 stars: Situated in a shopping plaza in downtown La. Waited a good 2+hours in the summer time of 2019. 
I ordered the fried chicken sando with medium heat. Let me tell you this has a kick to it. 
And boy it is worth the 2+ hour wait. THE BEST NASHVILLE HOT SANDWICH EVER. 
And I've been to over 50+ Nashville hot chicken places and this is the king of them all. 
You get the crunchiness of the chicken and the tenderness on the inside as well. 
The salt and garlic and pepper taste of the seasoning plays well with all the other ingredients. 
The owner Johnny Zone came up to me and greeted me and told me I was dressed nicely and 
hooked me up with another sandwich and fries! I like this place so much so that I've been coming 
8 times already. And yes I've been waiting in line over 2+ hours each time. 
That's how good it is. Highly highly recommend this place to others
'''

In [None]:
sia.polarity_scores(review_15)

This is a very positive review

# Movie reviews

In [1]:
import pandas as pd
from pathlib import Path  
import glob

In [3]:
directory_path = 'moviereviews'
text_files = glob.glob(f"{directory_path}/*")

In [4]:
listofreviews = []
for i in text_files:
    try:
        with open(i) as f:
            listofreviews.append(f.read())
    except:
        pass

In [5]:
len(text_files)

10

In [6]:
len(listofreviews)

10

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [10]:
extrastop = ['’', '“','”']

In [11]:
myStopWords = list(punctuation) + stopwords.words('english') + extrastop

In [12]:
[w for w in word_tokenize(listofreviews[0].lower()) if w not in myStopWords]

['times',
 'steven',
 'knight',
 'script',
 'over-eggs',
 'imagery',
 'least',
 'recurrent',
 'motif',
 'pheasants',
 'beautiful',
 'bright',
 'birds',
 'bred',
 'shooting',
 'subplot',
 'bertie',
 'scarecrow',
 'wearing',
 'coat',
 'diana',
 'papa',
 'also',
 'visions',
 'anne',
 'boleyn',
 'beheaded',
 'royal',
 'husband',
 'could',
 'replace',
 'another',
 'woman',
 'amid',
 'madness',
 'royal',
 'life',
 'presence',
 'seems',
 'weirdly',
 'unintrusive',
 'affecting',
 'ecstatic',
 'montage',
 'stewart',
 'dances',
 'way',
 'chapters',
 'diana',
 'life',
 'ballet',
 'bops',
 'breaking',
 'running',
 'canter',
 'whiff',
 'freedom',
 'presents']

In [13]:
listofreviews2 = []
for i in listofreviews:
    listofreviews2.append([w for w in word_tokenize(i.lower()) if w not in myStopWords])

In [14]:
listofreviews2[0]

['times',
 'steven',
 'knight',
 'script',
 'over-eggs',
 'imagery',
 'least',
 'recurrent',
 'motif',
 'pheasants',
 'beautiful',
 'bright',
 'birds',
 'bred',
 'shooting',
 'subplot',
 'bertie',
 'scarecrow',
 'wearing',
 'coat',
 'diana',
 'papa',
 'also',
 'visions',
 'anne',
 'boleyn',
 'beheaded',
 'royal',
 'husband',
 'could',
 'replace',
 'another',
 'woman',
 'amid',
 'madness',
 'royal',
 'life',
 'presence',
 'seems',
 'weirdly',
 'unintrusive',
 'affecting',
 'ecstatic',
 'montage',
 'stewart',
 'dances',
 'way',
 'chapters',
 'diana',
 'life',
 'ballet',
 'bops',
 'breaking',
 'running',
 'canter',
 'whiff',
 'freedom',
 'presents']

In [15]:
from nltk.stem.porter import PorterStemmer

In [16]:
p_stemmer = PorterStemmer()

In [17]:
listOfStemmedWords = []
for i in listofreviews2:
    listOfStemmedWords.append([p_stemmer.stem(w) for w in i])

In [18]:
listOfStemmedWords[0]

['time',
 'steven',
 'knight',
 'script',
 'over-egg',
 'imageri',
 'least',
 'recurr',
 'motif',
 'pheasant',
 'beauti',
 'bright',
 'bird',
 'bred',
 'shoot',
 'subplot',
 'berti',
 'scarecrow',
 'wear',
 'coat',
 'diana',
 'papa',
 'also',
 'vision',
 'ann',
 'boleyn',
 'behead',
 'royal',
 'husband',
 'could',
 'replac',
 'anoth',
 'woman',
 'amid',
 'mad',
 'royal',
 'life',
 'presenc',
 'seem',
 'weirdli',
 'unintrus',
 'affect',
 'ecstat',
 'montag',
 'stewart',
 'danc',
 'way',
 'chapter',
 'diana',
 'life',
 'ballet',
 'bop',
 'break',
 'run',
 'canter',
 'whiff',
 'freedom',
 'present']

In [19]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.1.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.1.2 smart-open-5.2.1


In [20]:
from gensim import corpora, models
import gensim

In [21]:
dictionary = corpora.Dictionary(listOfStemmedWords)

In [22]:
print(dictionary.token2id)

{'affect': 0, 'also': 1, 'amid': 2, 'ann': 3, 'anoth': 4, 'ballet': 5, 'beauti': 6, 'behead': 7, 'berti': 8, 'bird': 9, 'boleyn': 10, 'bop': 11, 'break': 12, 'bred': 13, 'bright': 14, 'canter': 15, 'chapter': 16, 'coat': 17, 'could': 18, 'danc': 19, 'diana': 20, 'ecstat': 21, 'freedom': 22, 'husband': 23, 'imageri': 24, 'knight': 25, 'least': 26, 'life': 27, 'mad': 28, 'montag': 29, 'motif': 30, 'over-egg': 31, 'papa': 32, 'pheasant': 33, 'presenc': 34, 'present': 35, 'recurr': 36, 'replac': 37, 'royal': 38, 'run': 39, 'scarecrow': 40, 'script': 41, 'seem': 42, 'shoot': 43, 'steven': 44, 'stewart': 45, 'subplot': 46, 'time': 47, 'unintrus': 48, 'vision': 49, 'way': 50, 'wear': 51, 'weirdli': 52, 'whiff': 53, 'woman': 54, 'around': 55, 'asphalt': 56, 'audienc': 57, 'beaten': 58, 'ben': 59, 'beverag': 60, 'charg': 61, 'come': 62, 'contriv': 63, 'cours': 64, 'discombobul': 65, 'drift': 66, 'drink': 67, 'dude': 68, 'entir': 69, 'experi': 70, 'film': 71, 'gazzara': 72, 'glass': 73, 'hey': 7

In [23]:
corpus = [dictionary.doc2bow(text) for text in listOfStemmedWords]

In [24]:
print(corpus[3])

[(21, 1), (27, 1), (69, 1), (71, 2), (125, 2), (129, 1), (132, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 2), (139, 1), (140, 1), (141, 1), (142, 1), (143, 1), (144, 1), (145, 1), (146, 1), (147, 1), (148, 1), (149, 1), (150, 1), (151, 1), (152, 1), (153, 1), (154, 1), (155, 1), (156, 1), (157, 1), (158, 1), (159, 1), (160, 1), (161, 1), (162, 2), (163, 1), (164, 2), (165, 1), (166, 1), (167, 1), (168, 1), (169, 1), (170, 1), (171, 1), (172, 1), (173, 1), (174, 1), (175, 1)]


In [27]:
print(dictionary.token2id['vision'])

49


In [28]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=2, 
                                           id2word = dictionary, 
                                           passes=20)

In [29]:
for i in ldamodel.print_topics(num_topics=2, num_words=20):
    print(i)

(0, '0.010*"alway" + 0.008*"\'s" + 0.008*"entir" + 0.008*"scene" + 0.008*"strang" + 0.008*"macbeth" + 0.006*"could" + 0.006*"life" + 0.006*"sondheimian" + 0.006*"royal" + 0.006*"larson" + 0.006*"dude" + 0.006*"give" + 0.006*"come" + 0.006*"diana" + 0.006*"seriou" + 0.006*"sens" + 0.006*"witch" + 0.006*"bird" + 0.006*"charm"')
(1, '0.009*"movi" + 0.009*"film" + 0.009*"look" + 0.009*"use" + 0.009*"set" + 0.009*"love" + 0.009*"\'s" + 0.006*"moment" + 0.006*"one" + 0.006*"us" + 0.006*"gucci" + 0.006*"gaga" + 0.006*"python" + 0.006*"—" + 0.006*"ladi" + 0.006*"shot" + 0.006*"brian" + 0.006*"want" + 0.006*"let" + 0.006*"burnham"')


The program unfortunately did not create the categories based on comedy vs drama reviews. Perhaps longer reviews with a larger sample size would have been better for this. There are unique names from titles in both categories without a separation between what should be a comedy and what should be a drama.