In [6]:
nltk, spacy 
'esc' + 'a' ===> new cell above
'esc' + 'b' ===> new cell below
'esc' + 'dd'--> delete a cell
shift + enter

In [11]:
!pip install spacy
!python -m spacy download en_core_web_sm 
#en_core_web_md, en_core_web_lg, en

[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


# Natural Language Understanding 

*Generic Data cleaning steps to take:*

**1. Converting text to lower case** 
* if your data is case insensitive, you'll need to consider your usecase as well
* some exceptions, if there're acronyms, which may lose their meaning if converted to word, e.g. in german language, MIT (university) vs mit (german for with)

**2. Removing punctuations**
* exceptions, twitter data, where punctuations may be good in showing sentiment

**3. Removing any numerical values**

**4. Tokenize text**
* different ways to tokenize the text

**5. Remove stop words**
* stop words can be altered to be usecase specific, 
* e.g. adding restaurant name to list of stopwords for restaurant review, as the people will usually give the restaurant name in the review
* e.g. twitter you may want to add words like 'RT' 'Retweet' etc for tweets


**6. Stemming / lemmatization** 
* different variation of words due to suffix, but the meaning of the word is the same, use of the word is identical, to prevent duplication of words being stored and improve efficiency
* lemmatization result is a proper word, unlike stemming


**7. Parts of speech tagging**
* good for identifying, summarising, to get e.g. retrieval of nouns from large pieces of texts (NNP)

**8. Create bi-grams or tri-grams**

**9. Deal with typos and spelling mistakes**

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')


# Tokenisation

In [8]:
regular expression

SyntaxError: invalid syntax (<ipython-input-8-890fd7fe702d>, line 1)

In [13]:
para = nlp('''Hello Mr. Tan, this is the first sentence. 
           This is the second sentence. 
           Notice, the abbreviations, e.g. Mr. Goh, Ms. Ng, do not get tokenised wrongly.''')

In [14]:
for sent in para.sents:
    print(sent)

Hello Mr. Tan, this is the first sentence.

           
This is the second sentence.

           Notice, the abbreviations, e.g. Mr. Goh, Ms. Ng, do not get tokenised wrongly.


In [15]:
sentence = nlp('I went to the nearby grocers to buy ten oranges and 5 apples.')

In [16]:
all_stopwords = nlp.Defaults.stop_words

In [17]:
for word in sentence:
    print(word, word.is_stop)

I True
went False
to True
the True
nearby False
grocers False
to True
buy False
ten True
oranges False
and True
5 False
apples False
. False


In [18]:
tokens = [token.text for token in sentence if not token.is_stop]


In [19]:
tokens

['went', 'nearby', 'grocers', 'buy', 'oranges', '5', 'apples', '.']

In [20]:
all_stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [21]:
type(all_stopwords)

set

In [22]:
all_stopwords.add('retweet') # RT,

In [23]:
all_stopwords.update({'zooloo','zzz'})

In [24]:
all_stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [25]:
all_stopwords.remove('single') #discard
all_stopwords.difference_update({'multiple','to','remove'})

KeyError: 'single'

# Lemmatization

In [26]:
doc = nlp('She was better than the best in atheletics. She was a great runner. Running was her passion.')

In [27]:
for token in doc:
    print(token, token.lemma_)

She she
was be
better well
than than
the the
best good
in in
atheletics atheletic
. .
She she
was be
a a
great great
runner runner
. .
Running running
was be
her her
passion passion
. .


In [29]:
doing stemming -- > NLTK natural language toolkit for python, porter stemmer and snowball stemmer

SyntaxError: invalid syntax (<ipython-input-29-e1107cd3b8a1>, line 1)

# Part of Speech Tags

In [None]:
# For each token, print corresponding part of speech tag
for token in para:
    print('{} - {}'.format(token, token.pos_))

In [None]:
sentence = nlp('He left his keys in his left pocket.')

for token in sentence:
    print('{} - {}'.format(token, token.pos_))

# Named Entity Recognition

In [None]:
finetune spacy language model

In [None]:

# Print all named entities with named entity types

doc_2 = nlp("I went out with Jane to the Apple store in Orchard to buy an iphone. It cost me $800. After that I stopped for an apple pie.")
for ent in doc_2.ents:
    print('{} - {}'.format(ent, ent.label_))

# Sentiment Analysis

Data Source: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews


The dataset is comprised of tab-separated files with phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for the purposes of benchmarking, but the sentences have been shuffled from their original order. Each Sentence has been parsed into many phrases by the Stanford parser. Each phrase has a PhraseId. Each sentence has a SentenceId. Phrases that are repeated (such as short/common words) are only included once in the data.

train.tsv contains the phrases and their associated sentiment labels. We have additionally provided a SentenceId so that you can track which phrases belong to a single sentence.
test.tsv contains just phrases. You must assign a sentiment label to each phrase.
The sentiment labels are:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive

In [None]:
df = pd.read_csv('train.tsv', sep='\t')

In [None]:
Sentiment_count = df.groupby('Sentiment').count()

plt.bar(Sentiment_count.index.values, Sentiment_count['Phrase'])
plt.xlabel('Review Sentiments')
plt.ylabel('Number of Review')
plt.show()

In [None]:
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)

text_counts= cv.fit_transform(df['Phrase'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(text_counts, df['Sentiment'], test_size=0.3, random_state=1)

In [None]:
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)

print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

In [None]:
tf = TfidfVectorizer()

text_tf = tf.fit_transform(df['Phrase'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(text_tf, df['Sentiment'], test_size=0.3, random_state=123)

In [None]:
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)

print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))



1. **TextBlob Module:** Linguistic researchers have labeled the sentiment of words based on their domain expertise. Sentiment of words can vary based on where it is in a sentence. The TextBlob module allows us to take advantage of these labels.
2. **Sentiment Labels:** Each word in a corpus is labeled in terms of polarity and subjectivity (there are more labels as well, but we're going to ignore them for now). A corpus' sentiment is the average of these.
   * **Polarity**: How positive or negative a word is. -1 is very negative. +1 is very positive.
   * **Subjectivity**: How subjective, or opinionated a word is. 0 is fact. +1 is very much an opinion.

For more info on how TextBlob coded up its [sentiment function](https://planspace.org/20150607-textblob_sentiment/).

Let's take a look at the sentiment of the various transcripts, both overall and throughout the comedy routine.

In [None]:
#!pip install spacytextblob

In [None]:
from spacytextblob.spacytextblob import SpacyTextBlob
spacy_text_blob = SpacyTextBlob()
nlp.add_pipe(spacy_text_blob)
text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'
doc = nlp(text)


In [None]:
doc._.sentiment.polarity      # Polarity: -0.125

In [None]:
doc._.sentiment.subjectivity  # Sujectivity: 0.9

In [None]:
doc._.sentiment.assessments  

In [None]:
def scrape_lyrics(url):
    page = requests.get(url)
    html = BeautifulSoup(page.text, "html.parser")
    lyrics = html.find("pre", class_="lyric-body").get_text()
    print(url)
    #print(lyrics)
    return lyrics.replace("\n"," ")


links = ['https://www.lyrics.com/lyric/36863481/Justin+Bieber/Yummy',
     'https://www.lyrics.com/lyric/35362456/Ed+Sheeran/Castle+on+the+Hill',
     'https://www.lyrics.com/lyric/35342586/Taylor+Swift/22',
     'https://www.lyrics.com/lyric/36147543/Kygo/Happy+Now',
     'https://www.lyrics.com/sublyric/58125/Lauv/Superhero',
     'https://www.lyrics.com/lyric/30514737/Fix+You',
     'https://www.lyrics.com/lyric/32981724/One+Direction/Perfect',
     'https://www.lyrics.com/lyric/36489666/Bahari/Crashing',
     'https://www.lyrics.com/lyric/33787626/ROZES/Matches',
     'https://www.lyrics.com/lyric/36341880/Maroon+5/She+Will+Be+Loved',
     'https://www.lyrics.com/lyric/25306933/Queen/Dont+Stop+Me+Now',
     'https://www.lyrics.com/lyric/31781320/Eric+Clapton/Tears+In+Heaven']


lyrics = [scrape_lyrics(link) for link in links]

artists = ['JustinBieber', 'EdSheeran', 'TaylorSwift', 'Kygo', 'Lauv', 'Coldplay', 'OneDirection','Bahari','Rozes','Maroon5', 'Queen', 'EricClapton']

#fun fact: queen dont stop me now is apparently the happiest song, and eric clapton is supposedly a sad song
# https://www.indy100.com/article/dont-stop-me-now-is-the-happiest-song-in-the-world-according-to-a-neuroscientist-7318321
#but there's more to the what affects the sentiment of the song, not just the lyrics, e.g. tempo

In [None]:
df = pd.DataFrame({'Lyrics':lyrics}, index=artists)

pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity
df['polarity'] = df['Lyrics'].apply(pol)
df['subjectivity'] = df['Lyrics'].apply(sub)

plt.rcParams['figure.figsize'] = [10, 8]

for artist in df.index:
    x = df.polarity.loc[artist]
    y = df.subjectivity.loc[artist]
    plt.scatter(x, y, color='green')
    plt.text(x+.001, y+.001, artist, fontsize=10)
    plt.xlim(-.7, .7) 
    plt.ylim(0,1) 
    
plt.title('Sentiment Analysis', fontsize=20)
plt.xlabel('<-- Negative ---------- Positive -->', fontsize=15)
plt.ylabel('<-- Facts -------- Opinions -->', fontsize=15)

plt.show()