### NLP Refresher

In [1]:
text = ["System of the World. By Isaac Newton", "   Snow Crash  .  By Neal Stephenson ",
       " AFROFUTURISM. by     Ytasha L. Womack "]

In [2]:
strip_whitespace = [string.strip() for string in text]

In [3]:
strip_whitespace

['System of the World. By Isaac Newton',
 'Snow Crash  .  By Neal Stephenson',
 'AFROFUTURISM. by     Ytasha L. Womack']

In [4]:
remove_periods = [string.replace(".","") for string in strip_whitespace]

In [5]:
remove_periods

['System of the World By Isaac Newton',
 'Snow Crash    By Neal Stephenson',
 'AFROFUTURISM by     Ytasha L Womack']

In [8]:
upper = [string.upper() for string in strip_whitespace]

In [9]:
upper

['SYSTEM OF THE WORLD. BY ISAAC NEWTON',
 'SNOW CRASH  .  BY NEAL STEPHENSON',
 'AFROFUTURISM. BY     YTASHA L. WOMACK']

In [10]:
import re

In [12]:
xs = [re.sub(r"[a-zA-Z]", "X", string) for string in strip_whitespace]

In [13]:
xs

['XXXXXX XX XXX XXXXX. XX XXXXX XXXXXX',
 'XXXX XXXXX  .  XX XXXX XXXXXXXXXX',
 'XXXXXXXXXXXX. XX     XXXXXX X. XXXXXX']

**REGEX TUTORIAL**

https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/

### Scraping

In [36]:
import requests
from bs4 import BeautifulSoup

In [37]:
url = 'https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/'

In [38]:
req = requests.get(url)

In [39]:
req

<Response [200]>

In [40]:
soup = BeautifulSoup(req.text)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [41]:
soup

<!DOCTYPE html>
<html><head lang="en-US" prefix="og: http://ogp.me/ns#">
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<link href="//www.google-analytics.com" rel="dns-prefetch"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.4.0/css/font-awesome.min.css" rel="stylesheet prefetch"/>
<link href="https://fonts.googleapis.com/css?family=Roboto" rel="stylesheet"/>
<meta content="width=device-width, initial-scale=1.0, user-scalable=yes" name="viewport"/>
<link href="https://www.analyticsvidhya.com/xmlrpc.php" rel="pingback"/>
<link href="https://www.analyticsvidhya.com/wp-content/uploads/2015/02/logo_square_v2.jpg" id="favicon" rel="icon" type="image/png"/><link href="https://www.analyticsvidhya.com/wp-content/uploads/2015/02/logo_square_v2.jpg" rel="apple-touch-icon"/><link href="https://www.analyticsvidhya.com/wp-content/uploads/2015/02/logo_square_v2.jpg" rel="apple-touch-icon" sizes="76x76"/><link href="https://www.analyticsvidhya.c

In [42]:
soup.find('h2')

<h2 class="site-outline">Learn everything about Analytics</h2>

In [43]:
heads = soup.find_all('h2')

In [44]:
len(heads)

6

### Basic NLP

In [45]:
from nltk.tokenize import word_tokenize

In [46]:
pgraph = soup.find('p').text

In [54]:
tokes = word_tokenize(pgraph)

In [55]:
tokes

['In',
 'last',
 'few',
 'years',
 ',',
 'there',
 'has',
 'been',
 'a',
 'dramatic',
 'shift',
 'in',
 'usage',
 'of',
 'general',
 'purpose',
 'programming',
 'languages',
 'for',
 'data',
 'science',
 'and',
 'machine',
 'learning',
 '.',
 'This',
 'was',
 'not',
 'always',
 'the',
 'case',
 '–',
 'a',
 'decade',
 'back',
 'this',
 'thought',
 'would',
 'have',
 'met',
 'a',
 'lot',
 'of',
 'skeptic',
 'eyes',
 '!']

In [56]:
from nltk.tokenize import sent_tokenize

In [57]:
sent_tokenize(pgraph)[0]

'In last few years, there has been a dramatic shift in usage of general purpose programming languages for data science and machine learning.'

In [58]:
from nltk.corpus import stopwords

In [59]:
stop_words = stopwords.words('english')

In [61]:
[word for word in tokes if word not in stop_words]

['In',
 'last',
 'years',
 ',',
 'dramatic',
 'shift',
 'usage',
 'general',
 'purpose',
 'programming',
 'languages',
 'data',
 'science',
 'machine',
 'learning',
 '.',
 'This',
 'always',
 'case',
 '–',
 'decade',
 'back',
 'thought',
 'would',
 'met',
 'lot',
 'skeptic',
 'eyes',
 '!']

In [62]:
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [63]:
#stemming
from nltk.stem.porter import PorterStemmer

In [64]:
porter = PorterStemmer()

In [66]:
[porter.stem(word) for word in tokes]

['In',
 'last',
 'few',
 'year',
 ',',
 'there',
 'ha',
 'been',
 'a',
 'dramat',
 'shift',
 'in',
 'usag',
 'of',
 'gener',
 'purpos',
 'program',
 'languag',
 'for',
 'data',
 'scienc',
 'and',
 'machin',
 'learn',
 '.',
 'thi',
 'wa',
 'not',
 'alway',
 'the',
 'case',
 '–',
 'a',
 'decad',
 'back',
 'thi',
 'thought',
 'would',
 'have',
 'met',
 'a',
 'lot',
 'of',
 'skeptic',
 'eye',
 '!']

In [67]:
from nltk import pos_tag

In [70]:
text_tagged = pos_tag(tokes)

In [71]:
text_tagged

[('In', 'IN'),
 ('last', 'JJ'),
 ('few', 'JJ'),
 ('years', 'NNS'),
 (',', ','),
 ('there', 'EX'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('a', 'DT'),
 ('dramatic', 'JJ'),
 ('shift', 'NN'),
 ('in', 'IN'),
 ('usage', 'NN'),
 ('of', 'IN'),
 ('general', 'JJ'),
 ('purpose', 'NN'),
 ('programming', 'NN'),
 ('languages', 'NNS'),
 ('for', 'IN'),
 ('data', 'NNS'),
 ('science', 'NN'),
 ('and', 'CC'),
 ('machine', 'NN'),
 ('learning', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('was', 'VBD'),
 ('not', 'RB'),
 ('always', 'RB'),
 ('the', 'DT'),
 ('case', 'NN'),
 ('–', 'VBZ'),
 ('a', 'DT'),
 ('decade', 'NN'),
 ('back', 'RB'),
 ('this', 'DT'),
 ('thought', 'NN'),
 ('would', 'MD'),
 ('have', 'VB'),
 ('met', 'VBN'),
 ('a', 'DT'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('skeptic', 'JJ'),
 ('eyes', 'NNS'),
 ('!', '.')]

In [73]:
[word for word, tag in text_tagged if tag in ['NN', 'NNS']]

['years',
 'shift',
 'usage',
 'purpose',
 'programming',
 'languages',
 'data',
 'science',
 'machine',
 'learning',
 'case',
 'decade',
 'thought',
 'lot',
 'eyes']

In [80]:
tweets = ["we are more worried about what we can lose than what we feel",
         "it's really cool to say I hate you. But it's not cool to say I love you. Love has a stigma",
         "Instead of doing what you feel you just do what other people think you should do"]

In [82]:
tagged_tweets = []
for tweet in tweets:
    tweet_tag = pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])

In [87]:
tagged_tweets[2][:5]

['RB', 'IN', 'VBG', 'WP', 'PRP']

In [88]:
from sklearn.preprocessing import MultiLabelBinarizer

In [89]:
one_hot_multi = MultiLabelBinarizer()

In [90]:
one_hot_multi.fit_transform(tagged_tweets)

array([[0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
       [1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0],
       [0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1]])

In [91]:
one_hot_multi.classes_

array(['.', 'CC', 'DT', 'IN', 'JJ', 'MD', 'NN', 'NNS', 'PRP', 'RB', 'RBR',
       'TO', 'VB', 'VBG', 'VBP', 'VBZ', 'WP'], dtype=object)

### CountVectorizer

In [92]:
import numpy as np

In [94]:
from sklearn.feature_extraction.text import CountVectorizer

In [95]:
text_data = np.array(['I like Cardi B. ', 'Tribeca is a strange place.', ' Germany is where they make volkswagen cars.'])

In [96]:
count = CountVectorizer()


In [97]:
bag_of_words = count.fit_transform(text_data)

In [98]:
bag_of_words

<3x12 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [99]:
bag_of_words.toarray()

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [100]:
count.get_feature_names()

['cardi',
 'cars',
 'germany',
 'is',
 'like',
 'make',
 'place',
 'strange',
 'they',
 'tribeca',
 'volkswagen',
 'where']

In [104]:
count_2gram = CountVectorizer(ngram_range = (1, 2), stop_words="english", 
                             vocabulary=['germany'])

In [105]:
bag = count_2gram.fit_transform(text_data)

In [106]:
bag.toarray()

array([[0],
       [0],
       [1]])

### Tfidf

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

In [109]:
feature_matrix

<3x12 sparse matrix of type '<class 'numpy.float64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [110]:
feature_matrix.toarray()

array([[0.70710678, 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.40204024, 0.        ,
        0.        , 0.52863461, 0.52863461, 0.        , 0.52863461,
        0.        , 0.        ],
       [0.        , 0.38988801, 0.38988801, 0.29651988, 0.        ,
        0.38988801, 0.        , 0.        , 0.38988801, 0.        ,
        0.38988801, 0.38988801]])

In [111]:
tfidf.vocabulary_

{'cardi': 0,
 'cars': 1,
 'germany': 2,
 'is': 3,
 'like': 4,
 'make': 5,
 'place': 6,
 'strange': 7,
 'they': 8,
 'tribeca': 9,
 'volkswagen': 10,
 'where': 11}