In [25]:
import spacy
import nltk
from tqdm import tqdm

# load spaCy model
nlp = spacy.load('en_core_web_sm')

In [26]:
# convert text to spaCy object
doc_1 = nlp('He is the right man for the position')
doc_2 = nlp('Everyone has the right to freedom of opinion and expression')

In [27]:
doc_3 = nlp('Amazon is working on a device that can read emotions')

In [28]:
[token.text + ": " +token.pos_ for token in doc_3]

['Amazon: PROPN',
 'is: AUX',
 'working: VERB',
 'on: ADP',
 'a: DET',
 'device: NOUN',
 'that: DET',
 'can: VERB',
 'read: VERB',
 'emotions: NOUN']

### Find POS tags using spaCy library

In [29]:
[token.text + ": " +token.pos_ for token in doc_1]

['He: PRON',
 'is: AUX',
 'the: DET',
 'right: ADJ',
 'man: NOUN',
 'for: ADP',
 'the: DET',
 'position: NOUN']

In [30]:
[token.text + ": " +token.pos_ for token in doc_2]

['Everyone: PRON',
 'has: AUX',
 'the: DET',
 'right: NOUN',
 'to: ADP',
 'freedom: NOUN',
 'of: ADP',
 'opinion: NOUN',
 'and: CCONJ',
 'expression: NOUN']

<br>
We can even use NLTK library to find POS tags.

In [31]:
nltk.pos_tag('He is the right man for the position'.split())

[('He', 'PRP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('right', 'JJ'),
 ('man', 'NN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('position', 'NN')]

In [32]:
nltk.pos_tag('Everyone has the right to freedom of opinion and expression'.split())

[('Everyone', 'NN'),
 ('has', 'VBZ'),
 ('the', 'DT'),
 ('right', 'NN'),
 ('to', 'TO'),
 ('freedom', 'VB'),
 ('of', 'IN'),
 ('opinion', 'NN'),
 ('and', 'CC'),
 ('expression', 'NN')]

In [33]:
import re
import pandas as pd
pd.set_option('display.max_colwidth', 300)

# load twitter data
tweets = pd.read_csv("tweets.csv")

In [34]:
def cleaner(x):
    x = re.sub(r'http\S+', '', x) # remove URLs
    x = re.sub(r'[%$#@&)(]', '', x) # remove special characters
    x = re.sub(r'[.,:;]', '', x)
    return x    

In [35]:
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [36]:
tweets['clean_tweet'] = tweets['tweet'].apply(lambda x: cleaner(x))

In [37]:
tweets.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint Pregnancy Test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,Finally a transparant silicon case ^^ Thanks to my uncle yay Sony Xperia S sonyexperias…
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,We love this! Would you go? talk makememories unplug relax iphone smartphone wifi connect
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,I'm wired I know I'm George I was made that way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,What amazing service! Apple won't even talk to me about a question I have unless I pay them 1995 for their stupid support!


In [38]:
tweets['clean_tweet'][0]

'fingerprint Pregnancy Test  android apps beautiful cute health igers iphoneonly iphonesia iphone'

### Feature Creation

We create count features from the POS tags. For example, how many verbs are there in a sentence or how many adjectives are there.

In [39]:
noun = 0
pron = 0
adj = 0
verb = 0

# find count of nouns, pronouns, adjectives and verbs in a tweet
for token in nlp(tweets['clean_tweet'][0]):
    if token.pos_ == "PROPN" or token.pos_ == "NOUN":
        noun+=1
    elif token.pos_ == "PRON":
        pron+=1
    elif token.pos_ == "ADJ":
        adj+=1
    elif token.pos_ == "VERB":
        verb+=1

In [40]:
noun, pron, adj, verb

(9, 0, 2, 0)

In [41]:
# cross check
[token.text + ": " +token.pos_ for token in nlp(tweets['clean_tweet'][0])]

['fingerprint: NOUN',
 'Pregnancy: PROPN',
 'Test: PROPN',
 ' : SPACE',
 'android: NOUN',
 'apps: NOUN',
 'beautiful: ADJ',
 'cute: ADJ',
 'health: NOUN',
 'igers: NOUN',
 'iphoneonly: ADV',
 'iphonesia: PROPN',
 'iphone: PROPN']