# Class 14 - Starter Code

Natural Language Processing and Topic Modeling

In [None]:
# spacy is used for pre-processing and traditional NLP
import spacy
from spacy.en import English
# Create the spacy English object once; the cells below call it on each tweet.
nlp_toolkit = English()

# Gensim is used for LDA and word2vec
from gensim.models.word2vec import Word2Vec

# Twitter Lab

In this exercise, we will compare some of the classical NLP tools from the last class with these more modern latent variable techniques.  We will do this by comparing information extraction on Twitter using two different methods.

> NOTE:  There is a pre-existing file of captured tweets you can use.  It is located in the class repo for lesson-14.  However, you can also collect your own tweets following the instructions in twitter-instructions.md.

In [None]:
# Loading the twitter data
# The file is read inside a `with` block so the handle is closed as soon as
# every line has been decoded (a bare `open(...)` in the comprehension left it
# open for the rest of the session).
with open('../../assets/dataset/captured-tweets.txt', 'r') as tweet_file:
    # Each line of the file becomes one entry of `tweets`. Bytes that cannot be
    # decoded are dropped (errors='ignore') instead of raising an error.
    tweets = [unicode(line, errors='ignore') for line in tweet_file]

# Part 1: Using `spacy`

Use `spacy` to write a function to filter tweets down to those where Google is announcing a product. How might we do this? One way is to identify the verbs in tweets where 'Google' is the noun and there is some action like 'announcing'.

In [None]:
# Parse every tweet with spacy; results are kept in the same order as `tweets`.
parsed_tweets = [nlp_toolkit(raw_tweet) for raw_tweet in tweets]

### 1.a
Write a function that can take a sentence parsed by spacy and identify if it mentions a company named 'Google'. Remember, spacy can find entities and code them as ORG if they are a company.

### 1.b
BONUS: Make this function work for any company.

Hint: https://spacy.io/docs#examples-entities

In [None]:
# Exercise 1.a / 1.b: decide whether a sentence parsed by `spacy` mentions a
# company.
#
# Arguments:
#   parsed  -- a sentence already parsed by `spacy`; its `.ents` are scanned.
#   company -- the company name to look for (defaults to 'Google').
#
# Returns True as soon as one entity satisfies the condition you fill in below,
# and False when no entity does.
# Remember, `spacy` can find entities and code them `ORG` if they are a company.
def mentions_company(parsed, company='Google'):
    for entity in parsed.ents:
        # Replace the placeholder with a test on `entity` (and `company`).
        if ### FILL IN ###
            return True
    return False

In [None]:
# Try `mentions_company` on the parsed tweets.
# Every tweet flagged as mentioning Google is printed; the scan stops right
# after printing the first match whose index is greater than 10.
for tweet_index, doc in enumerate(parsed_tweets):
    if not mentions_company(doc, 'Google'):
        continue
    print(doc)
    if tweet_index > 10:
        break

### 1.c
Write a function that can take a sentence parsed by spacy and return the verbs of the sentence (preferably lemmatized).

Hint: https://spacy.io/docs#examples-pos-tags

In [None]:
# Exercise 1.c: collect the verbs of a sentence parsed by `spacy`.
#
# `parsed` is iterated element by element and the lemma (`lemma_`) of every
# element that passes the condition you fill in is kept, so the returned list
# is already lemmatized -- the missing condition only has to select the verbs.
def get_actions(parsed):
    actions = [el.lemma_ 
                for el in parsed 
                if ### FILL IN ###
               ]
    return actions

In [None]:
# Try `get_actions` on the first twelve parsed tweets (indices 0 through 11).
for doc in parsed_tweets[:12]:
    print(get_actions(doc))

### 1.d

For each tweet that mentions Google, parse it using spacy and print it out if the tweet has 'release' or 'announce' as a verb.

In [None]:
# Exercise 1.d: narrow this loop to tweets that mention Google and have
# 'release' or 'announce' as a verb. Until the placeholder is replaced with a
# filter, every parsed tweet is printed.
for i, parsed_tweet in enumerate(parsed_tweets):
    ### FILL IN ###
    print(parsed_tweet)

### 1.e
Write a function that identifies countries.  HINT: the entity label for countries is GPE (or "GeoPolitical Entity").

Hint: https://spacy.io/docs#annotation-ner

In [None]:
# Exercise 1.e: decide whether a sentence parsed by `spacy` mentions a country.
#
# Arguments:
#   parsed  -- a sentence already parsed by `spacy`; its `.ents` are scanned.
#   country -- the country name to look for, e.g. 'Iran'.
#
# Returns True as soon as one entity satisfies the condition you fill in below,
# and False when no entity does.
# HINT: the entity label for countries is GPE (or GeoPolitical Entity).
def mentions_country(parsed, country):
    for entity in parsed.ents:
        # Replace the placeholder with a test on `entity` (and `country`).
        if ### FILL IN ###
            return True
    return False

In [None]:
# Try `mentions_country` on the parsed tweets.
# Every tweet flagged as mentioning Iran is printed; the scan stops right
# after printing the first match whose index is greater than 1000.
for tweet_index, doc in enumerate(parsed_tweets):
    if not mentions_country(doc, 'Iran'):
        continue
    print(doc)
    if tweet_index > 1000:
        break

### 1.f
Re-run the search to find tweets that mention the country 'Iran' and have 'announce' or 'release' as a verb.

In [None]:
# Exercise 1.f: narrow this loop to tweets that mention the country 'Iran' and
# discuss announcing or releasing. Until the placeholder is replaced with a
# filter, every parsed tweet is printed.
for i, parsed_tweet in enumerate(parsed_tweets):
    ### FILL IN ###
    print(parsed_tweet)

# Part 2: Using `gensim`

Build a `word2vec` model of the tweets we have collected using `gensim`.

### 2.a
First take the collection of tweets and tokenize them using spacy.
Think about how this should be done. 
Should you only use upper-case or lower-case? 
Should you remove punctuation or symbols?

In [None]:
# Lemmatize the verbs for easier searching and keep symbols and punctuation
def _tweet_tokens(text):
    """Return the tokens of `text` as strings, with verbs replaced by their lemma."""
    return [token.lemma_ if token.pos == spacy.parts_of_speech.VERB else token.text
            for token in nlp_toolkit(text)]


# One list of token strings per tweet, in the same order as `tweets`.
split_tweets = [_tweet_tokens(raw_tweet) for raw_tweet in tweets]

In [None]:
# Show the first raw tweet for comparison with its tokenized form.
print(tweets[0])

In [None]:
# Show the token list produced for the first tweet.
print(split_tweets[0])

### 2.b
Build a word2vec model.  
Test the window size as well - this is how many surrounding words need to be used to model a word. What do you think is appropriate for Twitter? 


In [None]:
# Build a `word2vec` model from the tokenized tweets.
# Settings are listed one per line so they are easy to change while
# experimenting; see the gensim Word2Vec documentation for `size`,
# `min_count` and `workers`.
model = Word2Vec(
    split_tweets,
    size=100,
    window=4,  # how many surrounding words are used to model a word
    min_count=5,
    workers=4,
)

### 2.c
Test your word2vec model with a few similarity functions.  
Find words similar to 'Syria'.  
Find words similar to 'war'.  
Find words similar to 'Iran'.  
Find words similar to 'Verizon'. 

In [None]:
# 2.c: find words similar to 'Verizon'. The result is the cell's output.
model.most_similar(positive=['Verizon'])

In [None]:
### FILL IN ###

In [None]:
### FILL IN ###

In [None]:
### FILL IN ###

In [None]:
# Query with two positive words at once: 'war' and 'Iraq'.
model.most_similar(positive=['war', 'Iraq'])

# Part 3: Comparing `spacy` and `gensim`
Filter tweets to those that mention 'Iran' or similar entities and 'war' or similar entities.

### 3.a
Using `spacy`

In [None]:
# Using spacy
# Print tweets that name Iran or Iraq as a country AND are about conflict:
# either 'attack' is one of the tweet's actions or the word 'war' appears.
for parsed_tweet in parsed_tweets:
    names_country = (mentions_country(parsed_tweet, 'Iran')
                     or mentions_country(parsed_tweet, 'Iraq'))
    if not names_country:
        continue
    # Compare whole tokens rather than searching the raw text: the substring
    # test `'war' in parsed_tweet.text` also fires on words such as 'award',
    # 'software' or 'warning'. Matching on the lowercased lemma keeps 'War'
    # and 'wars' as hits.
    says_war = any(token.lemma_.lower() == 'war' for token in parsed_tweet)
    if 'attack' in get_actions(parsed_tweet) or says_war:
        print(parsed_tweet)

### 3.b
Using `gensim`

In [None]:
# Using gensim
def _best_similarity(word, tokens):
    """Return the highest model similarity between `word` and any token.

    Tokens missing from the model's vocabulary are skipped; the result is 0
    when no token produces a larger score.
    """
    scores = [model.similarity(word, token)
              for token in tokens
              if token in model.vocab]
    scores.append(0)  # fallback so max() always has something to compare
    return max(scores)


# Print tweets containing a token that is (almost) identical to 'Iran' and
# another that is (almost) identical to 'war' according to the model.
for split_tweet in split_tweets:
    similarity_to_iran = _best_similarity('Iran', split_tweet)
    similarity_to_war = _best_similarity('war', split_tweet)
    if similarity_to_iran > 0.999 and similarity_to_war > 0.999:
        print((similarity_to_iran, similarity_to_war))
        print(' '.join(split_tweet))

# Part 4: [Bonus] Your Own Analysis
Build your own analysis using the above twitter data.
Alternatively, collect your own tweets to analyze following the instructions in `twitter-instructions.md`