In [None]:
# Install spacy on NLU environment
# > conda install -c conda-forge spacy 
# > python -m spacy download en_core_web_sm

In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

# Load the small English spaCy pipeline; the resulting `nlp` callable is what
# every NER call further down in this notebook goes through.
nlp = en_core_web_sm.load()

In [8]:
import pandas as pd
import dataset
import vsm
import sst

In [18]:
from pprint import pprint


In [2]:
# Integer identifiers for the Twitter datasets. TWITTER_APPLE is the one passed
# to dataset.dataset_reader / dataset.prune_columns below; the other two are not
# used in the cells shown here (presumably they select the other Twitter
# corpora -- confirm against the dataset module).
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

------------

## Exploring Apple Dataset

In [5]:
# Read the three Apple-tweet splits, then run each of them through
# dataset.prune_columns (see the dataset module for which columns it keeps).
twitter_train, twitter_validate, twitter_test = dataset.dataset_reader(TWITTER_APPLE)
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(TWITTER_APPLE, split)
    for split in (twitter_train, twitter_validate, twitter_test)
]

In [6]:
# (rows, columns) of each split, in train / validate / test order.
[split.shape for split in (twitter_train, twitter_validate, twitter_test)]

[(3109, 4), (388, 4), (389, 4)]

In [9]:
# Stack the three splits row-wise into a single frame; no re-indexing is
# requested, so every row keeps the index label it had in its split.
twitter_ds = pd.concat(
    [twitter_train, twitter_validate, twitter_test],
    axis="index",
)

In [10]:
# Display the combined frame (pandas abbreviates the middle rows when rendering).
twitter_ds

Unnamed: 0,dataset,tweet_id,text,sentiment
1408,twitter_apple,623496932,RT @huffpostgay: @Apple CEO Time Cook gets a u...,3
3132,twitter_apple,623498662,NO CALLER ID must be able to be BLOCKED on @Ap...,1
1362,twitter_apple,623496886,RT @JPDesloges: iPhone 6 makes iOS a success i...,3
988,twitter_apple,623496506,RT @HamzeiAnalytics: BLOCK TRADE detected in #...,3
1516,twitter_apple,623497040,@OneRepublic @Apple Lmaoo! Someone needs to ge...,3
...,...,...,...,...
3810,twitter_apple,623499340,We hope @Apple doesn't use glass for their new...,1
3812,twitter_apple,623499342,@Apple co-founder Steve Wozniak talks about St...,3
3840,twitter_apple,623499370,Lets go shopping!!! @apple http://t.co/uTFUHuoJIi,5
3847,twitter_apple,623499377,proof @apple does not use it's own products. f...,1


### Exploring single text input

In [16]:
# Pick a single tweet by its index label to try NER on
# (label 6 is the "#tablets" tweet).
text = twitter_ds.loc[6, 'text']
text

'Top 3 all @Apple #tablets. Damn right! http://t.co/RJiGn2JUuB'

In [17]:
# Run the spaCy pipeline on the selected tweet; echoing the resulting Doc
# renders its text.
doc = nlp(text)
doc

Top 3 all @Apple #tablets. Damn right! http://t.co/RJiGn2JUuB

In [19]:
# List every entity span spaCy found in this tweet together with its label.
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('3', 'CARDINAL')]


### Exploring entities across all tweets

In [21]:
# Run NER over every tweet and store the resulting
# (entity text, entity label) pairs in a new `ner_entities` column.
twitter_ds['ner_entities'] = twitter_ds['text'].apply(
    lambda tweet: [(span.text, span.label_) for span in nlp(tweet).ents]
)

In [22]:
# Inspect the new column: one list of (text, label) tuples per tweet,
# empty when spaCy found no entities.
twitter_ds['ner_entities'] 

1408    [(Time Cook, PERSON), (Alabama http://t.co/TDR...
3132                [(BLOCKED, ORG), (#iPhone6 #, MONEY)]
1362    [(6, CARDINAL), (UK, GPE), (Android, ORG), (Ja...
988                                                    []
1516                                                   []
                              ...                        
3810    [(Williamsburg, PERSON), (Sikrikim, PERSON), (...
3812    [(Steve Wozniak, PERSON), (Steve Jobs &amp, OR...
3840                                   [(@apple, PERSON)]
3847                                        [(MAIL, ORG)]
3884                                                   []
Name: ner_entities, Length: 3886, dtype: object

Get only records with PRODUCT labels:

In [27]:
def find_product(ents, tag="PRODUCT"):
    """Return True if any entity in `ents` carries the label `tag`.

    Args:
        ents: iterable of (entity_text, entity_label) pairs, such as the
            lists stored in the `ner_entities` column.
        tag: entity label to look for; defaults to "PRODUCT".

    Returns:
        bool: True when at least one pair's label equals `tag`, otherwise
        False (an empty `ents` yields False).
    """
    # any() stops at the first match and avoids building a throwaway set with
    # a list comprehension that was used only for its side effects.
    return any(ent[1] == tag for ent in ents)
# Sanity check: the last pair is labelled PRODUCT, so this should evaluate to True.
find_product([('6', 'CARDINAL'), ('UK', 'GPE'), ('Android', 'ORG'), ('x','PRODUCT')])

True

In [31]:
# Keep only the tweets whose entity list contains at least one
# PRODUCT-labelled entity, then report how many rows survive.
twitter_products_ds = twitter_ds.loc[twitter_ds['ner_entities'].apply(find_product)]
twitter_products_ds.shape

(60, 5)

In [32]:
# Preview the first few tweets that contain a PRODUCT entity.
twitter_products_ds.head()

Unnamed: 0,dataset,tweet_id,text,sentiment,ner_entities
178,twitter_apple,623495691,None too happy with @Apple Mac OS X (Yosemite)...,1,"[(@Apple Mac OS, ORG), (Yosemite, NORP), (File..."
2355,twitter_apple,623497885,AFP requests to Apple reveal 'get a warrant' p...,3,"[(AFP, ORG), (Apple, ORG), (http://t.co/7zH5iM..."
860,twitter_apple,623496373,#New #foldable #Macbook #Pro falling on your l...,3,"[(Pro, PERSON), (@Apple s, PRODUCT)]"
3069,twitter_apple,623498599,"In the @Apple iPod antitrust case, recent deve...",3,"[(@Apple iPod, ORG), (iPods, PRODUCT), (first,..."
3707,twitter_apple,623499237,CNBCTV: #Cramer's view on #Apple &amp; Tesla ...,3,"[(Cramer, PERSON), (Apple &amp, ORG), (Tesla, ..."


--------------
--------------

## NER + Sentiment

### Get Entities and Product Flag into Sentiment Analysis Results (Twitter Apple Dataset)

In [44]:
# Load the Apple test tweets that already carry BERT sentiment predictions;
# the first CSV column is used as the row index.
apple_sent = pd.read_csv(
    'results/BERT_predictions_added_to_twitter_test_apple.csv',
    index_col=0,
)

In [46]:
# Preview the loaded predictions (gold `sentiment` next to `BERT_sentiment`).
apple_sent.head()

Unnamed: 0,tweet_id,text,sentiment,BERT_sentiment
22,623495535,@robconeybeer: You need an IP portfolio to def...,3,3
31,623495544,@thehill @Apple i cite the 4th amendment as a ...,3,3
49,623495562,RT @thehill: Justice Department cites 18th cen...,3,3
52,623495565,This one chart explains @tim_cook's affect on ...,3,3
58,623495571,5 Companies Growing Faster Than Apple Inc. htt...,3,3


In [48]:
def get_entities(ds_text_col):
    """Run spaCy NER over a column of texts.

    Args:
        ds_text_col: pandas Series of strings (e.g. a frame's `text` column).

    Returns:
        pandas Series with the same index and name as `ds_text_col` whose
        values are lists of (entity_text, entity_label) tuples, one list per
        input text; a text with no detected entities maps to an empty list.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # faster than invoking nlp() separately on every row.
    entities = [
        [(ent.text, ent.label_) for ent in doc.ents]
        for doc in nlp.pipe(ds_text_col.tolist())
    ]
    # dtype=object keeps each per-text list as a single cell value (and keeps
    # an empty input from defaulting to a float column).
    return pd.Series(
        entities,
        index=ds_text_col.index,
        name=ds_text_col.name,
        dtype=object,
    )
    
def get_product_flag(ds_entity_col, tag="PRODUCT"):
    """Flag the rows whose entity list contains a given label.

    Args:
        ds_entity_col: pandas Series whose values are iterables of
            (entity_text, entity_label) pairs, as produced by get_entities.
        tag: entity label to look for; defaults to "PRODUCT", which matches
            the previous hard-coded behaviour.

    Returns:
        pandas Series with the same index as `ds_entity_col`, holding True
        where at least one pair's label equals `tag` and False otherwise
        (rows with an empty entity list are False).
    """
    def has_tag(ents):
        # any() short-circuits on the first matching label; no temporary set
        # or side-effect-only list comprehension is needed.
        return any(ent[1] == tag for ent in ents)

    return ds_entity_col.apply(has_tag)
    
    

In [49]:
# Attach the per-tweet (entity text, entity label) lists as a new column.
apple_sent['NER_ents'] = get_entities(apple_sent['text'])

In [50]:
# Boolean column: True when the tweet's entity list contains a PRODUCT label.
apple_sent['NER_prod_flag']  = get_product_flag(apple_sent['NER_ents'])

In [51]:
# Display the sentiment results together with the new NER columns
# (pandas abbreviates the middle rows when rendering).
apple_sent

Unnamed: 0,tweet_id,text,sentiment,BERT_sentiment,NER_ents,NER_prod_flag
22,623495535,@robconeybeer: You need an IP portfolio to def...,3,3,"[(IP, ORG), (@Samsung @Apple court, ORG)]",False
31,623495544,@thehill @Apple i cite the 4th amendment as a ...,3,3,"[(4th, ORDINAL)]",False
49,623495562,RT @thehill: Justice Department cites 18th cen...,3,3,"[(Justice Department, ORG), (18th century, DAT...",False
52,623495565,This one chart explains @tim_cook's affect on ...,3,3,"[(one, CARDINAL), (@tim_cook, ORG), (@Apple, O...",False
58,623495571,5 Companies Growing Faster Than Apple Inc. htt...,3,3,"[(5, CARDINAL)]",False
...,...,...,...,...,...,...
3845,623499375,media reports say that @Apple is hiring pros f...,3,3,"[(@Apple, ORG)]",False
3859,623499389,Apple Is Warming Up To Social Media: Apple is ...,3,3,"[(Apple, ORG), (Apple, ORG), (L.A., GPE)]",False
3868,623499398,Apple Is Warming Up To Social Media: Apple is ...,5,3,"[(Apple, ORG), (Apple, ORG), (L.A., GPE)]",False
3870,623499400,Apple Is Warming Up To Social Media: Apple is ...,3,3,"[(Apple, ORG), (Apple, ORG), (L.A., GPE)]",False
