In [None]:
# Install spacy on NLU environment
# > conda install -c conda-forge spacy 
# > python -m spacy download en_core_web_sm

In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

# Load the `en_core_web_sm` spaCy pipeline once; `nlp` is the shared, module-level
# pipeline object that the NER cells and helper functions in this notebook call.
nlp = en_core_web_sm.load()

In [8]:
import pandas as pd
import dataset
import vsm
import sst

In [18]:
from pprint import pprint


In [2]:
# Dataset selector codes handed to the project's `dataset` helpers
# (`dataset.dataset_reader` / `dataset.prune_columns`).
# NOTE(review): the numeric values presumably mirror ids defined inside the
# `dataset` module — confirm there before changing them.
TWITTER = 2
TWITTER_AIRLINES = 3
TWITTER_APPLE = 4

------------

## Exploring Apple Dataset

In [5]:
# Read the three Apple splits, then run every split through the same column pruning.
twitter_train, twitter_validate, twitter_test = dataset.dataset_reader(TWITTER_APPLE)
twitter_train, twitter_validate, twitter_test = [
    dataset.prune_columns(TWITTER_APPLE, split)
    for split in (twitter_train, twitter_validate, twitter_test)
]

In [6]:
[ds.shape for ds in [twitter_train, twitter_validate, twitter_test]]

[(3109, 4), (388, 4), (389, 4)]

In [9]:
twitter_ds = pd.concat([twitter_train,twitter_validate,twitter_test],axis=0)

In [10]:
twitter_ds

Unnamed: 0,dataset,tweet_id,text,sentiment
1408,twitter_apple,623496932,RT @huffpostgay: @Apple CEO Time Cook gets a u...,3
3132,twitter_apple,623498662,NO CALLER ID must be able to be BLOCKED on @Ap...,1
1362,twitter_apple,623496886,RT @JPDesloges: iPhone 6 makes iOS a success i...,3
988,twitter_apple,623496506,RT @HamzeiAnalytics: BLOCK TRADE detected in #...,3
1516,twitter_apple,623497040,@OneRepublic @Apple Lmaoo! Someone needs to ge...,3
...,...,...,...,...
3810,twitter_apple,623499340,We hope @Apple doesn't use glass for their new...,1
3812,twitter_apple,623499342,@Apple co-founder Steve Wozniak talks about St...,3
3840,twitter_apple,623499370,Lets go shopping!!! @apple http://t.co/uTFUHuoJIi,5
3847,twitter_apple,623499377,proof @apple does not use it's own products. f...,1


### Exploring single text input

In [16]:
# Pick a single tweet by its index label (6 -> the "#tablets" tweet) to try NER on one input.
text = twitter_ds.loc[6, 'text']
text

'Top 3 all @Apple #tablets. Damn right! http://t.co/RJiGn2JUuB'

In [17]:
doc = nlp(text)
doc

Top 3 all @Apple #tablets. Damn right! http://t.co/RJiGn2JUuB

In [19]:
pprint([(X.text, X.label_) for X in doc.ents])

[('3', 'CARDINAL')]


### Exploring all tweets entities

In [21]:
# Run every tweet through the spaCy pipeline and keep its entities as (text, label) pairs.
twitter_ds['ner_entities'] = twitter_ds['text'].map(
    lambda tweet: [(span.text, span.label_) for span in nlp(tweet).ents]
)

In [22]:
twitter_ds['ner_entities'] 

1408    [(Time Cook, PERSON), (Alabama http://t.co/TDR...
3132                [(BLOCKED, ORG), (#iPhone6 #, MONEY)]
1362    [(6, CARDINAL), (UK, GPE), (Android, ORG), (Ja...
988                                                    []
1516                                                   []
                              ...                        
3810    [(Williamsburg, PERSON), (Sikrikim, PERSON), (...
3812    [(Steve Wozniak, PERSON), (Steve Jobs &amp, OR...
3840                                   [(@apple, PERSON)]
3847                                        [(MAIL, ORG)]
3884                                                   []
Name: ner_entities, Length: 3886, dtype: object

Get only records with PRODUCT labels:

In [27]:
def find_product(ents, tag="PRODUCT"):
    """Return True when at least one entity in ``ents`` carries the label ``tag``.

    Parameters
    ----------
    ents : iterable of (text, label) tuples
        Entities in the shape this notebook builds from spaCy's ``doc.ents``;
        only the second item (the label) of each tuple is inspected.
    tag : str, default "PRODUCT"
        Entity label to look for.

    Returns
    -------
    bool
        ``True`` if any entity label equals ``tag``; ``False`` otherwise,
        including for an empty ``ents``.
    """
    # any() stops at the first match and avoids building a throwaway list
    # (and an intermediate set) purely for its side effect.
    return any(ent[1] == tag for ent in ents)
# test
find_product([('6', 'CARDINAL'), ('UK', 'GPE'), ('Android', 'ORG'), ('x','PRODUCT')])

True

In [31]:
# Keep only the tweets whose entity list contains at least one PRODUCT label.
twitter_products_ds = twitter_ds.loc[twitter_ds['ner_entities'].apply(find_product)]
twitter_products_ds.shape

(60, 5)

In [32]:
twitter_products_ds.head()

Unnamed: 0,dataset,tweet_id,text,sentiment,ner_entities
178,twitter_apple,623495691,None too happy with @Apple Mac OS X (Yosemite)...,1,"[(@Apple Mac OS, ORG), (Yosemite, NORP), (File..."
2355,twitter_apple,623497885,AFP requests to Apple reveal 'get a warrant' p...,3,"[(AFP, ORG), (Apple, ORG), (http://t.co/7zH5iM..."
860,twitter_apple,623496373,#New #foldable #Macbook #Pro falling on your l...,3,"[(Pro, PERSON), (@Apple s, PRODUCT)]"
3069,twitter_apple,623498599,"In the @Apple iPod antitrust case, recent deve...",3,"[(@Apple iPod, ORG), (iPods, PRODUCT), (first,..."
3707,twitter_apple,623499237,CNBCTV: #Cramer's view on #Apple &amp; Tesla ...,3,"[(Cramer, PERSON), (Apple &amp, ORG), (Tesla, ..."


--------------
--------------

## NER + Sentiment

### Get Entities and Product Flag into Sentiment Analysis Results (Twitter Apple Dataset)

In [56]:
apple_sent = pd.read_csv('results/BERTweet_predictions_added_to_twitter_test_apple.csv',index_col=0)

In [57]:
apple_sent.head()

Unnamed: 0,dataset,tweet_id,text,sentiment,BERTweet_sentiment
12,twitter_apple,623495525,"The Best-Designed #iPhone #Apps In the World, ...",3,3
15,twitter_apple,623495528,#aapl @applenws Thanks to the non factual dumb...,3,3
38,twitter_apple,623495551,RT @thehill: Justice Department cites 18th cen...,3,3
43,twitter_apple,623495556,RT @thehill: Justice Department cites 18th cen...,1,3
45,twitter_apple,623495558,RT @thehill: Justice Department cites 18th cen...,3,3


In [58]:
def get_entities(ds_text_col):
    """Extract named entities for every element of a text column.

    Each element of ``ds_text_col`` is passed to the module-level ``nlp``
    pipeline. The returned column holds, per element, a list of
    ``(entity text, entity label)`` tuples taken from ``doc.ents``.
    """
    def _extract(tweet):
        doc = nlp(tweet)
        return [(span.text, span.label_) for span in doc.ents]

    return ds_text_col.apply(_extract)
    
def get_product_flag(ds_entity_col, tag="PRODUCT"):
    """Flag the rows whose entity list contains at least one ``tag`` label.

    Parameters
    ----------
    ds_entity_col : pandas.Series
        Column whose elements are lists of ``(text, label)`` tuples, as
        produced by ``get_entities``.
    tag : str, default "PRODUCT"
        Entity label to look for. Defaults to the label this helper
        always searched for, so existing one-argument calls are unchanged.

    Returns
    -------
    pandas.Series
        One boolean per row (same index as the input); ``False`` for rows
        with an empty entity list.
    """
    def _has_tag(ents):
        # Only the label (second tuple item) matters; any() short-circuits
        # instead of filling a set through a side-effect list comprehension.
        return any(ent[1] == tag for ent in ents)

    return ds_entity_col.apply(_has_tag)

In [59]:
apple_sent['NER_ents'] = get_entities(apple_sent['text'])

In [60]:
apple_sent['NER_prod_flag']  = get_product_flag(apple_sent['NER_ents'])

In [61]:
apple_sent

Unnamed: 0,dataset,tweet_id,text,sentiment,BERTweet_sentiment,NER_ents,NER_prod_flag
12,twitter_apple,623495525,"The Best-Designed #iPhone #Apps In the World, ...",3,3,"[(Apps, PERSON)]",False
15,twitter_apple,623495528,#aapl @applenws Thanks to the non factual dumb...,3,3,"[(aapl @applenws, PERSON), (3, CARDINAL), (one...",False
38,twitter_apple,623495551,RT @thehill: Justice Department cites 18th cen...,3,3,"[(Justice Department, ORG), (18th century, DAT...",False
43,twitter_apple,623495556,RT @thehill: Justice Department cites 18th cen...,1,3,"[(Justice Department, ORG), (18th century, DAT...",False
45,twitter_apple,623495558,RT @thehill: Justice Department cites 18th cen...,3,3,"[(Justice Department, ORG), (18th century, DAT...",False
...,...,...,...,...,...,...,...
3853,twitter_apple,623499383,So @Apple just gave me a huge middle finger to...,1,1,[],False
3869,twitter_apple,623499399,Apple Is Warming Up To Social Media: Apple is ...,3,3,"[(Apple, ORG), (Apple, ORG), (L.A., GPE)]",False
3870,twitter_apple,623499400,Apple Is Warming Up To Social Media: Apple is ...,3,3,"[(Apple, ORG), (Apple, ORG), (L.A., GPE)]",False
3874,twitter_apple,623499404,Apple Is Warming Up To Social Media: Apple is ...,3,3,"[(Apple, ORG), (Apple, ORG), (L.A., GPE)]",False


Get tweets where NER detected PRODUCT tag:

In [63]:
apple_products = apple_sent[apple_sent.NER_prod_flag]
apple_products

Unnamed: 0,dataset,tweet_id,text,sentiment,BERTweet_sentiment,NER_ents,NER_prod_flag
178,twitter_apple,623495691,None too happy with @Apple Mac OS X (Yosemite)...,1,1,"[(@Apple Mac OS, ORG), (Yosemite, NORP), (File...",True
649,twitter_apple,623496162,That would be brilliant! It's sad @Apple doesn...,1,1,"[(@Apple, ORG), (Mozilla, PRODUCT), (@OpenSour...",True
1155,twitter_apple,623496674,@Apple removed songs from iPods without tellin...,1,3,"[(@Apple, ORG), (iPods, PRODUCT)]",True
1790,twitter_apple,623497314,#AAPL:Apple says plaintiffs' iPods not covered...,3,3,"[(AAPL, PERSON), (Apple, ORG), (iPods, PRODUCT)]",True


In [75]:
def get_sent(sent):
    """Translate a sentiment code into a readable label.

    Codes 1, 3 and 5 map to "negative", "neutral" and "positive". The code
    may arrive as a string (``'1'``) or as a number (``1``), depending on how
    pandas parsed the column. Any other value is returned as its stripped
    string form instead of being reported as "positive", so labels that are
    already textual pass through unchanged.
    """
    labels = {'1': "negative", '3': "neutral", '5': "positive"}
    try:
        number = float(sent)
        # Normalise 1, 1.0 and '1' to the same key; leave fractional values alone.
        key = str(int(number)) if number == int(number) else str(sent).strip()
    except (TypeError, ValueError, OverflowError):
        # Non-numeric input (text labels, None, NaN/inf) falls back to its text form.
        key = str(sent).strip()
    return labels.get(key, key)
# Print each product-tagged tweet, prefixed with its gold sentiment rendered as a word.
for sentiment, tweet in apple_products[['sentiment', 'text']].itertuples(index=False, name=None):
    print('[', get_sent(sentiment), '] ->', tweet, '\n--------------------------')

[ negative ] -> None too happy with @Apple Mac OS X (Yosemite), File Vault 2, or Boot Camp right now. Thankful to own a @lenovo. 
--------------------------
[ negative ] -> That would be brilliant! It's sad @Apple doesn't allow Mozilla's better engine @OpenSourceAgent #opensource 
--------------------------
[ negative ] -> @Apple removed songs from iPods without telling customers http://t.co/C29jxFuNcJ 
--------------------------
[ neutral ] -> #AAPL:Apple says plaintiffs' iPods not covered by suit...http://t.co/8V0eYzQFDQ 
--------------------------


-----

### Get Entities and Product Flag into Sentiment Analysis Results (Airline Datasets)

In [82]:
df = pd.read_csv('results/BERTweet_predictions_added_to_twitter_test_airline.csv',index_col=0)
df.head()

Unnamed: 0,dataset,tweet_id,text,sentiment,airline,BERTweet_sentiment
5,twitter_airline,570300767074181121,@VirginAmerica seriously would pay $30 a fligh...,negative,Virgin America,negative
18,twitter_airline,570270684619923457,I ❤️ flying @VirginAmerica. ☺️👍,positive,Virgin America,positive
24,twitter_airline,570256553502068736,@VirginAmerica you guys messed up my seating.....,negative,Virgin America,negative
48,twitter_airline,570010539499393025,@VirginAmerica @ladygaga @carrieunderwood Afte...,neutral,Virgin America,positive
52,twitter_airline,570004391731847169,@VirginAmerica wish you flew out of Atlanta......,neutral,Virgin America,neutral


In [80]:
# Preview the text column cast to str.
# NOTE(review): the cast is not assigned back, so df['text'] itself is unchanged.
# NOTE(review): the output saved under this cell (24258 rows, non-airline tweets)
# does not match the frame loaded above — it looks stale; re-run to refresh it.
df['text'].astype(str)

2       im getting on borderlands and i will kill you ...
3       im coming on borderlands and i will murder you...
7       So I spent a couple of hours doing something f...
14      Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM...
17      I-Hard like me, RARE LONDON DE, HANDSOME 2011,...
                              ...                        
3836    @Apple recruiting luxury executives for iwatch...
3846    RT @TeamCavuto: Protesters stage #DieIn protes...
3858    Apple Is Warming Up To Social Media: Apple is ...
3870    Apple Is Warming Up To Social Media: Apple is ...
3872    Apple Is Warming Up To Social Media: Apple is ...
Name: text, Length: 24258, dtype: object

In [83]:
df['NER_ents'] = get_entities(df['text'])

In [84]:
df['NER_prod_flag']  = get_product_flag(df['NER_ents'])

In [85]:
df

Unnamed: 0,dataset,tweet_id,text,sentiment,airline,BERTweet_sentiment,NER_ents,NER_prod_flag
5,twitter_airline,570300767074181121,@VirginAmerica seriously would pay $30 a fligh...,negative,Virgin America,negative,"[(@VirginAmerica, GPE), (30, MONEY), (VA, ORG)]",False
18,twitter_airline,570270684619923457,I ❤️ flying @VirginAmerica. ☺️👍,positive,Virgin America,positive,"[(@VirginAmerica, GPE), (☺, ORG)]",False
24,twitter_airline,570256553502068736,@VirginAmerica you guys messed up my seating.....,negative,Virgin America,negative,"[(@VirginAmerica, GPE)]",False
48,twitter_airline,570010539499393025,@VirginAmerica @ladygaga @carrieunderwood Afte...,neutral,Virgin America,positive,"[(@VirginAmerica @ladygaga @carrieunderwood, O...",False
52,twitter_airline,570004391731847169,@VirginAmerica wish you flew out of Atlanta......,neutral,Virgin America,neutral,"[(@VirginAmerica, GPE), (Atlanta, GPE)]",False
...,...,...,...,...,...,...,...,...
14595,twitter_airline,569593694963310593,@AmericanAir @ShannonBloom Where's my DM? Wher...,negative,American,neutral,"[(70, MONEY), (50, MONEY), (JFK, PERSON), (tom...",False
14607,twitter_airline,569592270866878464,@AmericanAir i need someone to help me out,neutral,American,negative,[],False
14614,twitter_airline,569591540944756737,@AmericanAir I need to be at work tomorrow at ...,negative,American,negative,"[(tomorrow, DATE), (8am, TIME), (800, CARDINAL...",False
14620,twitter_airline,569590965880532993,@AmericanAir I wait 2+ hrs for CS to call me b...,negative,American,negative,"[(2, CARDINAL), (CS, ORG), (the minute, TIME),...",False


In [88]:
df_products = df[df.NER_prod_flag]
df_products.shape

(40, 8)

In [90]:
# Print each product-tagged airline tweet next to its gold sentiment label.
for sentiment, tweet in df_products[['sentiment', 'text']].itertuples(index=False, name=None):
    print('[', sentiment, '] ->', tweet, '\n--------------------------')

[ neutral ] -> @VirginAmerica Can't bring up my reservation online using Flight Booking Problems code 
--------------------------
[ negative ] -> @VirginAmerica And now the flight Flight Booking Problems site is totally down. Folks, what is the problem? 
--------------------------
[ positive ] -> @VirginAmerica thanks to your outstanding NYC-JFK crew who moved mountains to get me home to San Francisco tonight! 
--------------------------
[ neutral ] -> @VirginAmerica - can you tweet me the Cancelled Flight/chng fee for a flight? or can I rebook under one of your affiliates? If so, who are afiliates? 
--------------------------
[ neutral ] -> @united Left item n the seatback on UA1260. Is there any way to call DCA to ask if they have the item? Already submitted lost &amp; found report 
--------------------------
[ neutral ] -> @united Seat 14A, Flight UA895 
--------------------------
[ neutral ] -> @united any chance you'll ever do CPUs on your JFK-LAX like @AmericanAir? 
-------------