# Tagging data using NER

### Importing libraries and loading the spaCy model

In [1]:
import spacy
import pandas as pd

# name of the spaCy pipeline package; its named-entity output (doc.ents) is what the rest of the notebook relies on
model_name = 'en_core_web_sm'
nlp = spacy.load(model_name)

### Defining the organization-extraction function

In [2]:
def get_orgs(text):
    """Return the unique organization names found in ``text``.

    Runs the module-level ``nlp`` pipeline over ``text`` and collects the
    text of every entity whose label is ``ORG``.

    Args:
        text: The string to analyse. Any non-string value (for example the
            ``NaN`` pandas produces for an empty CSV field) yields an
            empty list instead of being passed to the pipeline.

    Returns:
        list: Organization names with duplicates removed, in order of
        first appearance in ``text``.
    """
    # missing values read by pandas arrive as float NaN, which nlp() does not accept
    if not isinstance(text, str):
        return []
    # process the text with our spaCy model to get named entities
    doc = nlp(text)
    # keep only the entities tagged as organizations
    org_names = [entity.text for entity in doc.ents if entity.label_ == 'ORG']
    # an organization mentioned more than once would appear multiple times;
    # dict.fromkeys() removes the repeats and, unlike set(), keeps first-seen
    # order, so the result is identical on every run
    return list(dict.fromkeys(org_names))

### Reading data and applying the organization-extraction function

In [3]:
# the input file is pipe-delimited rather than comma-delimited
posts_path = './data/reddit_investing.csv'
df = pd.read_csv(posts_path, sep='|')
# preview the first rows
df.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,t3_zwixbx,1672156884,investing,Companies that are focusing on desalination?,"Hi All,\n\nIm wondering if anyone out there kn...",0.5,0,0,0
1,t3_zwivno,1672156783,investing,Legendary investor David Tepper revealed his s...,The only stock he bought was Google (GOOG). He...,0.83,4,0,4
2,t3_zwi6qy,1672154900,investing,Investing for roughly two year window.,"Hello, first allow me to say that my wife and ...",0.2,0,0,0
3,t3_zwhson,1672153936,investing,ELI5: How can wash sales be used to actually a...,(Edit: My assumption is that they were abused ...,0.43,0,0,0
4,t3_zwhm6b,1672153434,investing,Chipmakers Struggle With Inventory Buildup On ...,https://finance.yahoo.com/news/chipmakers-stru...,0.89,32,0,32


In [4]:
# run get_orgs over every post body and store the resulting lists in a new column
post_bodies = df['selftext']
df['organizations'] = post_bodies.apply(get_orgs)
df.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations
0,t3_zwixbx,1672156884,investing,Companies that are focusing on desalination?,"Hi All,\n\nIm wondering if anyone out there kn...",0.5,0,0,0,[]
1,t3_zwivno,1672156783,investing,Legendary investor David Tepper revealed his s...,The only stock he bought was Google (GOOG). He...,0.83,4,0,4,[]
2,t3_zwi6qy,1672154900,investing,Investing for roughly two year window.,"Hello, first allow me to say that my wife and ...",0.2,0,0,0,"[DCA, Ammo.com, REIT, SPLG, LETB, Farmland Par..."
3,t3_zwhson,1672153936,investing,ELI5: How can wash sales be used to actually a...,(Edit: My assumption is that they were abused ...,0.43,0,0,0,[]
4,t3_zwhm6b,1672153434,investing,Chipmakers Struggle With Inventory Buildup On ...,https://finance.yahoo.com/news/chipmakers-stru...,0.89,32,0,32,[Micron]


In [5]:
# flatten the per-post organization lists into a single list
orgs = []
for post_orgs in df['organizations'].to_list():
    orgs.extend(post_orgs)
# peek at the first ten entries
orgs[:10]

['DCA',
 'Ammo.com',
 'REIT',
 'SPLG',
 'LETB',
 'Farmland Partners',
 'AI',
 'Ford',
 'Micron',
 'DCA']

### Getting frequency of organizations

In [6]:
from collections import Counter

In [7]:
# tally how often each organization name occurs in the flattened list
org_freq = Counter()
org_freq.update(orgs)

In [8]:
# show the ten most frequent organization names with their counts
org_freq.most_common(n=10)

[('FAQ', 50),
 ('ETF', 38),
 ('Fidelity', 34),
 ('Fed', 30),
 ('VOO', 24),
 ('Vanguard', 21),
 ('treasury', 17),
 ('DCA', 16),
 ('Amazon', 14),
 ('Treasury', 14)]

### Blacklisting unrelated entities

In [9]:
# lowercase names that the model tags as ORG but that we do not want to keep
BLACKLIST = ['ev', 'covid', 'etf', 'nyse', 'sec', 'spac', 'fda']

def get_orgs(text, blacklist=None):
    """Return the unique, non-blacklisted organization names found in ``text``.

    Runs the module-level ``nlp`` pipeline over ``text`` and collects the
    text of every entity labelled ``ORG`` whose lowercased text is not in
    the blacklist.

    Args:
        text: The string to analyse. Any non-string value (for example the
            ``NaN`` pandas produces for an empty CSV field) yields an
            empty list instead of being passed to the pipeline.
        blacklist: Optional iterable of names to exclude, compared
            case-insensitively. Defaults to the module-level ``BLACKLIST``.

    Returns:
        list: Organization names with duplicates removed, in order of
        first appearance in ``text``.
    """
    # missing values read by pandas arrive as float NaN, which nlp() does not accept
    if not isinstance(text, str):
        return []
    if blacklist is None:
        # looked up at call time so later edits to BLACKLIST are honoured
        blacklist = BLACKLIST
    # lowercase both sides so that 'nyse' matches 'NYSE', 'Nyse', ...
    blocked = {name.lower() for name in blacklist}
    doc = nlp(text)
    org_names = [
        entity.text
        for entity in doc.ents
        if entity.label_ == 'ORG' and entity.text.lower() not in blocked
    ]
    # an organization mentioned more than once would appear multiple times;
    # dict.fromkeys() removes the repeats and, unlike set(), keeps first-seen
    # order, so the result is identical on every run
    return list(dict.fromkeys(org_names))

In [10]:
# re-run the extraction with the redefined get_orgs, overwriting the earlier column
post_bodies = df['selftext']
df['organizations'] = post_bodies.apply(get_orgs)
df.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations
0,t3_zwixbx,1672156884,investing,Companies that are focusing on desalination?,"Hi All,\n\nIm wondering if anyone out there kn...",0.5,0,0,0,[]
1,t3_zwivno,1672156783,investing,Legendary investor David Tepper revealed his s...,The only stock he bought was Google (GOOG). He...,0.83,4,0,4,[]
2,t3_zwi6qy,1672154900,investing,Investing for roughly two year window.,"Hello, first allow me to say that my wife and ...",0.2,0,0,0,"[DCA, Ammo.com, REIT, SPLG, LETB, Farmland Par..."
3,t3_zwhson,1672153936,investing,ELI5: How can wash sales be used to actually a...,(Edit: My assumption is that they were abused ...,0.43,0,0,0,[]
4,t3_zwhm6b,1672153434,investing,Chipmakers Struggle With Inventory Buildup On ...,https://finance.yahoo.com/news/chipmakers-stru...,0.89,32,0,32,[Micron]


### Saving data to file

In [12]:
# NOTE: the list-valued 'organizations' column is written as its string representation
output_path = './data/reddit_investing_ner.csv'
df.to_csv(output_path, sep='|', index=False)