In [16]:
import re
import numpy as np
import pandas as pd
import difflib
import transformers
import spacy
from transformers import BertTokenizerFast

In [17]:
# Load the three input tables. Every file is read with its first row as the
# header and without promoting any column to the index.
df_labels = pd.read_csv(
    "dslim/nasdaq_labeled_companies_holdings.csv",
    header=0,
    index_col=False,
)
df_all_extracted_companies = pd.read_csv(
    "dslim/all_companies_nasdaq.csv",
    header=0,
    index_col=False,
)
df_news = pd.read_csv(
    "dslim/nasdaq_labeled_news.csv",
    header=0,
    index_col=False,
)

In [18]:
# spaCy pipeline; handed to the matching helpers through their `nlp` parameter.
nlp = spacy.load("en_core_web_sm")
# Module-level explanation of why the most recent match was accepted. The
# matching helpers overwrite it via `global COMMENT` when they succeed.
COMMENT = ""

In [19]:
# Tokenizer whose vocabulary the matching helpers consult (via `get_vocab()`)
# to decide whether a word is "unique", i.e. absent from the vocabulary.
tokenizer = BertTokenizerFast.from_pretrained("dslim/bert-base-NER")

In [20]:
# Lower-case keywords checked by `is_valid_match`: when a word of the real name
# contains one of these, the extracted name must contain it as well.
feature_words = ["america","bank","health"]

In [21]:
def is_valid_match(real_str, extracted_str):
    """Check that every word of ``real_str`` occurs inside ``extracted_str``.

    ``real_str`` is title-cased, stripped of hyphens and split on single
    spaces. The match is rejected as soon as a word

    * contains one of the module-level ``feature_words`` that does not occur
      in ``extracted_str`` (both compared in lower case), or
    * is not a case-sensitive substring of ``extracted_str``.

    On success the module-level ``COMMENT`` is overwritten with a note
    describing the match; on failure it is left untouched.

    Args:
        real_str: Reference company name.
        extracted_str: Candidate company name to compare against.

    Returns:
        bool: True when every word passes both checks, otherwise False.
    """
    global COMMENT
    haystack_lower = extracted_str.lower()
    for token in real_str.title().replace("-", "").split(" "):
        token_lower = token.lower()
        for keyword in feature_words:
            # A feature keyword present in the real name must not be lost.
            if keyword in token_lower and keyword not in haystack_lower:
                return False
        if token not in extracted_str:
            return False
    COMMENT = "Vaild words match"
    return True

In [22]:
def is_also_proper(real_str, extracted_str,nlp):
    """Decide a match from the first distinctive proper noun in ``real_str``.

    ``real_str`` is parsed with ``nlp``. The first token that is tagged
    ``PROPN``, is longer than three characters and is not a key of the
    tokenizer vocabulary decides the result: True if its text occurs in the
    hyphen-stripped, title-cased ``extracted_str``, False otherwise. Tokens
    that do not meet all three criteria are skipped.

    On a positive result the module-level ``COMMENT`` is overwritten.

    Args:
        real_str: Reference company name.
        extracted_str: Candidate company name.
        nlp: Callable returning an iterable of tokens exposing ``pos_`` and
            ``text`` (a spaCy pipeline in this notebook).

    Returns:
        bool: True when the deciding proper noun is found in the candidate;
        False when it is missing or when no token qualifies.
    """
    global COMMENT
    normalized_extracted = extracted_str.replace("-", "").title()
    # Fetched lazily and at most once per call: the original re-evaluated
    # tokenizer.get_vocab() for every token (up to twice per token).
    vocab = None
    for tok in nlp(real_str):
        if tok.pos_ != 'PROPN' or len(tok.text) <= 3:
            continue
        if vocab is None:
            vocab = tokenizer.get_vocab()
        if tok.text in vocab:
            continue
        # First distinctive proper noun decides the outcome either way.
        if tok.text in normalized_extracted:
            COMMENT = "Both companies have same proper name"
            return True
        return False
    return False

In [23]:
def is_also_person(real_str, extracted_str,nlp):
    """Check whether a person name found in ``real_str`` occurs in ``extracted_str``.

    ``real_str`` is parsed with ``nlp``; the function returns True for the
    first entity labelled ``PERSON`` whose text is longer than two characters
    and is a case-sensitive substring of ``extracted_str``. In that case the
    module-level ``COMMENT`` is overwritten.

    Args:
        real_str: Reference company name.
        extracted_str: Candidate company name.
        nlp: Callable returning an object with an ``ents`` iterable whose
            items expose ``label_`` and ``text`` (a spaCy pipeline here).

    Returns:
        bool: True if such an entity exists, otherwise False.
    """
    global COMMENT
    for ent in nlp(real_str).ents:
        # The original tested len(ent.label > 2), i.e. len() of a bool, which
        # raises TypeError as soon as a PERSON entity is found in the
        # candidate; the length check belongs on the entity text.
        if ent.label_ == 'PERSON' and len(ent.text) > 2 and ent.text in extracted_str:
            COMMENT = "Both companies have same person name"
            return True
    return False

In [24]:
def is_both_unique(real_str, extracted_str):
    """Check whether both names share a word that is not in the tokenizer vocabulary.

    Candidate words are the space-separated words of ``real_str``, first with
    hyphens removed and then with hyphens kept. A word counts when it is a
    case-sensitive substring of ``extracted_str`` and is not a key of
    ``tokenizer.get_vocab()``. When both names consist of a single word
    (after removing hyphens) the function returns False without looking at
    the vocabulary.

    On success the module-level ``COMMENT`` is overwritten.

    Args:
        real_str: Reference company name.
        extracted_str: Candidate company name.

    Returns:
        bool: True if a shared out-of-vocabulary word exists, else False.
    """
    global COMMENT
    dehyphenated_words = real_str.replace("-", "").split(" ")
    if len(dehyphenated_words) == 1 and len(extracted_str.replace("-", "").split(" ")) == 1:
        return False  # single word on both sides is not considered valid
    vocab = None  # fetched lazily, at most once per call
    for word in dehyphenated_words + real_str.split(" "):
        # Empty tokens (from leading/trailing/double spaces) are substrings of
        # every string and never in the vocabulary, so they used to produce a
        # spurious "unique word" match; skip them.
        if not word or word not in extracted_str:
            continue
        if vocab is None:
            vocab = tokenizer.get_vocab()
        if word not in vocab:
            COMMENT = "Both companies have unique word"
            return True
    return False

In [25]:
def is_extracted_contain_real(real_str, extracted_str):
    """Check whether every word of ``real_str`` occurs in ``extracted_str``.

    ``real_str`` is lower-cased and split on spaces and hyphens; each
    resulting word must be a substring of the lower-cased ``extracted_str``.

    Args:
        real_str: Reference company name.
        extracted_str: Candidate company name.

    Returns:
        bool: True if ``real_str`` has at least one word and all of its words
        are found in ``extracted_str``; False otherwise.
    """
    words = [word for word in real_str.replace("-", " ").lower().split(" ") if word]
    # A name without any word used to match every candidate vacuously
    # (the empty string is a substring of everything); reject it instead.
    if not words:
        return False
    extracted_lower = extracted_str.lower()
    return all(word in extracted_lower for word in words)

In [26]:
def is_arconym(real_str, extracted_str):
    """Check whether ``real_str`` is the acronym of the words in ``extracted_str``.

    ``extracted_str`` is lower-cased and split on spaces and hyphens. Each
    character of the lower-cased ``real_str`` must equal the first letter of
    the word at the same position. If the number of characters and words
    differs, the word "group" is appended once before comparing, so an
    acronym may end with an implicit "g".

    On success the module-level ``COMMENT`` is overwritten.

    Args:
        real_str: Candidate acronym.
        extracted_str: Candidate expanded name.

    Returns:
        bool: True if ``real_str`` spells the initials of ``extracted_str``
        (optionally followed by "g" for "group"); False otherwise, including
        when either input contributes no characters/words.
    """
    global COMMENT
    company = real_str.lower()
    extracted = [word for word in extracted_str.replace("-", " ").lower().split(" ") if word]
    # Without this guard an empty candidate gained the implicit "group" word
    # (so "g" matched ""), and two empty strings matched each other.
    if not company or not extracted:
        return False
    if len(company) != len(extracted):
        extracted.append("group")
    if len(company) != len(extracted):
        return False
    if any(letter != word[0] for letter, word in zip(company, extracted)):
        return False
    COMMENT = "Company is acronym"
    return True

True
0.8
0.8235294117647058
Walmart
Name    55729
dtype: int64
Unnamed: 0    55729
Name          55729
dtype: int64
True
True
Company is acronym
True


In [32]:
# Matching: for every labelled company of an article, pick the best candidate
# among the companies extracted from the same article, then record accepted
# pairs together with the score, the reason (COMMENT) and the article text.
RESULT_COLUMNS = ['Real company','Extracted company','Metrix','Comment','Article']
# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with `res_df.loc[n] = ...` copies data on every insert.
matched_rows = []

found_num = 0
not_found_num = 0
total = 0

labeled_companies = df_labels['Name']
news = df_news['News']

ARTICLES_NUM = len(labeled_companies)
extracted_companies = df_all_extracted_companies['Name']
perfect_companies = 0
contain_real_companies = 0

for idx in range(ARTICLES_NUM):
    # Each cell holds the tab-separated company names of one article.
    labeled = labeled_companies[idx].split('\t')
    extracted = extracted_companies[idx].split('\t')
    total += len(labeled)
    for labeled_elem in labeled:
        max_metrix = 0.0
        max_elem = ""  # reset per label so no candidate leaks from a previous one
        COMMENT = ""
        # Normalised label is loop-invariant for the candidate loop below.
        labeled_str = labeled_elem.replace(" ","").replace("-","").lower()
        for extracted_elem in extracted:
            extracted_str = extracted_elem.replace(" ","").replace("-","").lower()
            metrix = difflib.SequenceMatcher(None,labeled_str,extracted_str).ratio()

            if metrix == 1.0:
                max_elem = extracted_elem
                max_metrix = 1.0
                # Drop any note left by a helper while scoring an earlier candidate.
                COMMENT = ""
                perfect_companies += 1
                break
            if is_extracted_contain_real(labeled_elem,extracted_elem):
                max_elem = extracted_elem
                max_metrix = 1.0
                contain_real_companies += 1
                COMMENT = "Extracted company contain real"
                break
            # Heuristic match: the helpers set COMMENT themselves on success.
            if (metrix > 0.3 and len(extracted_str) > 2 and len(labeled_str) > 2
                    and (is_valid_match(labeled_elem,extracted_elem)
                         or is_valid_match(extracted_elem,labeled_elem)
                         or is_also_proper(labeled_elem,extracted_elem,nlp)
                         or is_both_unique(labeled_elem,extracted_elem))):
                max_metrix = 0.95
                max_elem = extracted_elem
            if metrix > max_metrix:
                max_metrix = metrix
                max_elem = extracted_elem
                COMMENT = ""
            # Only let an acronym hit raise the score. Previously it overwrote
            # any better score (e.g. a 0.95 heuristic match) with 0.8, which
            # then failed the acceptance test below and discarded the match.
            # NOTE(review): an acronym-only score of 0.8 still does not pass the
            # strict `> 0.8` acceptance check, so such matches are never
            # recorded - confirm whether that threshold is intended.
            if max_metrix < 0.8 and (is_arconym(labeled_elem,extracted_elem) or is_arconym(extracted_elem,labeled_elem)):
                max_metrix = 0.8
                max_elem = extracted_elem
        if (max_metrix > 0.95) or (max_metrix > 0.8 and len(labeled_elem) > 2 and len(max_elem) > 2):
            matched_rows.append([labeled_elem,max_elem,max_metrix,COMMENT,news[idx]])
            found_num += 1
        else:
            not_found_num += 1

res_df = pd.DataFrame(matched_rows, columns=RESULT_COLUMNS)

print("not found", not_found_num)
print("found", found_num)
print("total",total)
print("found_res = ",found_num/total)
print("perfect_res = ",perfect_companies/total)
print("contained_res = ",contain_real_companies/total)
print(res_df.count())
res_df = res_df.drop_duplicates(subset=['Real company','Extracted company'])
print(res_df.count())
res_df.to_csv("results/extracted_res_with_alice_improved.csv",index=False)


Sentinelone + ['Sprint Nextel', 'Clearwire', 'Att', 'Sprint Nextel', '', 'Wall Street Journal', 'Sprint', 'Journal', 'Clearwire', 'Sprint', 'Sprint', 'Att', 'Att', 'Sprint']  best:  Sprint Nextel  :idx =  12
Discover Financial Services + ['Capital One Financial', 'Jpmorgan Chase', 'Bank America', 'Bank America', 'Bank America', 'Citigroup', 'Citigroup', 'Citigroup', 'American Express', 'American Express', 'American Express', 'Capital One Financial', 'Co', 'Capital One', 'Capital One', 'Capital One', 'Jpmorgan Chase', 'Jpm', 'Bank America', 'Bac', 'Citigroup', '', 'American Express', 'Disco', 'Dfs', 'Capital One', 'Masterc', 'Ma', 'Capital One', 'New York Stock Exchange']  best:  Dfs  :idx =  21
Visa + ['Capital One Financial', 'Jpmorgan Chase', 'Bank America', 'Bank America', 'Bank America', 'Citigroup', 'Citigroup', 'Citigroup', 'American Express', 'American Express', 'American Express', 'Capital One Financial', 'Co', 'Capital One', 'Capital One', 'Capital One', 'Jpmorgan Chase', 'Jpm

KeyboardInterrupt: 

In [None]:
# USING conll2003 pretrained
# result - 59% for pretrained, 1000 companies
# 58.5% for 10000 companies

# after handling
# 63.5% for pretrained, 1000
#

# after removing long articles -
# 0.792 for 1000 articles
# 0.877 for all articles
# after removing "&" - 0.83295
# "holdings" and "group" removal: - 0.8382
# 0.81 - spacy, but probably more false positives

In [None]:
# only nasdaq:
# 93.7% - bert -> 94.18%
# 90.9% - spacy
# 52.3% - nltk
# 95% - flair

In [None]:
#add  to normalization:
#if(elem.find(" ")!=-1 and extracted_elem.find(" ")!=-1):# delete words except first with len < 3 if both companies have more than 1 word
#                labeled_elem = re.sub(r' \w{1,2}\b', '', labeled_elem).strip()
#                extracted_elem = re.sub(r' \w{1,2}\b', '', extracted_elem).strip()