In [1]:
import re
import numpy as np
import pandas as pd
import difflib
import transformers
import spacy
from transformers import BertTokenizerFast

In [2]:
commonly_used_words = ["america","american","taiwan","mexico","china","health","healthcare","pharma","pharmaceutical","us","partners","united states","managment","mosaic",
                       "financial","bank","companies","services","products","brands","air","airlines","international","asset","equity","fund","group","resourses","technologies","hotels","control","controls","black","green","natural","steel","motor",
                       "general","resourses","electric","payments","home","world","union","credit","business","public","shipping","capital","express","royal","mobile","microelectronics"
                       "first","exchange","block","united","energy","national","realty","york","titan","community","skin","foods","industrial","iron","paper","crown","petroleum","jewelers"]

In [19]:
df_labels = pd.read_csv("with_keywords/nasdaq_labeled_companies_holdings.csv",index_col=False,header=0)
df_all_extracted_companies = pd.read_csv("with_keywords/all_companies_nasdaq.csv",index_col=False,header=0)
df_news = pd.read_csv("with_keywords/nasdaq_news.csv",index_col=False,header=0)

In [4]:
nlp = spacy.load("en_core_web_sm")
COMMENT = ""

In [5]:
tokenizer = transformers.BertTokenizerFast.from_pretrained("dslim/bert-base-NER")

In [6]:
print(commonly_used_words[0])
print("brands" in commonly_used_words)

america
True


In [7]:
def is_valid_match(real_str, extracted_str):
    global COMMENT
    
    all_real_words_is_common = True
    all_extracted_words_is_common = True
    real_words = real_str.lower().replace("-"," ").split(" ")#for walmart and other
    extracted_words = extracted_str.lower().replace("-"," ").split(" ")
    
    
    for word in real_words:
        if(word not in commonly_used_words and len(word) > 3):
            all_real_words_is_common = False
            
    if(all_real_words_is_common):
        COMMENT = "all words in real is commonly used, only full match avaliable"
        return False
    
    
    for word in extracted_words:
        if(word not in commonly_used_words and len(word) > 3):
            all_extracted_words_is_common = False
    if(all_extracted_words_is_common):
        COMMENT = "all words in extracted company is commonly used, only full match avaliable"
        return False
    
    if(len(real_words) > 1 and len(extracted_words) > 1 and real_str.replace(" ","").find(extracted_str)!=-1 ):
        COMMENT = "Vaild words match"
        return True
    
    for word in extracted_words:
        if(word not in real_words):
            return False
    COMMENT = "Vaild words match"
    return True

In [8]:
def is_also_proper(real_str, extracted_str,nlp):
    global COMMENT
    doc = nlp(real_str)
    
    for tok in doc: 
        if(tok.pos_ == 'PROPN' and extracted_str.replace("-","").title().find(tok.text)==-1 and tok.text not in tokenizer.get_vocab() and len(tok.text)>3 and tok.text.lower not in commonly_used_words):
            return False
        if(not tok.text.isupper() and tok.text.lower() not in commonly_used_words and tok.pos_ == 'PROPN' and extracted_str.replace("-","").title().find(tok.text)!=-1 and len(tok.text)>3):
            COMMENT = "Both companies have same proper name"
            return True
    return False

In [9]:
def is_also_person(real_str, extracted_str,nlp):
    global COMMENT
    doc = nlp(real_str)
    for ent in doc.ents: 
        if(ent.label_ == 'PERSON' and extracted_str.find(ent.text)!=-1 and len(ent.label > 2)):
            COMMENT = "Both companies have same person name"
            return True
    return False

In [10]:
def is_both_unique(real_str, extracted_str):#check if organisation name contains unique word
    global COMMENT
    real_words = real_str.replace("-","").split(" ")
    extracted_words = extracted_str.replace("-","").split(" ")
    if(len(real_words)==1 and len(extracted_str.replace("-","").split(" "))==1):#one subword - not valid
        return False
    for word in real_words:
        if(word in extracted_words and word not in tokenizer.get_vocab() and word.lower() not in commonly_used_words):
            COMMENT = "Both companies have unique word"
            return True
    words = real_str.split(" ")
    for word in words:
        if(word in extracted_words and word not in tokenizer.get_vocab() and word.lower() not in commonly_used_words):
            COMMENT = "Both companies have unique word"
            return True
    return False

In [11]:
def is_extracted_contain_real(real_str, extracted_str):#check if organisation name contains person name
    words = real_str.replace("-"," ").lower().split(" ")
    extracted_words = extracted_str.replace("-"," ").lower().split(" ")
    for word in words:
        if(word not in extracted_words):
            return False
    return True

In [12]:
def is_abbreviation(real_str, extracted_str):
    global COMMENT
    company = real_str.lower()
    extracted = extracted_str.replace("-"," ").lower().split(" ")
    extracted = [elem for elem in extracted if len(elem)!=0]
    if(len(company)!=len(extracted)): 
        extracted.append("group")
    if(len(company)!=len(extracted)):
        return False
    for idx in range(len(extracted)):
        if(company[idx]!=extracted[idx][0]):
            return False
    COMMENT = "Company is abbreviation"
    return True

In [13]:
print(is_also_proper("Pacific Gas Electric","El Paso Electric",nlp))

False


In [14]:
print("Nemours" in tokenizer.get_vocab())
print(is_valid_match("Yum! Brands","Jerry Dean"))
print(is_also_proper("BNY Mellon","Bank of New York Mellon",nlp))
print(is_both_unique("Yum Brands","Conagra Brands"))
print(is_both_unique("Sl Green Realty","Slg"))
print(is_both_unique("Dupont De Nemours","Ei Du Pont De Nemours"))
print(is_valid_match("Jp Morgan Chase","Jpmorgan"))
print(is_valid_match("Bank America","Bank"))
print(is_valid_match("Freeport-Mcmoran","Mcmoran"))
print(is_valid_match("Jp Morgan Chase","Jpmorgan"))
print(difflib.SequenceMatcher(None,"Sl Green Realty","Green Realty").ratio())
print(difflib.SequenceMatcher(None,"Manpowergroup","Manpower").ratio())

False
False
True
False
False
True
False
False
True
False
0.8888888888888888
0.7619047619047619


In [30]:
#getting sector data
#matching:
df_sectors = pd.read_csv("fix normalized tickers holdings with industry.csv",index_col=False,header=0)
found_num = 0
not_found_num = 0
total = 0

labeled_companies = df_labels['Name']
news = df_news['News']
#print(df_sectors.loc[df_sectors['Name'] == 'US Steel']['Name'])
ARTICLES_NUM = len(labeled_companies)
extracted_companies = df_all_extracted_companies['Name']
perfect_companies = 0
contain_real_companies = 0

all_sectors = []
for idx in range (ARTICLES_NUM):
    curr_sector = ""
    labeled = labeled_companies[idx].split('\t')
    for elem in labeled:
        curr_sector = str(df_sectors.loc[df_sectors['Name'] == elem]['Industry'].iloc[0]) + '\t'
    all_sectors.append(curr_sector[:-1])
    print(curr_sector)
print(len(all_sectors))

Steel/Iron Ore	
Semiconductors	
Semiconductors	
Investment Bankers/Brokers/Service	
Real Estate Investment Trusts	
Real Estate Investment Trusts	
Construction/Ag Equipment/Trucks	
Life Insurance	
Real Estate	
Major Banks	
Semiconductors	
Oil & Gas Production	
Telecommunications Equipment	
Major Banks	
Major Banks	
Investment Bankers/Brokers/Service	
Oil & Gas Production	
Investment Managers	
Telecommunications Equipment	
Trusts Except Educational Religious and Charitable	
Major Banks	
Business Services	
Real Estate	
Major Banks	
Restaurants	
Integrated oil Companies	
Consumer Electronics/Video Chains	
Major Banks	
Industrial Machinery/Components	
Industrial Machinery/Components	
Advertising	
Recreational Products/Toys	
Investment Bankers/Brokers/Service	
Industrial Machinery/Components	
Aerospace	
Package Goods/Cosmetics	
Investment Managers	
Investment Managers	
Auto Manufacturing	
Major Banks	
Biotechnology: Pharmaceutical Preparations	
Major Banks	
EDP Services	
Integrated oil Compa

In [31]:
res_df = pd.DataFrame(data=all_sectors,columns=['Sector'])
res_df.to_csv("companies_sectors.csv",index=False)
print(len(res_df))

44733


In [35]:
#matching:
res_df = pd.DataFrame(columns=['Real company','Extracted company','Metrix','Choice list','Comment','Article'])
df_sectors = pd.read_csv("companies_sectors.csv",index_col=False,header=0)
found_num = 0
not_found_num = 0
total = 0

labeled_companies = df_labels['Name']
news = df_news['News']
print(len(news))
print(len(labeled_companies))
headline = df_news['Keywords']
keywords = df_news['Headline']
#for i in range (50):
#    print(keywords[i])
#for i in range (50):
#    print(headline[i])
    #print(keywords[i].replace("{","").replace("}","").replace('"',"").split(","))
companies_sectors = df_sectors['Sector']
print(len(df_sectors))
COEFFS = {'contain_real': 0.9,'is_valid_match': 0.8,'is_both_unique': 0.8,'is_also_proper': 0.8,'is_abbreviation': 0.5,'is_also_person':0.8}
ARTICLES_NUM = len(labeled_companies)
extracted_companies = df_all_extracted_companies['Name']
perfect_companies = 0
contain_real_companies = 0

for idx in range (ARTICLES_NUM):
    labeled = labeled_companies[idx].split('\t')
    extracted = extracted_companies[idx].split('\t')
    
    if(companies_sectors[idx] == companies_sectors[idx]):
        sectors = companies_sectors[idx].split('\t')
    total += len(labeled)
    for labeled_elem in labeled:
        max_metrix = 0.0
        COMMENT = ""
        for extracted_elem in extracted:
            labeled_str = labeled_elem.replace(" ","").replace("-","").lower()
            extracted_str = extracted_elem.replace(" ","").replace("-","").lower()
            metrix = difflib.SequenceMatcher(None,labeled_str,extracted_str).ratio()
                
            if (metrix == 1.0 or labeled_str.replace("group","") == extracted_str):
                max_elem = extracted_elem
                max_metrix = 1.0
                COMMENT = ""
                perfect_companies += 1
                break
            if (is_extracted_contain_real(labeled_elem,extracted_elem)):
                max_elem = extracted_elem
                max_metrix = 1.0
                contain_real_companies += 1
                COMMENT = "Extracted company contain real"
                break
            if (metrix > 0.3 and len(extracted_str) > 2 and (len(labeled_str) > 2) and (is_valid_match(labeled_elem,extracted_elem) or is_valid_match(extracted_elem,labeled_elem) or is_also_proper(labeled_elem,extracted_elem,nlp) or is_both_unique(labeled_elem,extracted_elem))):
                max_metrix = 0.85
                max_elem = extracted_elem
            if metrix > max_metrix:
                max_metrix = metrix
                max_elem = extracted_elem
                COMMENT = ""
            if (is_abbreviation(labeled_elem,extracted_elem) or is_abbreviation(extracted_elem,labeled_elem)):
                max_metrix = 0.81
                max_elem = extracted_elem
        if (max_metrix > 0.95)or(max_metrix > 0.8  and len(labeled_elem) > 2 and len(max_elem) > 2):      
            res_df.loc[found_num]=[labeled_elem,max_elem,max_metrix,extracted,COMMENT,news[idx]]
            found_num += 1
        else:
            not_found_num += 1
            print(labeled_elem,"+", extracted," best: ",max_elem," :idx = ",idx)
print("not found", not_found_num)
print("found", found_num)
print("total",total)
print("found_res = ",found_num/total)
print("perfect_res = ",perfect_companies/total)
print("contained_res = ",contain_real_companies/total)
print(res_df.count())
res_df.drop_duplicates(subset=['Real company','Extracted company'],inplace=True)
print(res_df.count())
res_df.to_csv("with keyword results/bert_large_clean_all_choices.csv",index=False)


44733
44733
44733
Sentinelone + ['Sprint Nextel', 'Clearwire', 'Att', 'Sprint Nextel', 'Wall Street Journal', 'Sprint', 'Journal', 'Clearwire', 'Sprint', 'Sprint', 'Att', 'Att', 'Sprint']  best:  Sprint Nextel  :idx =  12
Visa + ['Capital One Financial', 'Jpmorgan Chase ', 'Bank America', 'Bank America', 'Bank America', 'Citigroup', 'Citigroup', 'Citigroup', 'American Express ', 'American Express ', 'American Express ', 'Capital One Financial', 'Cof ', 'Capital One', 'Capital One', 'Capital One', 'Jpmorgan Chase ', 'Jpm', 'Bank America', 'Bac', 'Citigroup', 'American Express ', 'Axp', 'Discover Financial Services', 'Dfs', 'Capital One', 'Capital One', 'New York Stock Exchange']  best:  Capital One  :idx =  21
Mastercard + ['Capital One Financial', 'Jpmorgan Chase ', 'Bank America', 'Bank America', 'Bank America', 'Citigroup', 'Citigroup', 'Citigroup', 'American Express ', 'American Express ', 'American Express ', 'Capital One Financial', 'Cof ', 'Capital One', 'Capital One', 'Capital O