# Company Name to Symbol Linking

## Name Matching


In [20]:
%pip install spacy
%pip install spacy-transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
!python3 -m spacy download en_core_web_trf

In [21]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_trf")

In [22]:
text = "The speakers are serviceable, being loud enough for general use, but they pale in comparison with the best you get from Apple, Dell or Razer"  # HEXK uses Stoc code ##### 01337

doc = nlp(text)

displacy.render(doc, style="ent", jupyter=True)

# Get collection of entities
entities = doc.ents

In [23]:
print(f"Entities count: {len(entities)}")

# Print all entities
print("\nAll entities:")
for entity in entities:
    print(f" {entity.text} ({entity.label_})")

Entities count: 3

All entities:
 Apple (ORG)
 Dell (ORG)
 Razer (ORG)


In [24]:
# Filter entities to only include ORG entities
org_entities = [entity for entity in entities if entity.label_ == "ORG"]

display(org_entities)

[Apple, Dell, Razer]

## Dataset Preparation


In [None]:
%pip install pandas

In [25]:
import pandas as pd
import csv

# Read the CSV file of without considering quotes
nasdaq_df = pd.read_csv(
    "data/symbols/nasdaq.csv", sep="\t", quoting=csv.QUOTE_NONE, header=0
)

nyse_df = pd.read_csv(
    "data/symbols/nyse.csv", sep="\t", quoting=csv.QUOTE_NONE, header=0
)

amex_df = pd.read_csv(
    "data/symbols/amex.csv", sep="\t", quoting=csv.QUOTE_NONE, header=0
)

# Remove quotes from each column
nasdaq_df = nasdaq_df.apply(lambda x: x.str.replace('"', ""))
nyse_df = nyse_df.apply(lambda x: x.str.replace('"', ""))
amex_df = amex_df.apply(lambda x: x.str.replace('"', ""))

# Remove quotes from the column names
nasdaq_df.columns = nasdaq_df.columns.str.replace('"', "")
nyse_df.columns = nyse_df.columns.str.replace('"', "")
amex_df.columns = amex_df.columns.str.replace('"', "")

# Rename the columns
nasdaq_df.rename(columns={"Symbol": "ticker", "Description": "name"}, inplace=True)
nyse_df.rename(columns={"Symbol": "ticker", "Description": "name"}, inplace=True)
amex_df.rename(columns={"Symbol": "ticker", "Description": "name"}, inplace=True)

display(nasdaq_df.head())
display(nasdaq_df.count())
display(nasdaq_df.dtypes)

display(nyse_df.head())
display(nyse_df.count())
display(nyse_df.dtypes)

display(amex_df.head())
display(amex_df.count())
display(amex_df.dtypes)

Unnamed: 0,ticker,name
0,AACG,Ata Creativity Global ADR
1,AACI,Armada Acquisition Corp I
2,AACIU,Armada Acquisition Corp I
3,AACIW,Armada Acquisition Corp I WT
4,AADI,Aadi Biosciences Inc


ticker    5073
name      5073
dtype: int64

ticker    object
name      object
dtype: object

Unnamed: 0,ticker,name
0,A,Agilent Technologies
1,AA,Alcoa Corp
2,AAC,Ares Acquisition Corp Cl A
3,AAC.S,Ares Acquisition Corp II [Aact.U]
4,AAC.T,Ares Acquisition Corp II [Aact.Ws]


ticker    3102
name      3101
dtype: int64

ticker    object
name      object
dtype: object

Unnamed: 0,ticker,name
0,AAA,First Priority Clo Bond ETF
1,AAAU,GS Physical Gold ETF
2,AAMC,Altisource Asset Management Corp Com
3,AAPX,T-Rex 2X Long Apple Daily Target ETF
4,AAPY,Neos Strategy Apple [Aapl] ETF


ticker    3165
name      3165
dtype: int64

ticker    object
name      object
dtype: object

In [26]:
# Concatenate the two dataframes into a single dataframe with alphabetical order
tickers_df = pd.concat([nasdaq_df, nyse_df, amex_df]).sort_values(by="ticker")

# Drop NaN values
tickers_df.dropna(inplace=True)

# Reset the index
tickers_df.reset_index(drop=True, inplace=True)

display(tickers_df)

Unnamed: 0,ticker,name
0,A,Agilent Technologies
1,AA,Alcoa Corp
2,AAA,First Priority Clo Bond ETF
3,AAAU,GS Physical Gold ETF
4,AAC,Ares Acquisition Corp Cl A
...,...,...
11334,ZVRA,Zevra Therapeutics Inc
11335,ZVSA,Zyversa Therapeutics Inc
11336,ZWS,Zurn Elkay Water Solutions Corp
11337,ZYME,Zymeworks Inc


In [27]:
# Display AAPL from ticker_df
display(tickers_df[tickers_df["ticker"] == "AAPL"])

Unnamed: 0,ticker,name
29,AAPL,Apple Inc


## Name Matching library

- source: https://github.com/DeNederlandscheBank/name_matching
- article about it: https://medium.com/dnb-data-science-hub/company-name-matching-6a6330710334


In [None]:
%pip install name_matching

In [33]:
from name_matching.name_matcher import NameMatcher
import pandas as pd
from typing import List, Dict

# Set the minimum score for a match
min_score = 80


class Matcher:
    """
    A class to match company names to tickers.
    """

    def __init__(
        self,
        lowercase: bool,
        punctuations: bool,
        remove_ascii: bool,
        legal_suffixes: bool,
        common_words: bool,
        verbose: bool,
        distance_metrics: List[str],
    ):
        self.tickers_df = tickers_df
        self.min_score = 80
        self.instance = NameMatcher(
            number_of_matches=1,
            lowercase=lowercase,
            punctuations=punctuations,
            remove_ascii=remove_ascii,
            legal_suffixes=legal_suffixes,
            common_words=common_words,
            verbose=verbose,
        )

        self.instance.set_distance_metrics(distance_metrics)

        self.instance.load_and_process_master_data("name", self.tickers_df)

    def match_ticker(self, company_names: List[str]) -> pd.DataFrame:
        """
        Matches company names to tickers based on a given list of company names.

        Args:
            company_names (List[str]): A list of company names to be matched.

        Returns:
            pd.DataFrame: A dataframe containing the matched company names and their corresponding tickers.
        """

        # Create a dataframe with the company names
        company_names_df = pd.DataFrame(company_names, columns=["name"])

        # Match the company names
        matches_df = self.instance.match_names(
            to_be_matched=company_names_df, column_matching="name"
        )

        return matches_df

    def postprocess_matches(self, matches_df: pd.DataFrame) -> Dict[str, str]:
        """
        Postprocesses the matches DataFrame and returns a dictionary with company names as keys and tickers as values.

        Args:
            matches_df (pd.DataFrame): The DataFrame containing the matches.

        Returns:
            dict: A dictionary with company names as keys and tickers as values.
        """
        # Drop the matches with a score higher than 50 (score is ranging from 0 to 100)
        matches_df = matches_df[matches_df["score"] > self.min_score]

        # Drop the columns that are not needed
        matches_df = matches_df.drop(columns=["score", "match_name"])

        # Map the match_index to index in the tickers_df
        matches_df.loc[:, "ticker"] = matches_df["match_index"].apply(
            lambda x: self.tickers_df.iloc[x]["ticker"]
        )

        # Create dictionary with the company names and the ticker
        matches_dict = matches_df.set_index("original_name")["ticker"].to_dict()

        return matches_dict


a_matcher = Matcher(
    lowercase=True,
    punctuations=True,
    remove_ascii=True,
    legal_suffixes=True,
    common_words=True,
    verbose=True,
    distance_metrics=["weighted_jaccard"],
)

b_matcher = Matcher(
    lowercase=True,
    punctuations=True,
    remove_ascii=True,
    legal_suffixes=True,
    common_words=True,
    verbose=True,
    distance_metrics=["discounted_levenshtein"],
)

c_matcher = Matcher(
    lowercase=True,
    punctuations=True,
    remove_ascii=True,
    legal_suffixes=True,
    common_words=True,
    verbose=True,
    distance_metrics=["fuzzy_wuzzy_token_set"],
)


In [34]:
a_matches_raw_result = a_matcher.match_ticker(org_entities)
display(a_matches_raw_result)

a_matches = a_matcher.postprocess_matches(a_matches_raw_result)
display(a_matches)

preprocessing...

preprocessing complete 
 searching for matches...



100%|██████████| 1/1 [00:00<00:00, 170.36it/s]


possible matches found   
 fuzzy matching...



100%|██████████| 3/3 [00:00<00:00, 41.54it/s]

done





Unnamed: 0,original_name,match_name,score,match_index
0,apple,apple inc,100.0,29
1,dell,delta apparel,52.173913,2791
2,razer,rayzebio inc,50.0,8859


{'apple': 'AAPL'}

In [35]:
b_matches_raw_result = b_matcher.match_ticker(org_entities)
display(b_matches_raw_result)

b_matches = b_matcher.postprocess_matches(b_matches_raw_result)
display(b_matches)

preprocessing...

preprocessing complete 
 searching for matches...



100%|██████████| 1/1 [00:00<00:00, 530.99it/s]


possible matches found   
 fuzzy matching...



100%|██████████| 3/3 [00:00<00:00, 48.66it/s]

done





Unnamed: 0,original_name,match_name,score,match_index
0,apple,apple inc,100.0,29
1,dell,kellanova,39.380683,5801
2,razer,rayzebio inc,56.633368,8859


{'apple': 'AAPL'}

In [36]:
c_matches_raw_result = b_matcher.match_ticker(org_entities)
display(c_matches_raw_result)

c_matches = b_matcher.postprocess_matches(c_matches_raw_result)
display(c_matches)

preprocessing...

preprocessing complete 
 searching for matches...



100%|██████████| 1/1 [00:00<00:00, 474.90it/s]


possible matches found   
 fuzzy matching...



100%|██████████| 3/3 [00:00<00:00, 49.18it/s]

done





Unnamed: 0,original_name,match_name,score,match_index
0,apple,apple inc,100.0,29
1,dell,kellanova,39.380683,5801
2,razer,rayzebio inc,56.633368,8859


{'apple': 'AAPL'}

# Comp-Match library


Dose not work, because it is not possible to install the library. (Additionaly package was not updated for 6 years)

- source: https://github.com/franklingu/comp-match


In [30]:
%pip install comp-match

Note: you may need to restart the kernel to use updated packages.


In [31]:
import comp_match

print(comp_match.__version__)

ModuleNotFoundError: No module named 'comp_match'

## Spark NLP


SPÍŠ POPSAT V ENTITY LINKING https://www.johnsnowlabs.com/finance-nlp-1-6-sec-schedules-nasdaq-and-wikidata-integration-and-much-more/


Spark NLP is offering only pre-trained model for NASDAQ.csv
