# Associate Company Name With Ticker

In [1]:
%pip install pandas
%pip install name_matching

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Dataset Preparation


In [2]:
import pandas as pd
import csv

# Read the CSV file without considering quotes
nasdaq_df = pd.read_csv(
    "data/symbols/nasdaq.csv", sep="\t", quoting=csv.QUOTE_NONE, header=0
)

nyse_df = pd.read_csv(
    "data/symbols/nyse.csv", sep="\t", quoting=csv.QUOTE_NONE, header=0
)

# Remove quotes from each column
nasdaq_df = nasdaq_df.apply(lambda x: x.str.replace('"', ""))
nyse_df = nyse_df.apply(lambda x: x.str.replace('"', ""))

# Remove quotes from the column names
nasdaq_df.columns = nasdaq_df.columns.str.replace('"', "")
nyse_df.columns = nyse_df.columns.str.replace('"', "")

# Rename the columns
nasdaq_df.rename(columns={"Symbol": "ticker", "Description": "name"}, inplace=True)
nyse_df.rename(columns={"Symbol": "ticker", "Description": "name"}, inplace=True)

# Add an exchange column
# nasdaq_df["exchange"] = "NASDAQ"
# nyse_df["exchange"] = "NYSE"

display(nasdaq_df.head())
display(nyse_df.head())

Unnamed: 0,ticker,name
0,AACG,Ata Creativity Global ADR
1,AACI,Armada Acquisition Corp I
2,AACIU,Armada Acquisition Corp I
3,AACIW,Armada Acquisition Corp I WT
4,AADI,Aadi Biosciences Inc


Unnamed: 0,ticker,name
0,A,Agilent Technologies
1,AA,Alcoa Corp
2,AAC,Ares Acquisition Corp Cl A
3,AAC.S,Ares Acquisition Corp II [Aact.U]
4,AAC.T,Ares Acquisition Corp II [Aact.Ws]


In [3]:
# Concatenate the two dataframes into a single dataframe with alphabetical order
tickers_df = pd.concat([nasdaq_df, nyse_df]).sort_values(by="ticker")

# Drop NaN values
tickers_df.dropna(inplace=True)

# Reset the index
tickers_df.reset_index(drop=True, inplace=True)

# Save the dataframe to a CSV file
tickers_df.to_csv("data/symbols/NYSE_NASDAQ_Merged.csv", index=False)

display(tickers_df)

Unnamed: 0,ticker,name
0,A,Agilent Technologies
1,AA,Alcoa Corp
2,AAC,Ares Acquisition Corp Cl A
3,AAC.S,Ares Acquisition Corp II [Aact.U]
4,AAC.T,Ares Acquisition Corp II [Aact.Ws]
...,...,...
8169,ZVRA,Zevra Therapeutics Inc
8170,ZVSA,Zyversa Therapeutics Inc
8171,ZWS,Zurn Elkay Water Solutions Corp
8172,ZYME,Zymeworks Inc


In [4]:
# Display AAPL from ticker_df
display(tickers_df[tickers_df["ticker"] == "AAPL"])

Unnamed: 0,ticker,name
26,AAPL,Apple Inc


## Name matching library
- source: https://github.com/DeNederlandscheBank/name_matching
- article about it: https://medium.com/dnb-data-science-hub/company-name-matching-6a6330710334


In [11]:
from name_matching.name_matcher import NameMatcher
import pandas as pd
from typing import List, Dict


class Matcher:
    """
    A class to match company names to tickers.
    """

    def __init__(self):
        self.tickers_df = pd.read_csv("data/symbols/NYSE_NASDAQ_Merged.csv")
        self.min_score = 90
        self.instance = NameMatcher(
            number_of_matches=1,
            lowercase=True,
            punctuations=True,
            remove_ascii=False,
            legal_suffixes=True,
            common_words=False,
            verbose=True,
        )

        self.instance.set_distance_metrics(
            ["discounted_levenshtein", "fuzzy_wuzzy_partial_string"]
        )

        self.instance.load_and_process_master_data("name", self.tickers_df)

    def match_ticker(self, company_names: List[str]) -> pd.DataFrame:
        """
        Matches company names to tickers based on a given list of company names.

        Args:
            company_names (List[str]): A list of company names to be matched.

        Returns:
            pd.DataFrame: A dataframe containing the matched company names and their corresponding tickers.
        """

        # Create a dataframe with the company names
        company_names_df = pd.DataFrame(company_names, columns=["name"])

        # Match the company names
        matches_df = self.instance.match_names(
            to_be_matched=company_names_df, column_matching="name"
        )

        return matches_df

    def postprocess_matches(self, matches_df: pd.DataFrame) -> Dict[str, str]:
        """
        Postprocesses the matches DataFrame and returns a dictionary with company names as keys and tickers as values.

        Args:
            matches_df (pd.DataFrame): The DataFrame containing the matches.

        Returns:
            dict: A dictionary with company names as keys and tickers as values.
        """
        # Drop the matches with a score higher than 50 (score is ranging from 0 to 100)
        matches_df = matches_df[matches_df["score"] > self.min_score]

        # Drop the columns that are not needed
        matches_df = matches_df.drop(columns=["score", "match_name"])

        # Map the match_index to index in the tickers_df
        matches_df.loc[:, "ticker"] = matches_df["match_index"].apply(
            lambda x: self.tickers_df.iloc[x]["ticker"]
        )

        # Create dictionary with the company names and the ticker
        matches_dict = matches_df.set_index("original_name")["ticker"].to_dict()

        return matches_dict


matcher = Matcher()

In [12]:
# List of company names to match with tickers
company_names = ["apple", "dell", "razer", "AMD"]

matches_raw_result = matcher.match_ticker(company_names)
display(matches_raw_result)
matches = matcher.postprocess_matches(matches_raw_result)
display(matches)

preprocessing...

preprocessing complete 
 searching for matches...



100%|██████████| 1/1 [00:00<00:00, 629.30it/s]


possible matches found   
 fuzzy matching...



100%|██████████| 4/4 [00:00<00:00, 34.92it/s]

done





Unnamed: 0,original_name,match_name,score,match_index
0,apple,apple inc,100.0,26
1,dell,dell technologies inc,67.505619,2119
2,razer,parazero technologies ltd,65.377932,5989
3,amd,amdocs ltd ord,70.045197,2236


{'apple': 'AAPL'}

# Comp-Match library


Dose not work, because it is not possible to install the library. (Additionaly package was not updated for 6 years)
- source: https://github.com/franklingu/comp-match


In [7]:
%pip install comp-match

Note: you may need to restart the kernel to use updated packages.


In [8]:
import comp_match

print(comp_match.__version__)

ModuleNotFoundError: No module named 'comp_match'

In [None]:
import comp_match

comp_match.match(["Apple", "Google", "Facebook", "CitiBank"])

## Spark NLP

Spark NLP nabízí pouze pouze natrénovaný model pro NASDAQ.csv

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 0.99M/0.99M [00:00<00:00, 2.07MB/s]
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 1.48MB/s]
Downloading: 100%|██████████| 26.0/26.0 [00:00<00:00, 6.98kB/s]
Downloading: 100%|██████████| 1.29M/1.29M [00:00<00:00, 2.93MB/s]
Downloading: 100%|██████████| 665/665 [00:00<00:00, 407kB/s]
Downloading: 100%|██████████| 523M/523M [03:38<00:00, 2.51MB/s] 
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a world where the world is so much more diverse, it's important to understand that the world is not always as diverse as we think it is.

"We're not always going to be able to see the world in the same way that we see the world in the movies. We're going to be able to see the world in a different way. We're going to be able to see the world in a different way. We're going to be able to see the world in


In [12]:
# Define a prompt
prompt = "Give me Apple Inc. identifier (for Dell Inc. it is DELL)."

# Encode the prompt and generate text
inputs = tokenizer.encode(prompt, return_tensors="pt")
outputs = model.generate(inputs, max_length=100, temperature=0.7, num_return_sequences=1)

# Decode the output
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Give me Apple Inc. identifier (for Dell Inc. it is DELL).

I have a Dell computer with a Dell logo on it.

I have a Dell logo on my laptop.

I have a Dell logo on my laptop.

I have a Dell logo on my laptop.

I have a Dell logo on my laptop.

I have a Dell logo on my laptop.

I have a Dell logo on my laptop.

I have
