In [1]:
# Declare the list of upstream tasks whose products this notebook uses as inputs.
upstream = ['fetch_securities']

In [2]:
# Parameters
# NOTE(review): this cell looks like it is injected by the pipeline runner at
# execution time — confirm before editing the values by hand.
# `upstream` is rebound here from the list declared above to a mapping of
# task name -> product paths; later cells read upstream['fetch_securities']['data'].
upstream = {
    "fetch_securities": {
        "nb": "/root/market_watch/output/notebooks/fetch_securities.ipynb",
        "data": "/root/market_watch/output/data/raw/securities.csv",
    }
}
# Output locations of this task; the final cell writes the CSV to product['data'].
product = {
    "nb": "/root/market_watch/output/notebooks/normalize_security_names.ipynb",
    "data": "/root/market_watch/output/data/interim/normalized_security_names.csv",
}


In [3]:
import ast
import pandas as pd
import numpy as np
import json
from pathlib import Path
from src.utils import preprocess_text

import nltk
# Make sure the NLTK resources are available locally; packages that are
# already installed are reported as up-to-date and not fetched again.
# NOTE(review): presumably these are needed by preprocess_text — confirm.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Path of the raw securities CSV produced by the upstream `fetch_securities` task.
securities_file_path = upstream['fetch_securities']['data']
# The first CSV column is used as the DataFrame index.
securities_df = pd.read_csv(securities_file_path, index_col=0)

In [5]:
# Keep a single row per CIK; the first occurrence of each CIK is retained.
securities_df = securities_df.drop_duplicates(subset=['cik'])

In [6]:
# Summarize column dtypes and non-null counts of the de-duplicated table.
securities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9213 entries, 0 to 9212
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cik                    9213 non-null   int64  
 1   ticker                 9213 non-null   object 
 2   exchanges              9213 non-null   object 
 3   ein                    9204 non-null   float64
 4   full_name              9213 non-null   object 
 5   former_names           9213 non-null   object 
 6   short_name             499 non-null    object 
 7   gics_sector            499 non-null    object 
 8   gics_sub_industry      499 non-null    object 
 9   sic                    8428 non-null   float64
 10  sic_description        8414 non-null   object 
 11  headquarters_location  499 non-null    object 
dtypes: float64(2), int64(1), object(9)
memory usage: 935.7+ KB


In [7]:
# Treat an empty former-names list ('[]') as a missing value.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column accessor: that chained in-place
# form operates on an intermediate Series and leaves the frame unchanged when
# pandas Copy-on-Write is active.
securities_df['former_names'] = securities_df['former_names'].replace('[]', np.nan)
# Show the securities that have no former name on record.
securities_df[securities_df['former_names'].isnull()]

Unnamed: 0,cik,ticker,exchanges,ein,full_name,former_names,short_name,gics_sector,gics_sub_industry,sic,sic_description,headquarters_location
0,1603978,AQB,['Nasdaq'],43156167.0,"AquaBounty Technologies, Inc.",,,,,900.0,"Fishing, Hunting and Trapping",
2,1268533,TYG,['NYSE'],0.0,TORTOISE ENERGY INFRASTRUCTURE CORP,,,,,,,
3,23197,CMTL,['Nasdaq'],112139466.0,COMTECH TELECOMMUNICATIONS CORP /DE/,,,,,3663.0,Radio & Tv Broadcasting & Communications Equip...,
6,1077428,TCBI,"['Nasdaq', 'Nasdaq']",752679109.0,TEXAS CAPITAL BANCSHARES INC/TX,,,,,6022.0,State Commercial Banks,
7,1059784,GNBT,['OTC'],820490211.0,GENEREX BIOTECHNOLOGY CORP,,,,,2834.0,Pharmaceutical Preparations,
...,...,...,...,...,...,...,...,...,...,...,...,...
9205,1692427,NCSM,['Nasdaq'],461527455.0,"NCS Multistage Holdings, Inc.",,,,,1389.0,"Oil & Gas Field Services, NEC",
9206,1763543,UWHGF,[''],0.0,United World Holding Group Ltd.,,,,,7900.0,Services-Amusement & Recreation Services,
9209,1818794,DYN,['Nasdaq'],364883909.0,"Dyne Therapeutics, Inc.",,,,,2834.0,Pharmaceutical Preparations,
9210,1667944,WCFB,['OTC'],0.0,"WCF Bancorp, Inc.",,,,,6035.0,"Savings Institution, Federally Chartered",


In [8]:
def nomalize_names(securities_df):
    """Build a table of cleaned current and former security names.

    For each row, the first entry of the ``former_names`` list is extracted
    into a ``former_name`` column. Slash-delimited tags such as ``/DE/`` and
    the literal ``\\DE\\`` are then stripped from the name columns, and both
    name columns are passed through ``preprocess_text``.

    The (misspelled) function name is kept unchanged because callers use it.

    Parameters
    ----------
    securities_df : pandas.DataFrame
        Must contain the columns ``cik``, ``ticker``, ``full_name`` and
        ``former_names``. ``former_names`` is expected to hold the string
        representation of a list of dicts that each carry a ``"name"`` key,
        or a missing value / ``"[]"`` when there is no former name.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``cik``, ``ticker``, ``full_name`` and
        ``former_name``. The input frame is not modified.

    Raises
    ------
    ValueError, SyntaxError
        If a ``former_names`` value is not a valid Python literal.
    KeyError
        If the first former-name entry has no ``"name"`` key.
    """
    name_columns = ['full_name', 'former_name']
    columns_to_select = ['cik', 'ticker'] + name_columns

    def extract_name(raw):
        # Missing values arrive as NaN (a float), never as a string.
        if not isinstance(raw, str):
            return np.nan
        text = raw.strip()
        # Anything this short ('', '[]', ...) cannot hold a name entry.
        if len(text) <= 5:
            return np.nan
        entries = ast.literal_eval(text)
        # A whitespace-padded empty list passes the length check above.
        if not entries:
            return np.nan
        return entries[0]["name"]

    # Work on a copy of the needed columns so the caller's frame is untouched.
    cleaned = securities_df[['cik', 'ticker', 'full_name']].copy()
    cleaned['former_name'] = securities_df['former_names'].apply(extract_name)

    # Strip the tags from the name columns only; running the regex over the
    # whole frame would also rewrite tickers that contain a slash.
    for pattern in ("/[A-Za-z]+/?", r'\\DE\\'):
        cleaned[name_columns] = cleaned[name_columns].replace(
            to_replace=pattern, value='', regex=True
        )

    # Only rows that actually have a former name are sent to preprocess_text.
    has_former_name = cleaned['former_name'].notna()
    if has_former_name.any():
        cleaned.loc[has_former_name, 'former_name'] = preprocess_text(
            cleaned.loc[has_former_name, 'former_name']
        )

    cleaned['full_name'] = preprocess_text(cleaned['full_name'])
    cleaned = cleaned.replace('[]', np.nan)

    return cleaned[columns_to_select]

In [9]:
# Replace the raw table with the normalized columns (cik, ticker, full_name, former_name).
# NOTE(review): re-running only this cell fails, because the returned frame no
# longer has the `former_names` column that nomalize_names reads — re-run from
# the load cell instead.
securities_df = nomalize_names(securities_df)

In [10]:
# Write the normalized names to this task's data product location.
output_file_path = product['data']
output_dir = Path(output_file_path).parent
# Create any missing parent folders; an existing folder is not an error.
output_dir.mkdir(parents=True, exist_ok=True)
securities_df.to_csv(output_file_path)
print(f"Saved file {output_file_path}")

Saved file /root/market_watch/output/data/interim/normalized_security_names.csv
