**steps**
1. identify row with headers
2. map headers to apppriate column number
3. read the line above (a) component till total/sub-total
4. remove all rows with null (expect for yeild)
5. add to full_data


In [104]:
import numpy as np 
import pandas as pd

In [105]:
def read_excel_file(file_path):
    file_ext = file_path.split(".")[-1].lower()
    if file_ext == "xlsb":
        try:
            return pd.read_excel(file_path, sheet_name=None, engine="pyxlsb", dtype=str)
        except Exception as e:
            print(f"❌ Error reading {file_path} (XLSB format): {e}")
            return None
    elif file_ext in ["xls", "xlsx"]:
        try:
            return pd.read_excel(file_path, sheet_name=None, dtype=str)
        except Exception as e:
            print(f"❌ Error reading {file_path} (XLS/XLSX format): {e}")
            return None
    else:
        print(f"⚠️ Unsupported file format: {file_path}")
        return None


In [106]:
path = r"data\\data\\DSP Mutual Fund\\dsp-isin-debt-portfolio-as-on-28-feb-2025.xlsx"


In [107]:
all_sheets = []

df_raw = read_excel_file(path)
if df_raw is None:
    pass
for sheet_name, sheet_df in df_raw.items():
    # if sheet_name not in sheets_to_avoid:
    all_sheets.append(sheet_df)

        

In [108]:
df = all_sheets[0]

In [109]:
df_raw.keys()

dict_keys(['LIQUID', 'ULTRA', 'CREDITRISK', 'LDF', 'SHORT', 'STR', 'BOND', 'GILT', 'SAVINGS', 'REGULARSAVINGS', 'Corporate Bond', 'BANKING & PSU', '10YGF', 'LIQUIDETF', 'OVERNIGHT', 'FLOATER', 'Nifty SDL GSec 2028', 'CRISIL SDL GSec 2033', 'NIFTY SDL Gsec 2027', 'BSE Liquid Rate ETF', 'US Treasury FOF', 'FMP Series 270-1144D', 'FMP Series 268-1281D', 'FMP Series 267-1246D', 'SR 264 - 60M - 17D'])

In [110]:
df = all_sheets[0]
df = df.dropna(how='all')
df.reset_index(drop=True , inplace = True)
rows = df.fillna("").agg(" ".join , axis = 1)
df = df.iloc[rows[rows.apply(lambda x : "listing on stock exchange" not in x.lower())].index.to_list()]
df.reset_index(drop=True , inplace = True)

In [111]:
def fetch_header_row(df :pd.DataFrame) -> list[str]: 
    rows = df.astype(str).agg(''.join, axis=1)
    idx = rows[rows.apply(lambda x: "instrument" in x.lower())].index.tolist()[0]
    header_row = df.iloc[idx,:].fillna("NULL")
    header_row = [(header_row.iloc[col]) for col in range(header_row.shape[0])]
    return header_row

In [112]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,DSP Liquidity Fund,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,,"Portfolio as on February 28, 2025",,,,,,,,,,
1,Sr. No.,Name of Instrument,ISIN,Rating/Industry,Quantity,Market value (Rs. In lakhs),% to Net Assets,Maturity Date,Put/Call Option,YTM (%),,
2,,DEBT INSTRUMENTS,,,,,,,,,,
3,,BOND & NCD's,,,,,,,,,,
4,,Listed / awaiting listing on the stock exchanges,,,,,,,,,Sector/Rating,Percent
5,1,Mahindra & Mahindra Financial Services Limited**,INE774D07VB7,CRISIL AAA,13000,13867.97,0.0062,2025-04-25 00:00:00,,7.96,CRISIL A1+,0.6242
6,2,Power Finance Corporation Limited**,INE134E08GY3,CRISIL AAA,600,6479.3,0.0029,2025-03-10 00:00:00,,7.1052,Sovereign,0.1777
7,3,LIC Housing Finance Limited**,INE115A07PM8,CRISIL AAA,250,3006.96,0.0013,2025-04-25 00:00:00,,7.62,ICRA A1+,0.0838
8,4,Power Finance Corporation Limited**,INE134E08KP3,CRISIL AAA,250,2649.14,0.0012,2025-04-24 00:00:00,,7.62,IND A1+,0.0605
9,,Total,,,,26003.37,0.0116,,,,CRISIL AAA,0.0116


In [113]:
header_row = fetch_header_row(df)
header_row

['Sr. No.',
 'Name of Instrument',
 'ISIN',
 'Rating/Industry',
 'Quantity',
 'Market value (Rs. In lakhs)',
 '% to Net Assets',
 'Maturity Date',
 'Put/Call Option',
 'YTM (%)',
 'NULL',
 'NULL']

In [114]:
n_iter = 0
while "NULL" in header_row and n_iter<4:
    start = None
    for i in range(len(header_row)):
        if start == None and header_row[i] != "NULL":
            start = i
            break

    for i in range(start+1 , len(header_row)):
        if header_row[i] != "NULL":
            end = i
            break
    alter1 = df.iloc[:,start:end].fillna("").agg(" ".join,axis = 1)
    alter2 = df.drop(df.columns[start:end],axis = 1)
    df = pd.concat([alter1 , alter2] , axis = 1)

    header_row = fetch_header_row(df)
    n_iter+=1

            
header_row

['Sr. No.',
 'Name of Instrument',
 'ISIN',
 'Rating/Industry',
 'Quantity',
 'Market value (Rs. In lakhs)',
 '% to Net Assets',
 'Maturity Date',
 'Put/Call Option',
 'YTM (%)',
 'NULL',
 'NULL']

## **Step 2** 

In [115]:
base_headers = ["Name of Instrument","ISIN" , "Industry" , "Yield" , "Quantity" , "Market Value" , "Net Asset Value (NAV)","YTM"]

In [116]:
import requests
def generate_embedding(text:str) -> list[float]:
    url = "https://lamhieu-lightweight-embeddings.hf.space/v1/embeddings"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {
        "model": "snowflake-arctic-embed-l-v2.0",
        "input": text
    }

    response = requests.post(url, headers=headers, json=data)
    if response.ok:
        return response.json()["data"][0]["embedding"]
    else:
        raise Exception("No response")

In [117]:
test_header_row=["Company/Issuer/Instrument Name", "Industry/Rating","Quantity","ISIN","Gibbrish" , "Exposure/Market Value", "yeild to Instrument","% to Nav", "Cupon"]

In [118]:
def header_mapper(header_row , * , base_headers = ["Name of Instrument","ISIN" , "Industry" ,"Coupon" , "Yield" , "Quantity" , "Market Value" , "Net Asset Value (NAV)","YTM"]) -> {str:int}:
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    import re

    header_map = dict()

    pre_process_header = lambda x : re.sub(r"[^a-z\s]","",x.lower())
    base_headers = [pre_process_header(header.lower()) for header in base_headers]
    header_row = [pre_process_header(header.lower()) for header in header_row]

    base_embeddings = np.array([generate_embedding(value) for value in base_headers])
    header_row_embeddings = np.array([generate_embedding(value) for value in header_row])
    # Compute cosine similarity (shape: 5 x 10)
    similarity_matrix = cosine_similarity(base_embeddings, header_row_embeddings)

    # For each base vector, find the index of the most similar header
    most_similar_indices = np.argmax(similarity_matrix, axis=1)

    # Optionally, get the similarity score too
    most_similar_scores = np.max(similarity_matrix, axis=1)

    # Print results
    for i, (idx, score) in enumerate(zip(most_similar_indices, most_similar_scores)):
        print(f"Base vector {i} ie {base_headers[i]} is most similar to header {idx} ie {header_row[idx]} with score {score:.4f}")
        if score > 0.45:
            header_map[base_headers[i]] = int(idx)

    return header_map

In [119]:
# header_mapper(test_header_row)

In [120]:
header_map = header_mapper(header_row)
header_map

Base vector 0 ie name of instrument is most similar to header 1 ie name of instrument with score 1.0000
Base vector 1 ie isin is most similar to header 2 ie isin with score 1.0000
Base vector 2 ie industry is most similar to header 3 ie ratingindustry with score 0.5426
Base vector 3 ie coupon is most similar to header 4 ie quantity with score 0.3290
Base vector 4 ie yield is most similar to header 4 ie quantity with score 0.3720
Base vector 5 ie quantity is most similar to header 4 ie quantity with score 1.0000
Base vector 6 ie market value is most similar to header 5 ie market value rs in lakhs with score 0.6876
Base vector 7 ie net asset value nav is most similar to header 6 ie  to net assets with score 0.5927
Base vector 8 ie ytm is most similar to header 9 ie ytm  with score 1.0000


{'name of instrument': 1,
 'isin': 2,
 'industry': 3,
 'quantity': 4,
 'market value': 5,
 'net asset value nav': 6,
 'ytm': 9}

In [121]:
import re
def trf(x):
    x = re.search(r"\d+\.\d+", x)
    if x : x = x.group().strip()
    return "0" if x or x == "" else x
    
nav = header_map["ytm"]
df.iloc[:,nav] = df.iloc[:,nav].astype(str).apply(trf).astype(np.float64)


## **step 3**

In [125]:
re.search(r'\d+(?:\.\d+)?',"YTM %").group()

AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
df.select_dtypes("obje")

0
1
2
3
4
...
152
153
154
155
156


## **step 4** read all lines in valid ranges

In [124]:
import pandas as pd
import re 
def check_isin(val):
    s = str(val).lower().strip()
    s = re.sub("[^a-zA-Z0-9]" , "" , s)
    return s.startswith("in") and s[-1] in "0123456789"

def get_valid_periods(df , header_map):

    mask = df.iloc[:, header_map["ISIN"]].apply(check_isin).values

    # Find continuous True periods
    periods = []
    start = None

    for i, val in enumerate(mask):
        if val:
            if start is None:
                start = i
        else:
            if start is not None:
                periods.append((start, i - 1))
                start = None

    # Edge case: last element was True
    if start is not None:
        periods.append((start, len(mask) - 1))

    print("Passing periods:", periods)
    return periods

periods = get_valid_periods(df , header_map)

KeyError: 'ISIN'

In [None]:
full_data = pd.DataFrame(columns= list(header_map.keys()) + ["type" , "scheme" , "amc name"])

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
for (start_idx , end_idx) in periods:
    scheme_name = re.findall(r'\b[a-zA-Z-\\/\s]+\b', df[start_idx-1:start_idx].fillna("").agg(" ".join , axis = 1).iloc[0])[0]
    print("\n",scheme_name)

    for (index , row) in df.iloc[start_idx:end_idx+1].iterrows():
        values = header_map.copy()
        for (key , idx) in header_map.items():
            values[key] = row.iloc[idx]
        print(f"{index} ",end=" , ") # just to keep track

        full_data = pd.concat([full_data , pd.DataFrame([values])],ignore_index=True)
print("sheet over")



 Equity
4  , 5  , 6  , 7  , 8  , 9  , 10  , 11  , 12  , 13  , 14  , 15  , 16  , 17  , 18  , 19  , 20  , 21  , 22  , 23  , 24  , 25  , 26  , 27  , 28  , 29  , 30  , 31  , 32  , 33  , 34  , 35  , 36  , 37  , 38  , 39  , 40  , 41  , 42  , 43  , 44  , 45  , 46  , 47  , 48  , 49  , 50  , 51  , 52  , 53  , 54  , 55  , 56  , 57  , 58  , 59  , 60  , 61  , 62  , 63  , 64  , 65  , 66  , 67  , 68  , 69  , 70  , 71  , 72  , 73  , 74  , 75  , 76  , 77  , 78  , 79  , 80  , 81  , 82  , 83  , 84  , 85  , 86  , 87  , 88  , 89  , 90  , 91  , 92  , 93  , 94  , 95  , 96  , 97  , 98  , 99  , 100  , 101  , 102  , 103  , 104  , 105  , 106  , 107  , 108  , 109  , 110  , 111  , 112  , 113  , 114  , 115  , 116  , 117  , 118  , 119  , 120  , 121  , 122  , 123  , 124  , 125  , 126  , 127  , 128  , 129  , 130  , 131  , 132  , 133  , 134  , 135  , 136  , 137  , 138  , 139  , 140  , 141  , 142  , 143  , 144  , 145  , 146  , 147  , 148  , 149  , 150  , 151  , 152  , 153  , 154  , 155  , 156  , 157  , 158  , 159  , 1

In [None]:
full_data = full_data.drop_duplicates()

In [None]:
full_data

Unnamed: 0,Name of Instrument,ISIN,Industry,Yield,Quantity,Market Value,Net Asset Value (NAV),type,scheme,amc name
0,Infosys Ltd.,INE009A01021,IT - Software,,707726,13303.83,22.290000000000003,,,
1,BHARTI AIRTEL LTD.,INE397D01024,Telecom - Services,,444377,7226.900000000001,12.11,,,
2,TATA CONSULTANCY SERVICES LTD.,INE467B01029,IT - Software,,125972,5180.47,8.68,,,
3,Tech Mahindra Ltd.,INE669C01036,IT - Software,,285579,4781.88,8.01,,,
4,Zomato Ltd,INE758T01015,Retailing,,1245016,2743.39,4.6,,,
5,HCL Technologies Ltd.,INE860A01027,IT - Software,,141107,2434.73,4.08,,,
6,Wipro Ltd.,INE075A01022,IT - Software,,777882,2426.21,4.07,,,
7,Mphasis Ltd,INE356A01018,IT - Software,,65522,1879.14,3.15,,,
8,Persistent Systems Limited,INE262H01021,IT - Software,,27776,1675.61,2.81,,,
9,SWIGGY LTD,INE00H001014,Retailing,,376378,1566.11,2.62,,,


In [None]:
class A:
    def one(self):
        print("Hello")
        return 1


In [None]:
aa = A()
def two(a,b):
    print(f"{a+b} two")
    return 2
aa.one = two

In [None]:
aa.one(1,2)

3 two


2

In [None]:
header_row

['Name of Instrument',
 'ISIN Code',
 'Industry',
 'Yield',
 'Quantity',
 'Market Value (Rs.in Lacs)',
 '% to Net Assets']

In [None]:
start = 0
for i in range(len(header_row)):
    if header_row[i] != "NULL":
        end = i
        alter = df.iloc[:,start:end].fillna("").agg("".join,axis = 1)
        alter2 = df.drop(df.columns[start:end],axis = 1)
        pd.concat([alter , alter2] , axis = 1 )
        start = i

In [None]:
get_valid_periods(df , header_map)

Passing periods: [(2, 29)]


[(2, 29)]

In [None]:
rows

0     Name of Instrument  ISIN CodeIndustryYieldQuan...
1                       Equity & Equity related        
2      Listed/Awaiting listing on Stock Exchange       
3       Infosys Ltd.INE009A01021IT - Software 707726...
4       BHARTI AIRTEL LTD.INE397D01024Telecom - Serv...
5       TATA CONSULTANCY SERVICES LTD.INE467B01029IT...
6       Tech Mahindra Ltd.INE669C01036IT - Software ...
7       Zomato LtdINE758T01015Retailing 12450162743....
8       HCL Technologies Ltd.INE860A01027IT - Softwa...
9       Wipro Ltd.INE075A01022IT - Software 77788224...
10      Mphasis LtdINE356A01018IT - Software 6552218...
11      Persistent Systems LimitedINE262H01021IT - S...
12      SWIGGY LTDINE00H001014Retailing 3763781566.1...
13      Coforge LimitedINE591G01017IT - Software 171...
14      BHARTI HEXACOM LTD.INE343G01021Telecom - Ser...
15      Indus Towers Ltd.INE121J01017Telecom - Servi...
16      ZENSAR TECHNOLGIES LTD.INE520A01027IT - Soft...
17      Birlasoft LImitedINE836A01035IT - Softwa

In [None]:
import yaml
with open("config/amc_configs.yaml", "r") as f:
    data = yaml.safe_load(f)


In [None]:
for key,value in data.items():
    path = value["data_dir"]
    name = path.split("/")[-1]
    name = re.sub("_" , " " , name).title()
    value["data_dir"] = "/".join((path.split("/")[:-1] + [name]))
    path = value["output_file"]
    name = path.split("/")[-1]
    name = re.sub(".csv" , ".xlsx" , name)
    value["output_file"] = "/".join((path.split("/")[:-1] + [name]))

    

In [None]:
with open("config/amc_configs2.yaml","w") as f:
    yaml.dump(data,f ,default_flow_style=False)

In [None]:
"hello wolrd".title()

'Hello Wolrd'

In [None]:
def create_ISIN_mapping(df):
    """Create a mapping of fund names to ISINs."""
    
    isin_mapping = {}
    for index, row in df.iterrows():
        fund_name = row['Cleaned Fund Name'].lower()
        isin = row['ISIN']
        if fund_name and isin and row['Growth/Regular Type'] in ["Growth", "Regular"]:
            isin_mapping[fund_name] = isin
    return isin_mapping

In [None]:
isinlookup_path = r".\\ISIN\\fund_isin.xlsx"
df2 = pd.read_excel(isinlookup_path)
isin_map = create_ISIN_mapping(df2)
isin_map

{'aditya birla sun life banking & psu debt fund': 'INF209K01LT4',
 'axis banking & psu debt fund': 'INF846K01CD6',
 'bajaj finserv banking and psu fund': 'INF0QA701599',
 'bandhan banking & psu debt fund': 'INF194K014M9',
 'baroda bnp paribas banking and psu bond fund': 'INF955L01JY4',
 'canara robeco banking and psu debt fund': 'INF760K01KH3',
 'dsp banking & psu debt fund': 'INF740KA1OJ9',
 'edelweiss banking and psu debt fund': 'INF843K01FN5',
 'franklin india banking & psu debt fund': 'INF090I01KO5',
 'hdfc banking and psu debt fund': 'INF179KA1JC4',
 'hsbc banking and psu debt fund': 'INF917K01HM5',
 'icici prudential banking and psu debt fund': 'INF109K01II5',
 'invesco india banking and psu fund': 'INF205K01JV2',
 'iti banking & psu debt fund': 'INF00XX01846',
 'kotak banking and psu debt - growth': 'INF174K01FO3',
 'lic mf banking & psu fund': 'INF767K01568',
 'mirae asset banking and psu fund': 'INF769K01FU7',
 'nippon india banking  & psu debt fund': 'INF204KA1T56',
 'sbi ban

In [None]:
full_data["dummy"] = "hello"

In [None]:
import os
import yaml
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
load_dotenv()

CONFIG_PATH = os.getenv("CONFIG_PATH")
ISIN_LOOKUP_PATH = r".\\ISIN\\fund_isin.xlsx"

# Import your actual parser classes
from core.amcparser import AMCPortfolioParser

# Load all configurations
def load_yaml_config(filepath=CONFIG_PATH):
    with open(filepath, "r") as f:
        config = yaml.safe_load(f)
    return config

def get_isin_map(ISIN_LOOKUP_PATH):
    df = pd.read_excel(ISIN_LOOKUP_PATH)
    isin_mapping = {}
    for index, row in df.iterrows():
        fund_name = row['Cleaned Fund Name'].lower()
        isin = row['ISIN']
        if fund_name and isin and row['Growth/Regular Type'] in ["Growth", "Regular"]:
            isin_mapping[fund_name] = isin
    return isin_mapping

def get_name_map(amc_name, fund_name):
    name_map = dict()
    for fund in fund_name:
        if name_map.get(fund) :
            continue
        for amc in amc_name:
            words = fund.split(" ")
            if words[0].lower() in amc.lower():
                name_map[fund] = amc
                break
    
    return name_map


generate_embedding = AMCPortfolioParser()._generate_embedding
config_json = load_yaml_config()

AMC_NAMES = list(config_json.keys())
amc_name_value = [ x["amc_name"] for x in list(config_json.values())]
isin_mapping = get_isin_map(ISIN_LOOKUP_PATH)
fund_name = list(isin_mapping.keys())
fund_to_amc_map = get_name_map(amc_name= amc_name_value, fund_name=fund_name)


here1
here2
here3
{'aditya birla sun life banking & psu debt fund': 'Aditya Birla Sun Life Mutual Fund', 'axis banking & psu debt fund': 'Axis Mutual Fund', 'bandhan banking & psu debt fund': 'Bandhan Mutual Fund', 'baroda bnp paribas banking and psu bond fund': 'Baroda BNP Paribas Mutual Fund', 'canara robeco banking and psu debt fund': 'Canara Robeco Mutual Fund', 'dsp banking & psu debt fund': 'DSP Mutual Fund', 'edelweiss banking and psu debt fund': 'Edelweiss Mutual Fund', 'franklin india banking & psu debt fund': 'Franklin Templeton India', 'hdfc banking and psu debt fund': 'HDFC Mutual Fund', 'hsbc banking and psu debt fund': 'HSBC Mutual Fund', 'icici prudential banking and psu debt fund': 'ICICI Prudential Mutual Fund', 'invesco india banking and psu fund': 'Invesco Mutual Fund', 'iti banking & psu debt fund': 'ITI Mutual Fund', 'kotak banking and psu debt - growth': 'Kotak Mutual Fund', 'lic mf banking & psu fund': 'LIC Mutual Fund', 'mirae asset banking and psu fund': 'Mirae

In [None]:
isin_mapping

{'aditya birla sun life banking & psu debt fund': 'INF209K01LT4',
 'axis banking & psu debt fund': 'INF846K01CD6',
 'bajaj finserv banking and psu fund': 'INF0QA701599',
 'bandhan banking & psu debt fund': 'INF194K014M9',
 'baroda bnp paribas banking and psu bond fund': 'INF955L01JY4',
 'canara robeco banking and psu debt fund': 'INF760K01KH3',
 'dsp banking & psu debt fund': 'INF740KA1OJ9',
 'edelweiss banking and psu debt fund': 'INF843K01FN5',
 'franklin india banking & psu debt fund': 'INF090I01KO5',
 'hdfc banking and psu debt fund': 'INF179KA1JC4',
 'hsbc banking and psu debt fund': 'INF917K01HM5',
 'icici prudential banking and psu debt fund': 'INF109K01II5',
 'invesco india banking and psu fund': 'INF205K01JV2',
 'iti banking & psu debt fund': 'INF00XX01846',
 'kotak banking and psu debt - growth': 'INF174K01FO3',
 'lic mf banking & psu fund': 'INF767K01568',
 'mirae asset banking and psu fund': 'INF769K01FU7',
 'nippon india banking  & psu debt fund': 'INF204KA1T56',
 'sbi ban