**steps**
1. identify row with headers
2. map headers to the appropriate column number
3. read the line above (a) component till total/sub-total
4. remove all rows with null (except for yield)
5. add to full_data


In [349]:
import re

import numpy as np
import pandas as pd

In [350]:
def read_excel_file(file_path):
    """Load every sheet of an Excel workbook as string-typed DataFrames.

    The reader is chosen from the text after the last "." in ``file_path``
    (case-insensitive): ``xlsb`` uses the ``pyxlsb`` engine, ``xls``/``xlsx``
    use pandas' default engine.

    Returns:
        The ``{sheet_name: DataFrame}`` mapping produced by ``pd.read_excel``
        with ``sheet_name=None``, or ``None`` when the extension is not
        supported or reading fails (a message is printed in both cases).
    """
    # extension -> (label used in the error message, extra read_excel kwargs)
    readers = {
        "xlsb": ("XLSB", {"engine": "pyxlsb"}),
        "xls": ("XLS/XLSX", {}),
        "xlsx": ("XLS/XLSX", {}),
    }

    extension = file_path.rsplit(".", 1)[-1].lower()
    if extension not in readers:
        print(f"⚠️ Unsupported file format: {file_path}")
        return None

    label, extra_kwargs = readers[extension]
    try:
        return pd.read_excel(file_path, sheet_name=None, dtype=str, **extra_kwargs)
    except Exception as err:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"❌ Error reading {file_path} ({label} format): {err}")
        return None


In [351]:
# Workbook to parse; passed to read_excel_file below.
# NOTE(review): this is a raw string, so every "\\" stays as two literal
# backslashes in the path. It is also an absolute, machine-specific location —
# consider building it from a configurable data directory instead.
path = r"C:\\Users\\vaibh\\Documents\\IPYNB_notebooks\\interview\\data\\data\\HDFC Mutual Fund\\Monthly HDFC Flexi Cap Fund - 31 January 2025.xlsx"


In [352]:
# Read every sheet of the workbook into a list of raw DataFrames.
all_sheets = []

df_raw = read_excel_file(path)
if df_raw is None:
    # read_excel_file has already printed the reason. Stop here with a clear
    # message instead of failing on ``None.items()`` a line later.
    raise RuntimeError(f"Could not load any sheets from {path!r}")

for sheet_name, sheet_df in df_raw.items():
    # if sheet_name not in sheets_to_avoid:
    all_sheets.append(sheet_df)

        

In [353]:
# Work on the first sheet only; discard rows in which every cell is empty.
df = all_sheets[0].dropna(how='all').reset_index(drop=True)

# One string per row: missing cells become a single space, then all cells are
# concatenated without a separator.
rows = df.fillna(" ").agg("".join , axis = 1)

# Drop the rows whose text mentions "listing on stock exchange"
# (case-insensitive), then renumber the remaining rows from 0.
mentions_listing = rows.str.lower().str.contains("listing on stock exchange", regex=False)
df = df[~mentions_listing].reset_index(drop=True)

In [354]:
def fetch_header_row(df: pd.DataFrame) -> list[str]:
    """Return the cells of the first row whose text mentions "instrument".

    Each row is stringified and joined with spaces; the first row whose joined
    text contains "instrument" (case-insensitive) is taken as the header row.
    Missing cells in that row are returned as the string "NULL".

    Raises:
        IndexError: if no row contains "instrument".
    """
    joined_rows = df.astype(str).agg(' '.join, axis=1)
    has_keyword = joined_rows.str.lower().str.contains("instrument", regex=False)
    header_label = joined_rows[has_keyword].index[0]
    # The index label is used as a row position, so the frame is expected to
    # carry a 0..n-1 index (as it does after reset_index(drop=True)).
    return df.iloc[header_label, :].fillna("NULL").tolist()

In [355]:
# Locate the header row in the cleaned sheet; cells without a header show up as "NULL".
header_row = fetch_header_row(df)
header_row

['NULL',
 'ISIN',
 'Coupon (%)',
 'Name Of the Instrument',
 'Industry+ /Rating',
 'Quantity',
 'Market/ Fair Value (Rs. in Lacs.)',
 '% to NAV',
 'Yield',
 '~YTC (AT1/Tier 2 bonds)',
 'Derivative\n% to NAV',
 'Unhedged\n% to NAV']

In [356]:
def _merge_unlabelled_columns(df, header_row):
    """Fold every column whose header cell is "NULL" into a labelled neighbour.

    A "NULL" column is merged into the closest labelled column on its left;
    "NULL" columns that come before the first labelled column are merged into
    that first labelled column. Within a merged column the labelled column's
    text comes first, missing cells count as "" and the pieces are joined
    without a separator. Columns that are not part of a merge are returned
    untouched (their NaN values are kept).

    Args:
        df: frame whose columns line up positionally with ``header_row``.
        header_row: header cells as returned by ``fetch_header_row``.

    Returns:
        A new DataFrame with one column per labelled header, in header order.

    Raises:
        ValueError: if ``header_row`` has no labelled cell at all.
    """
    labelled = [pos for pos, label in enumerate(header_row) if label != "NULL"]
    if not labelled:
        raise ValueError("Header row has no labelled columns to merge into")

    merged_columns = []
    for rank, pos in enumerate(labelled):
        # A group runs from this labelled column up to (not including) the next one.
        next_pos = labelled[rank + 1] if rank + 1 < len(labelled) else len(header_row)
        group = list(range(pos, next_pos))
        if rank == 0:
            # Leading unlabelled columns have no left neighbour; attach them here.
            group += list(range(pos))
        if len(group) == 1:
            merged_columns.append(df.iloc[:, pos])
        else:
            merged_columns.append(df.iloc[:, group].fillna("").agg("".join, axis=1))
    return pd.concat(merged_columns, axis=1)


# A single pass removes every "NULL" header. The previous loop only made
# progress when the unlabelled column sat next to the first labelled one, and
# could spin forever or hit an unbound ``end`` otherwise.
if "NULL" in header_row:
    df = _merge_unlabelled_columns(df, header_row)
    header_row = fetch_header_row(df)

header_row

['ISIN',
 'Coupon (%)',
 'Name Of the Instrument',
 'Industry+ /Rating',
 'Quantity',
 'Market/ Fair Value (Rs. in Lacs.)',
 '% to NAV',
 'Yield',
 '~YTC (AT1/Tier 2 bonds)',
 'Derivative\n% to NAV',
 'Unhedged\n% to NAV']

## **Step 2** 

In [357]:
# Canonical field names that the sheet's own headers get mapped onto. The same
# list is repeated as the default ``base_headers`` argument of ``header_mapper``.
base_headers = ["Name of Instrument","ISIN" , "Industry" , "Yield" , "Quantity" , "Market Value" , "Net Asset Value (NAV)"]

In [358]:
import requests
def generate_embedding(text:str, *, timeout: float = 30.0) -> list[float]:
    """Fetch the embedding vector for ``text`` from the hosted embeddings API.

    Args:
        text: the string to embed.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The value found at ``data[0].embedding`` in the JSON response.

    Raises:
        Exception: if the service answers with a non-success status.
        requests.exceptions.RequestException: on connection problems or timeout.
    """
    url = "https://lamhieu-lightweight-embeddings.hf.space/v1/embeddings"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {
        "model": "snowflake-arctic-embed-l-v2.0",
        "input": text
    }

    # Without an explicit timeout requests can wait indefinitely on a stalled
    # connection, which would freeze the whole notebook.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if not response.ok:
        # Include the status and the start of the body so failures are diagnosable.
        raise Exception(
            f"No response (HTTP {response.status_code}): {response.text[:200]}"
        )
    return response.json()["data"][0]["embedding"]

In [359]:
# Hand-written sample headers for trying out header_mapper; in this notebook
# they are referenced only by a commented-out call.
# NOTE(review): the misspellings and the junk entry look deliberate (noisy input) — confirm.
test_header_row=["Company/Issuer/Instrument Name", "Industry/Rating","Quantity","ISIN","Gibbrish" , "Exposure/Market Value", "yeild to Instrument","% to Nav", "Cupon"]

In [360]:
def header_mapper(header_row , * , base_headers = ("Name of Instrument","ISIN" , "Industry" , "Yield" , "Quantity" , "Market Value" , "Net Asset Value (NAV)")) -> dict[str, int]:
    """Map each canonical header to the position of the most similar sheet header.

    Every entry of ``base_headers`` and ``header_row`` is embedded with
    ``generate_embedding`` (one request per string); each base header is then
    paired with the ``header_row`` entry of highest cosine similarity. The
    chosen pairs are printed with their scores.

    Args:
        header_row: header strings taken from the sheet, in column order.
        base_headers: canonical field names to look for.

    Returns:
        ``{base header: index into header_row}``.

    Raises:
        ValueError: if ``header_row`` is empty.

    NOTE(review): the best match is accepted whatever its score, and nothing
    stops two base headers from landing on the same column — consider a
    minimum-score threshold.
    """
    if len(header_row) == 0:
        # Fail before any embedding request is sent.
        raise ValueError("header_row is empty; there is nothing to map onto")

    # Imported here because nothing else in the notebook needs scikit-learn.
    from sklearn.metrics.pairwise import cosine_similarity

    base_embeddings = np.array([generate_embedding(value) for value in base_headers])
    header_row_embeddings = np.array([generate_embedding(value) for value in header_row])
    # Cosine similarity, shape: (len(base_headers), len(header_row))
    similarity_matrix = cosine_similarity(base_embeddings, header_row_embeddings)

    # For each base header: index and score of the most similar sheet header.
    most_similar_indices = np.argmax(similarity_matrix, axis=1)
    most_similar_scores = np.max(similarity_matrix, axis=1)

    header_map = {}
    for i, (idx, score) in enumerate(zip(most_similar_indices, most_similar_scores)):
        print(f"Base vector {i} ie {base_headers[i]} is most similar to header {idx} ie {header_row[idx]} with score {score:.4f}")
        header_map[base_headers[i]] = int(idx)

    return header_map

In [361]:
# header_mapper(test_header_row)

In [362]:
# Map the canonical field names onto column positions of this sheet's header row.
header_map = header_mapper(header_row)
header_map

Base vector 0 ie Name of Instrument is most similar to header 2 ie Name Of the Instrument with score 0.9527
Base vector 1 ie ISIN is most similar to header 0 ie ISIN with score 1.0000
Base vector 2 ie Industry is most similar to header 3 ie Industry+ /Rating with score 0.6091
Base vector 3 ie Yield is most similar to header 7 ie Yield with score 1.0000
Base vector 4 ie Quantity is most similar to header 4 ie Quantity with score 1.0000
Base vector 5 ie Market Value is most similar to header 5 ie Market/ Fair Value (Rs. in Lacs.) with score 0.5944
Base vector 6 ie Net Asset Value (NAV) is most similar to header 6 ie % to NAV with score 0.5839


{'Name of Instrument': 2,
 'ISIN': 0,
 'Industry': 3,
 'Yield': 7,
 'Quantity': 4,
 'Market Value': 5,
 'Net Asset Value (NAV)': 6}

## **step 3**

## **step 4** read all lines in valid ranges

In [378]:
import pandas as pd

def check_isin(val):
    """Heuristically decide whether ``val`` looks like an Indian ISIN.

    The value is stringified, lower-cased and stripped of every character that
    is not an ASCII letter or digit. It passes when what is left starts with
    "in" and ends with a digit. Length and check digit are not verified.

    Relies on the module-level ``import re``.
    """
    cleaned = re.sub("[^a-zA-Z0-9]" , "" , str(val).lower().strip())
    # startswith("in") guarantees a non-empty string before cleaned[-1] is read.
    return cleaned.startswith("in") and cleaned[-1] in "0123456789"

def get_valid_periods(df , header_map):
    """Find the runs of consecutive rows whose ISIN cell passes ``check_isin``.

    Args:
        df: the sheet; rows are addressed by position.
        header_map: mapping that holds the ISIN column position under "ISIN".

    Returns:
        A list of ``(first_row, last_row)`` position pairs, both inclusive.
        The list is also printed.
    """
    is_isin = df.iloc[:, header_map["ISIN"]].apply(check_isin).values

    periods = []
    run_start = None
    for position, flag in enumerate(is_isin):
        if flag and run_start is None:
            # A new run begins here.
            run_start = position
        elif not flag and run_start is not None:
            # The run ended on the previous row.
            periods.append((run_start, position - 1))
            run_start = None

    # A run that reaches the last row is closed here.
    if run_start is not None:
        periods.append((run_start, len(is_isin) - 1))

    print("Passing periods:", periods)
    return periods

# Inclusive (start, end) row ranges whose ISIN cell looks valid.
periods = get_valid_periods(df , header_map)

Passing periods: [(4, 58), (63, 65), (70, 71)]


In [379]:
# Show the frame for inspection. NOTE(review): df.head() would keep the saved output short.
df

Unnamed: 0,0,"HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).2","HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).3","HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).4","HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).5","HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).6","HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).7","HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).8","HDFC Flexi Cap Fund (An open ended equity scheme investing across large cap, mid cap & small cap stocks).9",Income,Hybrid
0,Portfolio as on 31-Jan-2025Portfolio as on 31-...,Portfolio as on 31-Jan-2025,Portfolio as on 31-Jan-2025,Portfolio as on 31-Jan-2025,Portfolio as on 31-Jan-2025,Portfolio as on 31-Jan-2025,Portfolio as on 31-Jan-2025,Portfolio as on 31-Jan-2025,Portfolio as on 31-Jan-2025,,
1,ISIN,Coupon (%),Name Of the Instrument,Industry+ /Rating,Quantity,Market/ Fair Value (Rs. in Lacs.),% to NAV,Yield,~YTC (AT1/Tier 2 bonds),Derivative\n% to NAV,Unhedged\n% to NAV
2,EQUITY & EQUITY RELATED,,,,,,,,,,
3,Equity,,,,,,,,,,
4,INE090A01021|,,ICICI Bank Ltd.,Banks,49500000,620136,9.4,,,0,9.4
...,...,...,...,...,...,...,...,...,...,...,...
137,7) Total value and percentage of Illiquid Equi...,,,,,,,,,,
138,8) IDCW stands for Income Distribution cum Cap...,,,,,,,,,,
139,9) Riskometer based on Scheme Portfolio and Po...,,,,,,,,,,
140,Scheme Riskometer:,,,,,,,,,,


In [380]:
# Empty result table: one column per mapped field plus three extra columns
# ("type", "scheme", "amc name").
full_data = pd.DataFrame(columns= list(header_map.keys()) + ["type" , "scheme" , "amc name"])

In [381]:
import warnings

# Silences every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Rows are collected first and appended to full_data in one concat; growing
# the frame row by row copies it on every iteration.
records = []
for (start_idx , end_idx) in periods:
    # The row directly above a block is read as its label. A block that starts
    # on row 0 has no such row, and the pattern may find nothing; both cases
    # fall back to an empty label instead of raising IndexError.
    if start_idx > 0:
        label_text = df.iloc[start_idx-1:start_idx].fillna("").agg("".join , axis = 1).iloc[0]
        label_matches = re.findall(r'\b[a-zA-Z-\\/\s]+\b', label_text)
    else:
        label_matches = []
    # NOTE(review): scheme_name is only printed; it is never written to the
    # "type"/"scheme" columns of full_data — confirm whether it should be.
    scheme_name = label_matches[0] if label_matches else ""
    print("\n",scheme_name)

    for (index , row) in df.iloc[start_idx:end_idx+1].iterrows():
        # Pick the mapped fields out of the row by column position.
        records.append({key: row.iloc[idx] for (key , idx) in header_map.items()})
        print(f"{index} ",end=" , ") # just to keep track

if records:
    full_data = pd.concat([full_data , pd.DataFrame(records)],ignore_index=True)
print("sheet over")



 Equity
4  , 5  , 6  , 7  , 8  , 9  , 10  , 11  , 12  , 13  , 14  , 15  , 16  , 17  , 18  , 19  , 20  , 21  , 22  , 23  , 24  , 25  , 26  , 27  , 28  , 29  , 30  , 31  , 32  , 33  , 34  , 35  , 36  , 37  , 38  , 39  , 40  , 41  , 42  , 43  , 44  , 45  , 46  , 47  , 48  , 49  , 50  , 51  , 52  , 53  , 54  , 55  , 56  , 57  , 58  , 
 Government Securities
63  , 64  , 65  , 
 Units issued by ReIT
70  , 71  , sheet over


In [382]:
# Remove rows that are exact duplicates across all columns.
full_data = full_data.drop_duplicates()

In [383]:
# Show the collected rows for inspection.
full_data

Unnamed: 0,Name of Instrument,ISIN,Industry,Yield,Quantity,Market Value,Net Asset Value (NAV),type,scheme,amc name
0,ICICI Bank Ltd.,INE090A01021|,Banks,,49500000,620136.0,9.4,,,
1,HDFC Bank Ltd.£,INE040A01034|,Banks,,36000000,611550.0,9.27,,,
2,Axis Bank Ltd.,INE238A01034|,Banks,,56000000,552216.0,8.37,,,
3,Kotak Mahindra Bank Limited,INE237A01028|,Banks,,16500000,313714.5,4.76,,,
4,Maruti Suzuki India Limited,INE585B01010|,Automobiles,,2500000,307766.25,4.67,,,
5,SBI Life Insurance Company Ltd.,INE123W01016|,Insurance,,19500000,289302.0,4.39,,,
6,Cipla Ltd.,INE059A01026|,Pharmaceuticals & Biotechnology,,19200000,284044.8,4.31,,,
7,Bharti Airtel Ltd.,INE397D01024|,Telecom - Services,,16200000,263460.6,3.99,,,
8,HCL Technologies Ltd.,INE860A01027|,IT - Software,,12000000,207054.0,3.14,,,
9,PIRAMAL PHARMA LTD,INE0DK501011|,Pharmaceuticals & Biotechnology,,75597677,176066.99,2.67,,,


In [369]:
class A:
    """Minimal class with one method, used for the attribute-override experiment below."""

    def one(self):
        """Print "Hello" and return 1."""
        greeting = "Hello"
        print(greeting)
        return 1


In [370]:
def two(a,b):
    """Print the sum of ``a`` and ``b`` followed by " two", then return 2."""
    print(f"{a+b} two")
    return 2


# Storing a plain function on the instance shadows the class method ``one``
# for this object only; no ``self`` is passed when it is called.
aa = A()
aa.one = two

In [371]:
# Calls the function stored on the instance, i.e. two(1, 2).
aa.one(1,2)

3 two


2

In [372]:
# Re-display the current header row.
header_row

['ISIN',
 'Coupon (%)',
 'Name Of the Instrument',
 'Industry+ /Rating',
 'Quantity',
 'Market/ Fair Value (Rs. in Lacs.)',
 '% to NAV',
 'Yield',
 '~YTC (AT1/Tier 2 bonds)',
 'Derivative\n% to NAV',
 'Unhedged\n% to NAV']

In [373]:
# NOTE(review): scratch attempt at merging columns between labelled headers.
# The result of pd.concat below is never assigned, so ``df`` is not changed by
# this loop; it only (re)binds ``start``, ``end``, ``alter`` and ``alter2``.
start = 0
for i in range(len(header_row)):
    if header_row[i] != "NULL":
        end = i
        # Columns from the previous labelled header up to (not including) this
        # one, joined cell by cell into a single string column.
        alter = df.iloc[:,start:end].fillna("").agg("".join,axis = 1)
        # The frame without those columns.
        alter2 = df.drop(df.columns[start:end],axis = 1)
        pd.concat([alter , alter2] , axis = 1 )
        start = i

In [374]:
# NOTE(review): the output saved with this cell starts at (14, 58), while the
# `periods = ...` cell saved (4, 58). This cell carries a lower execution count,
# so it presumably ran against a different state of df — re-run top to bottom to confirm.
get_valid_periods(df , header_map)

Passing periods: [(14, 58), (63, 65), (70, 71)]


[(14, 58), (63, 65), (70, 71)]

In [375]:
# Show the per-row joined text built in the cleaning cell.
rows

0      Portfolio as on 31-Jan-2025Portfolio as on 31-...
1       ISINCoupon (%)Name Of the InstrumentIndustry+...
2                      EQUITY & EQUITY RELATED          
3       (a) Listed / awaiting listing on Stock Exchan...
4                                       Equity          
                             ...                        
139     7) Total value and percentage of Illiquid Equ...
140     8) IDCW stands for Income Distribution cum Ca...
141     9) Riskometer based on Scheme Portfolio and P...
142                         Scheme Riskometer:          
143                      Benchmark Riskometer:          
Length: 144, dtype: object