**steps**
1. identify row with headers
2. map headers to the appropriate column number
3. read the line above (a) component till total/sub-total
4. remove all rows with null values (except for yield)
5. add to full_data


In [46]:
import numpy as np 
import pandas as pd

In [47]:
def read_excel_file(file_path):
    """Load every sheet of an Excel workbook as string-typed DataFrames.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the workbook. The file extension selects the reader:
        ``xlsb`` is read with the ``pyxlsb`` engine, ``xls``/``xlsx`` with
        pandas' default engine.

    Returns
    -------
    dict[str, pandas.DataFrame] or None
        Mapping of sheet name to DataFrame (``sheet_name=None`` loads all
        sheets, ``dtype=str`` keeps every cell as text). ``None`` is returned,
        after printing a message, when the extension is unsupported or the
        read fails for any reason.
    """
    import os

    # os.fspath lets callers pass pathlib.Path as well as plain strings, and
    # splitext only looks at the final path component, so a dot in a directory
    # name (or a file with no extension at all) is not mistaken for a format.
    path_text = os.fspath(file_path)
    file_ext = os.path.splitext(path_text)[1].lstrip(".").lower()

    if file_ext == "xlsb":
        reader_options, format_label = {"engine": "pyxlsb"}, "XLSB"
    elif file_ext in ("xls", "xlsx"):
        reader_options, format_label = {}, "XLS/XLSX"
    else:
        print(f"⚠️ Unsupported file format: {file_path}")
        return None

    try:
        return pd.read_excel(path_text, sheet_name=None, dtype=str, **reader_options)
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller decide
        # what to do with a missing workbook.
        print(f"❌ Error reading {file_path} ({format_label} format): {e}")
        return None


In [48]:
# Workbook to parse, relative to the notebook's working directory.
# Forward slashes are accepted on Windows as well as POSIX systems; the previous
# raw string with escaped backslashes produced doubled separators and only
# resolved on Windows.
path = "data/data/Kotak Mutual Fund/ConsolidatedSebiPortfolioJanuary2025.xls"


In [49]:
all_sheets = []

df_raw = read_excel_file(path)
if df_raw is None:
    # read_excel_file signals failure by returning None (it has already printed
    # the reason). Stop here with a clear error instead of failing on
    # `None.items()` a line later.
    raise RuntimeError(f"Could not load any sheets from workbook: {path}")

# Collect the sheets in workbook order.
for sheet_name, sheet_df in df_raw.items():
    # if sheet_name not in sheets_to_avoid:
    all_sheets.append(sheet_df)

        

In [50]:
df_raw.items()

dict_items([('TIF',                                            Unnamed: 0  \
0                                  Name of Instrument   
1                             Equity & Equity related   
2                                                 NaN   
3                                                 NaN   
4                                                 NaN   
..                                                ...   
56                                                NaN   
57                                                NaN   
58                                                NaN   
59                                                NaN   
60  Benchmark - - Nifty India Tourism Index (Total...   

                                   Unnamed: 1  \
0                                         NaN   
1                                         NaN   
2   Listed/Awaiting listing on Stock Exchange   
3                                               
4                                               
.

In [51]:
# Work on the first sheet only, discarding rows in which every cell is empty.
# dropna does not renumber the index, so the original row labels are kept.
df = all_sheets[0].dropna(how='all')


In [52]:
def fetch_header_row(df :pd.DataFrame) -> list[str]:
    """Return the cells of the first row that mentions "instrument".

    Each row is flattened to a single string and searched case-insensitively
    for the word "instrument"; the first match is treated as the header row.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet contents. The index may have gaps (for example after ``dropna``).

    Returns
    -------
    list[str]
        One entry per column of ``df``; missing cells are reported as the
        placeholder string ``"NULL"``.

    Raises
    ------
    IndexError
        If no row contains the word "instrument".
    """
    if df.empty:
        raise IndexError("cannot locate a header row in an empty DataFrame")

    row_text = df.astype(str).agg(' '.join, axis=1)
    is_header = [("instrument" in text.lower()) for text in row_text]

    # Work with positions, not index labels: the row is fetched with .iloc, and
    # labels stop matching positions as soon as rows have been dropped.
    positions = np.flatnonzero(is_header)
    if positions.size == 0:
        raise IndexError('no row containing "instrument" was found')

    header_row = df.iloc[int(positions[0]), :].fillna("NULL")
    return header_row.tolist()

In [53]:
header_row = fetch_header_row(df)
header_row

['Name of Instrument',
 'NULL',
 'NULL',
 'ISIN Code',
 'Industry',
 'Yield',
 'Quantity',
 'Market Value (Rs.in Lacs)',
 '% to Net Assets']

## **Step 2** 

In [54]:
base_headers = ["Name of Instrument","ISIN" , "Industry" , "Yield" , "Quantity" , "Market Value" , "Net Asset Value (NAV)"]

In [55]:
import requests
def generate_embedding(text:str, *, timeout: float = 30.0) -> list[float]:
    """Request an embedding vector for ``text`` from the hosted embeddings API.

    Parameters
    ----------
    text : str
        Text to embed.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, which can hang the whole notebook.

    Returns
    -------
    list[float]
        The ``embedding`` field of the first item in the response's ``data``
        list.

    Raises
    ------
    Exception
        If the server answers with a non-success status code. Connection
        problems and timeouts surface as the exceptions raised by ``requests``.
    """
    url = "https://lamhieu-lightweight-embeddings.hf.space/v1/embeddings"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {
        "model": "snowflake-arctic-embed-l-v2.0",
        "input": text
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if not response.ok:
        # A response did arrive; report its status so the failure can be diagnosed.
        raise Exception(
            f"Embedding request failed with HTTP {response.status_code}: {response.text[:200]}"
        )
    return response.json()["data"][0]["embedding"]

In [56]:
test_header_row=["Company/Issuer/Instrument Name", "Industry/Rating","Quantity","ISIN","Gibbrish" , "Exposure/Market Value", "yeild to Instrument","% to Nav", "Cupon"]

In [57]:
def header_mapper(header_row , * , base_headers = ["Name of Instrument","ISIN" , "Industry" , "Yield" , "Quantity" , "Market Value" , "Net Asset Value (NAV)"], embed = None) -> dict[str, int]:
    """Map each canonical header to the position of its closest sheet header.

    Every string in ``base_headers`` and ``header_row`` is embedded, and each
    base header is assigned the index of the ``header_row`` entry with the
    highest cosine similarity. One line per base header is printed with the
    chosen match and its score.

    Parameters
    ----------
    header_row : sequence of str
        Header cells as found in the sheet (placeholders included).
    base_headers : sequence of str, keyword-only
        Canonical header names to locate. The default list is never mutated.
    embed : callable, keyword-only, optional
        Function turning a string into a vector. Defaults to
        ``generate_embedding``; pass a stub to run without network access.

    Returns
    -------
    dict[str, int]
        ``{base header: index into header_row}``. Several base headers may map
        to the same index; no minimum similarity is enforced.

    Raises
    ------
    ValueError
        If ``header_row`` is empty.
    """
    if embed is None:
        embed = generate_embedding

    header_row = list(header_row)
    if not header_row:
        raise ValueError("header_row is empty; there is nothing to map the base headers onto")
    if not base_headers:
        return {}

    def _unit_rows(matrix):
        # Scale each row to unit length; all-zero rows are left untouched so
        # they yield a similarity of 0 instead of a division by zero.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    header_map = dict()

    base_embeddings = np.asarray([embed(value) for value in base_headers], dtype=float)
    header_row_embeddings = np.asarray([embed(value) for value in header_row], dtype=float)

    # Cosine similarity, shape: (len(base_headers), len(header_row)).
    similarity_matrix = _unit_rows(base_embeddings) @ _unit_rows(header_row_embeddings).T

    # For each base header, the index and score of the most similar sheet header.
    most_similar_indices = np.argmax(similarity_matrix, axis=1)
    most_similar_scores = np.max(similarity_matrix, axis=1)

    for i, (idx, score) in enumerate(zip(most_similar_indices, most_similar_scores)):
        print(f"Base vector {i} ie {base_headers[i]} is most similar to header {idx} ie {header_row[idx]} with score {score:.4f}")
        header_map[base_headers[i]] = int(idx)

    return header_map

In [58]:
# header_mapper(test_header_row)

In [59]:
header_map = header_mapper(header_row)
header_map

Base vector 0 ie Name of Instrument is most similar to header 0 ie Name of Instrument with score 1.0000
Base vector 1 ie ISIN is most similar to header 3 ie ISIN Code with score 0.8405
Base vector 2 ie Industry is most similar to header 4 ie Industry with score 1.0000
Base vector 3 ie Yield is most similar to header 5 ie Yield with score 1.0000
Base vector 4 ie Quantity is most similar to header 6 ie Quantity with score 1.0000
Base vector 5 ie Market Value is most similar to header 7 ie Market Value (Rs.in Lacs) with score 0.6188
Base vector 6 ie Net Asset Value (NAV) is most similar to header 8 ie % to Net Assets with score 0.5592


{'Name of Instrument': 0,
 'ISIN': 3,
 'Industry': 4,
 'Yield': 5,
 'Quantity': 6,
 'Market Value': 7,
 'Net Asset Value (NAV)': 8}

## **step 3**

In [60]:
# Flatten every row into one space-separated string so it can be searched as text.
rows = df.fillna("").astype(str).agg(' '.join, axis=1)

# Index labels of the rows whose text mentions "stock exchange".
referance_rows = np.array(
    [label for label, text in rows.items() if "stock exchange" in text.lower()]
)

# The row after each reference row is taken as the start of the data;
# the row before it is taken as the investment-type row.
start_row_idxes = referance_rows + 1
investment_type_idxes = referance_rows - 1

In [61]:
header_row

['Name of Instrument',
 'NULL',
 'NULL',
 'ISIN Code',
 'Industry',
 'Yield',
 'Quantity',
 'Market Value (Rs.in Lacs)',
 '% to Net Assets']

In [62]:
col_name = df.columns[header_map["Name of Instrument"]]

# Every column label except the one mapped to "Name of Instrument".
other_cols = df.columns.drop(col_name)

# Keep a row when all the other columns are NaN OR the name column is filled.
# NOTE(review): an earlier comment described this as "all other columns NaN AND
# name not NaN"; the code has always used `|`, and later cells were run against
# this result, so the operator is left as is - confirm which was intended.
mask = df[other_cols].isna().all(axis=1) | df[col_name].notna()

result = df[mask]
print(result.fillna("").astype(str).agg("".join,axis = 1))


0     Name of InstrumentISIN CodeIndustryYieldQuanti...
1                               Equity & Equity related
23          Net Current Assets/(Liabilities)-51.95-2.18
27                                              Notes :
28                     1 Face Value per unit: Rs.  : 10
30                  2Portfolio Turnover Ratio  : 13.56%
32    3For NAV and IDCW refer NAV & IDCW details at ...
41    4SO: Structured Obligations FRB: Floating Rate...
43                                               Scheme
60    Benchmark - - Nifty India Tourism Index (Total...
dtype: object


In [63]:
start_indexes = df.index[mask].to_numpy()
start_indexes

array([ 0,  1, 23, 27, 28, 30, 32, 41, 43, 60])

In [64]:
# Index labels of rows whose flattened text contains "total" but not "subtotal"
# (case-insensitive); these are the candidate end rows for a data range.
prefinal_end_rows = np.array(rows[rows.apply(lambda x : "total" in x.lower() and "subtotal" not in x.lower())].index.tolist())

In [65]:
# For each start row, pick the first "total" row label that is greater than it.
# The `[0]` raises IndexError when no such row exists after a start row.
referance_end_rows = np.array([prefinal_end_rows[prefinal_end_rows > start_idx][0] for start_idx in start_row_idxes])
# Pair every start with its end: one (start, end) row per range.
valid_ranges = np.array(tuple(zip(start_row_idxes,referance_end_rows)))
valid_ranges

array([[ 3, 18]])

In [66]:
df[valid_ranges[0][0]:valid_ranges[0][1]]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Portfolio of Kotak Nifty India Tourism Index Fund as on 31-Jan-2025,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
3,,,Inter Globe Aviation Ltd,INE646L01027,Transport Services,,11079,479.09,20.07
4,,,INDIAN HOTELS CO LTD,INE053A01029,Leisure Services,,60289,461.03,19.31
5,,,Jubilant Foodworks Limited,INE797F01020,Leisure Services,,43712,307.69,12.89
6,,,GMR AIRPORTS LIMITED,INE776C01039,Transport Infrastructure,,410633,298.41,12.5
7,,,Indian Railway Catering And Tourism Corporatio...,INE335Y01020,Leisure Services,,34539,284.01,11.9
8,,,EIH Ltd.,INE230A01023,Leisure Services,,24476,90.68,3.8
9,,,Sapphire Foods India Ltd.,INE806T01020,Leisure Services,,26983,78.14,3.27
10,,,DEVYANI INTERNATIONAL LIMITED,INE872J01023,Leisure Services,,46006,77.59,3.25
11,,,LEMON TREE HOTELS LTD,INE970X01018,Leisure Services,,56447,76.88,3.22
12,,,CHALET HOTELS LTD.,INE427F01016,Leisure Services,,8143,62.78,2.63


## **step 4** read all lines in valid ranges

In [67]:
# Placeholder array with one zero per output column.
# NOTE(review): `full_data` is rebound to an empty DataFrame in a later cell
# before anything reads this array, so this assignment currently has no effect.
full_data = np.array([0]*(len(base_headers) + 3)) # baseheaders + type + scheme_name + amc_name

In [68]:
# Stripped row text for every label in `start_indexes`.
# NOTE(review): the loop variable is named `investment_type_idx`, but the
# iterable is `start_indexes`, not `investment_type_idxes` from step 3 - confirm
# which one was intended.
investment_types = [rows[investment_type_idx].strip() for investment_type_idx in start_indexes]
investment_types[:5]

['Name of Instrument   ISIN Code Industry Yield Quantity Market Value (Rs.in Lacs) % to Net Assets',
 'Equity & Equity related',
 'Net Current Assets/(Liabilities)       -51.95 -2.18',
 'Notes :',
 '1  Face Value per unit: Rs.  : 10']

In [69]:
full_data = pd.DataFrame(columns= list(header_map.keys()) + ["type" , "scheme" , "amc name"])

In [70]:
import warnings

# `start_indexes` holds index *labels*, while the slicing below is positional.
# Translate labels to positions so the segments stay correct even when dropped
# rows have left gaps in the index.
segment_starts = df.index.get_indexer(start_indexes)
segment_ends = np.append(segment_starts[1:], len(df))

# Gather the matching rows first and build the frame once at the end;
# concatenating inside the loop copies the accumulated frame on every row.
records = []
for start_pos, end_pos in zip(segment_starts, segment_ends):
    for (index, row) in df.iloc[start_pos:end_pos].iterrows():
        # header_map stores column positions, so read the cells with .iloc;
        # plain `row[idx]` relies on a deprecated positional fallback.
        values = {key: row.iloc[col_pos] for (key, col_pos) in header_map.items()}

        # Only rows whose ISIN cell starts with "IN" are treated as holdings.
        if not str(values['ISIN']).lower().startswith("in"):
            continue
        print(f"{index} ",end=" , ") # just to keep track
        records.append(values)

if records:
    with warnings.catch_warnings():
        # Concatenating onto an empty frame emits a FutureWarning in recent
        # pandas versions; silence it for this call only.
        warnings.simplefilter(action='ignore', category=FutureWarning)
        full_data = pd.concat([full_data , pd.DataFrame(records)],ignore_index=True)
print("sheet over")
 
        
        
        

3  , 4  , 5  , 6  , 7  , 8  , 9  , 10  , 11  , 12  , 13  , 14  , 15  , 16  , 17  , sheet over


In [71]:
start_indexes

array([ 0,  1, 23, 27, 28, 30, 32, 41, 43, 60])

In [72]:
full_data = full_data.drop_duplicates()

In [73]:
len(full_data)

15

In [74]:
full_data

Unnamed: 0,Name of Instrument,ISIN,Industry,Yield,Quantity,Market Value,Net Asset Value (NAV),type,scheme,amc name
0,,INE646L01027,Transport Services,,11079,479.09,20.07,,,
1,,INE053A01029,Leisure Services,,60289,461.03,19.31,,,
2,,INE797F01020,Leisure Services,,43712,307.69,12.89,,,
3,,INE776C01039,Transport Infrastructure,,410633,298.41,12.5,,,
4,,INE335Y01020,Leisure Services,,34539,284.01,11.9,,,
5,,INE230A01023,Leisure Services,,24476,90.68,3.8,,,
6,,INE806T01020,Leisure Services,,26983,78.14,3.27,,,
7,,INE872J01023,Leisure Services,,46006,77.59,3.25,,,
8,,INE970X01018,Leisure Services,,56447,76.88,3.22,,,
9,,INE427F01016,Leisure Services,,8143,62.78,2.63,,,


In [75]:
class A:
    """Minimal class used to experiment with replacing a method on an instance."""

    def one(self):
        """Print "Hello" and return 1."""
        print("Hello")
        return 1


In [76]:
aa = A()
def two(a,b):
    """Print the sum of ``a`` and ``b`` followed by " two", then return 2."""
    print(f"{a+b} two")
    return 2
# Assigning a plain function to an instance attribute shadows the class method
# for this instance only; it is not bound, so no `self` is passed when called.
aa.one = two

In [77]:
aa.one(1,2)

3 two


2

In [78]:
header_row

['Name of Instrument',
 'NULL',
 'NULL',
 'ISIN Code',
 'Industry',
 'Yield',
 'Quantity',
 'Market Value (Rs.in Lacs)',
 '% to Net Assets']

In [None]:
# Merge each real header column with the "NULL" placeholder columns that follow
# it, so values that landed in a neighbouring cell end up under the header they
# belong to. The previous loop built the merged pieces but discarded the result
# of pd.concat and never handled the final header.
header_positions = [pos for pos, name in enumerate(header_row) if name != "NULL"]

merged_columns = {}
for group_no, header_pos in enumerate(header_positions):
    # Placeholder columns before the very first header are folded into it.
    first = 0 if group_no == 0 else header_pos
    is_last_group = group_no + 1 == len(header_positions)
    end = len(header_row) if is_last_group else header_positions[group_no + 1]

    label = df.columns[header_pos]
    if end - first == 1:
        # Nothing to merge: keep the column (and its NaNs) unchanged.
        merged_columns[label] = df.iloc[:, header_pos]
    else:
        joined = df.iloc[:, first:end].fillna("").astype(str).agg("".join, axis=1)
        # Rows that were empty across the whole group stay missing.
        merged_columns[label] = joined.replace("", np.nan)

df_merged = pd.DataFrame(merged_columns, index=df.index)
df_merged.head(10)

Unnamed: 0,0,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,Name of Instrument,ISIN Code,Industry,Yield,Quantity,Market Value (Rs.in Lacs),% to Net Assets
1,Equity & Equity related,,,,,,
2,Listed/Awaiting listing on Stock Exchange,,,,,,
3,Inter Globe Aviation Ltd,INE646L01027,Transport Services,,11079,479.09000000000003,20.07
4,INDIAN HOTELS CO LTD,INE053A01029,Leisure Services,,60289,461.03000000000003,19.310000000000002
5,Jubilant Foodworks Limited,INE797F01020,Leisure Services,,43712,307.69,12.89
6,GMR AIRPORTS LIMITED,INE776C01039,Transport Infrastructure,,410633,298.41,12.5
7,Indian Railway Catering And Tourism Corporati...,INE335Y01020,Leisure Services,,34539,284.01,11.9
8,EIH Ltd.,INE230A01023,Leisure Services,,24476,90.68,3.8
9,Sapphire Foods India Ltd.,INE806T01020,Leisure Services,,26983,78.14,3.27
