In [17]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from joblib import load

import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math
import torch
from transformers import BertModel, BertTokenizer
from tqdm import tqdm  
import faiss

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from torch import cuda, bfloat16
import transformers
import torch
import pickle


[nltk_data] Downloading package punkt to /home/group36/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/group36/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/group36/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [51]:
data = pd.read_csv("ind_nifty50list.csv")
stock_mapping = data[['Symbol', "Company Name"]]
stock_mapping['Company Namext'] = stock_mapping['Company Name'].str.replace('Ltd.', '')

embedding_data = pd.read_pickle("Bert_Embeddings.pkl")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval() 


embeddings = np.array(embedding_data['bert_embeddings'].tolist()).astype('float32')

index = faiss.IndexFlatIP(embeddings.shape[1]) 
index.add(embeddings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_mapping['Company Namext'] = stock_mapping['Company Name'].str.replace('Ltd.', '')


In [52]:
stock_symbol = ""

def get_bert_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**encoded_input)
    embeddings = output.last_hidden_state.mean(1)
    return embeddings.squeeze().numpy()


# get chunks from the data according to maximum cosine similarity
def get_chunks_symbol(query_vector, stock_data):
    scores = []

    for _, row in stock_data.iterrows():
        scores.append([[row['chunks'], row['tables'], row['type']], cosine_similarity([row['TF-IDF']], query_vector)[0][0]])

    scores.sort(key=lambda x: x[1], reverse=True)

    top_chunks = []
    for i in range(3):
        top_chunks.append(scores[i][0])
    return top_chunks


# get chunks 
def get_chunks_without_symbol(query):
    query_vector = get_bert_embedding(" ".join(preprocess_text(query)))

    # now here we can use cosine similarity or faiss model
    k = 3
    distances, indices = index.search(np.expand_dims(query_vector, axis=0), k)

    top_chunks = embedding_data.iloc[indices[0]]
    return top_chunks


def preprocess_text(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

def process_head(query):
    symbol = ""
    
    for _, row in stock_mapping.iterrows():
        if(row['Symbol'] in query  or row['Company Name'] in query or row['Company Namext'] in query):
            symbol = row['Symbol']
            break
    global stock_symbol
    stock_symbol = symbol
    
    
    if(symbol != ""):
        with open(f'vectorizer/{symbol}_vectorizer.pkl', 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        stocks_data = pd.read_pickle(f"fstocks_data/{symbol}_data.pkl")
        query_vector = tfidf_vectorizer.transform([" ".join(preprocess_text(query))]).toarray()
        
        return get_chunks_symbol(query_vector, stocks_data)
    
    return get_chunks_without_symbol(query)


In [None]:
# queries
top_chunks = process_head("how ITC is contributing to employment generation")
top_chunks = process_head("how ITC is contributing to amrit kaal")
top_chunks = process_head("In which sectors ADANIENT invests?")

query = "What is the net profit of ITC company for this yaer ? "

In [56]:
# query = "among nifty 50 which stock has highest market cap"
# query = "What is the EBITDA of ITC company ? Give me from financial statements of company   EBITDA - Earnings before interest tax Depreciation Ammortization"
query = "Which comapny has largest market cap"
top_chunks = process_head(query)

top_chunks

Unnamed: 0,chunks,CompanyName,StockName,Year,PDF_Path,tables,type,bert_embeddings
78568,management estimate long-term compound annual ...,Tech Mahindra Limited,TECHM,2022-2023,extracted_pdfs/AR_22069_TECHM_2022_2023_260620...,output_data/data_22651.txt,Financial Statements,"[-0.33832258, -0.54165375, 0.34757358, 0.11431..."
43921,derivative retail institutional market share e...,Kotak Mahindra Bank Limited,KOTAKBANK,2022-2023,extracted_pdfs/AR_2022_2023.pdf,output_data/data_12363.txt,Financial Statements,"[0.33105066, -0.0840581, 0.20947589, -0.094901..."
12180,derivative retail institutional market share e...,Bajaj Finserv Limited,BAJAJFINSV,2022-2023,extracted_pdfs/AR_2022_2023.pdf,output_data/data_3650.txt,Financial Statements,"[0.33105066, -0.0840581, 0.20947589, -0.094901..."


In [57]:
top_chunks = top_chunks['chunks']
top_chunks

78568    management estimate long-term compound annual ...
43921    derivative retail institutional market share e...
12180    derivative retail institutional market share e...
Name: chunks, dtype: object

In [58]:
output = ""
for chunk in top_chunks:
    output += chunk + " \n\n "
output

'management estimate long-term compound annual ebitda growth rate consistent assumption market participant would make \n\n derivative retail institutional market share excluding bse derivative proprietary segment net npa customer asset st march business overview \n\n derivative retail institutional market share excluding bse derivative proprietary segment net npa customer asset st march business overview \n\n '

In [9]:

tables = set()
for chunk in top_chunks:
    for table in chunk[1].split(","):
        tables.add(table)


count = 0

table_content = ""
for table in tables:
    file_path = "preprocess/" + table
    try:
        
        with open(file_path, 'r') as f:
            file_content = f.read()
            print(file_content)
            print("\n\n")
            
            count += 1
            table_content += file_content + "\n\n\n"
    
        if(count == 2):
            break
    except Exception as e:
        print("Error: ", e)



Error:  [Errno 2] No such file or directory: 'preprocess/output_data/data_11272.txt'
INDEPENDENT AUDITOR’S REPORT		
To the Members of ITC Limited		
Report on the Audit of the Standalone Ind AS Financial	Institute of Chartered Accountants of India together with	
Statements	the  ethical  requirements  that  are  relevant  to  our  audit	
	of  the  financial  statements  under  the  provisions  of  the	
Opinion		
	Act  and  the  Rules  thereunder,  and  we  have  fulfilled  our	
We  have  audited  the  accompanying  standalone  Ind  AS		
	other  ethical  responsibilities 
in  accordance  with 
these	
financial statements of ITC Limited (“the Company”), which		
	requirements  and  the  Code  of  Ethics.  We  believe  that	
comprise  the  Balance  Sheet  as  at  March  31,  2023,  the		
	the  audit  evidence  we  have  obtained  is  sufficient  and	
Statement of Profit and Loss, including  the statement of		
	appropriate to provide a basis for our audit opinion on the	
Other Comprehensive I

In [38]:
fundamentals = ""

with open(f"minimised_tables/{stock_symbol}_tables.txt", 'r') as f:
    fundamentals_data = f.read().lower()
    fundamentals_data = fundamentals_data.replace("\n", " \n ")

for token in preprocess_text(query):
    if(token in fundamentals_data):
        fundamentals = fundamentals_data
        break

fundamentals

''

In [7]:
top_chunks

['crisis livelihood creation itc embarked even ambitious sustainability journey raise bar create secure better tomorrow following page provide glimpse itc multi-dimensional growth driver propelling company new horizon competitive sustainable progress contributing india journey amrit kaal \n\n ',
 "certificate standalone financial statement balance sheet statement profit loss statement change equity cash flow statement note financial statement independent auditor report guide subsidiary joint venture associate salient feature financial statement subsidiary joint venture associate form aoc- consolidated financial statement ten year glance financial highlight business responsibility sustainability report i-xlviii itc infotech business friendly solution creating enduring institution award accoladesreport account content itc contributing india amrit kaal creating value agriculture manufacturing servicescontents hyper-linked relevant page report click 'itc limited headerfooter page return co

In [24]:
# here we will load Llama model and pass in the chunks with the query

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the bitsandbytes library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=hf_auth
)
tokenizer1 = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth, truncation=True)

text_generation_pipeline = pipeline(
    "text-generation",  # task
    model=model,
    tokenizer=tokenizer1,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=200, 
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer1.eos_token_id,
    use_auth_token=hf_auth  # Passing token to pipeline
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.46s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Model loaded on cuda:4


In [25]:
llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer1)

In [62]:

# Generate text based on a prompt

# files needed to be sended for fundamentals -------------------------------------------------------------------------------------------------------------



# template = f"""Question: {query}

# Considering the following contexts only:

# {" ".join(top_chunks)}
# Answer:
# Answer your question using the provided contexts only . The Answer should be conscise, short and to the point. You should answer from the contexts provided to you only. 
# If the contexts are not sufficient to answer the question, respond with:
# "I don't have enough information to answer your question."
# """

template = f"""Question: {query}

Considering the following contexts only and answer from the provided context only:

{output}

Some fundamentals about the stock:
{fundamentals}

Answer:
Answer your question using the provided contexts only . The Answer should be conscise, short and to the point. You should answer from the contexts provided to you only. 
If the contexts are not suffcient to answer the question, respond with:
"I don't have enough information to answer your question."
"""
prompt = template
print(prompt)

generated_text = text_generator(prompt, max_length=40096, num_return_sequences=1)


Question: Which comapny has largest market cap

Considering the following contexts only and answer from the provided context only:

management estimate long-term compound annual ebitda growth rate consistent assumption market participant would make 

 derivative retail institutional market share excluding bse derivative proprietary segment net npa customer asset st march business overview 

 derivative retail institutional market share excluding bse derivative proprietary segment net npa customer asset st march business overview 

 

Some fundamentals about the stock:


Answer:
Answer your question using the provided contexts only . The Answer should be conscise, short and to the point. You should answer from the contexts provided to you only. 
If the contexts are not suffcient to answer the question, respond with:
"I don't have enough information to answer your question."



In [40]:
generated_text[0]['generated_text']

'Question: how ITC is contributing to employment generation\n\nConsidering the following contexts only:\n\nscheme sector expected play critical role boosting investment agri export farmer income employment generation building indian brand global market company included scheme several food business well agri business \n\n fatality reported essential indicator rehabilitated placed suitable employment whose family member placed suitable employment xxvitotal affected employeesworkers employeesworkers rehabilitated placed suitable employment whose family member placed suitable employment fy fy fy fy employee worker onsite accident entity pr ovide transition assistance programme facilitate continued employability management career ending resulting retirement termination employment yesno itc continually invests human capital development includes building skill capability contemporary providing employee diversity experience enhance employability workforce enable smooth transition alternate opp

In [63]:
generated_te = generated_text[0]['generated_text']
index = generated_te.find("I don't have enough information to answer your question.") + 56

generated_te[index:]


'"\n\nWhich company has the largest market capitalization?\n\nBased on the provided context, I can determine that the company\'s market capitalization is approximately $100 billion. However, I don\'t have enough information to identify the specific company with the largest market capitalization.\n\nPlease provide additional context or clarify your question to help me narrow down the answer.'

In [64]:
print(generated_te[index:])

"

Which company has the largest market capitalization?

Based on the provided context, I can determine that the company's market capitalization is approximately $100 billion. However, I don't have enough information to identify the specific company with the largest market capitalization.

Please provide additional context or clarify your question to help me narrow down the answer.


In [17]:
len(template)

9648

In [12]:
import pandas as pd
import os
import pickle

def convert_tfidf_string_to_array(tfidf_str):
    return eval(tfidf_str)

csv_folder = "stocks_data"

pickle_folder = "fstocks_data"

for csv_file in tqdm(os.listdir(csv_folder)):
    if csv_file.endswith(".csv"):
        df = pd.read_csv(os.path.join(csv_folder, csv_file))
        
        df['TF-IDF'] = df['TF-IDF'].apply(convert_tfidf_string_to_array)
        
        df.to_pickle(os.path.join(pickle_folder, csv_file.split(".")[0]+".pkl"))


100%|██████████| 40/40 [14:15<00:00, 21.38s/it]


In [3]:
!nvidia-smi

Tue Apr 23 15:37:23 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 30%   28C    P8    15W / 250W |   2095MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 30%   28C    P8    14W / 250W |    569MiB / 11264MiB |      0%      Default |
|       