In [3]:
import numpy as np
import pandas as pd
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math
import torch
from transformers import BertModel, BertTokenizer
from tqdm import tqdm  # Import tqdm

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/group36/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/group36/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/group36/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
data = pd.read_csv("Final_Chunks.csv")
data

Unnamed: 0.1,Unnamed: 0,chunks,CompanyName,StockName,Year,PDF_Path,tables
0,0,certification chief executive officer ceo chie...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt
1,1,maintaining internal control system evaluated ...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt
2,2,may managing director chief financial officera...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt
3,3,chairman message group continue consolidate bu...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_3.txt
4,5,energy utility business new industry end end g...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_4.txt
...,...,...,...,...,...,...,...
85324,85582,sl noabbreviation expansion erm enterprise ris...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt
85325,85583,gas goi government india gptw great place work...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt
85326,85584,reporting council iisc indian institute scienc...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt
85327,85585,leed leadership energy environmental design li...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt


In [4]:
data['type'] = "Financial Statements"

data.drop(columns=['Unnamed: 0'], inplace=True)
data["chunks"] = data["chunks"].astype(str)

In [7]:
data

Unnamed: 0,chunks,CompanyName,StockName,Year,PDF_Path,tables,type
0,certification chief executive officer ceo chie...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt,Financial Statements
1,maintaining internal control system evaluated ...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt,Financial Statements
2,may managing director chief financial officera...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt,Financial Statements
3,chairman message group continue consolidate bu...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_3.txt,Financial Statements
4,energy utility business new industry end end g...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_4.txt,Financial Statements
...,...,...,...,...,...,...,...
85324,sl noabbreviation expansion erm enterprise ris...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements
85325,gas goi government india gptw great place work...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements
85326,reporting council iisc indian institute scienc...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements
85327,leed leadership energy environmental design li...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements


In [12]:
def bert_embeddings(dataframe, text_column):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval() 

    def get_bert_embedding(text):
        encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            output = model(**encoded_input)
        embeddings = output.last_hidden_state.mean(1)
        return embeddings.squeeze().numpy()

    tqdm.pandas(desc="Calculating embeddings")
    dataframe['bert_embeddings'] = dataframe[text_column].progress_apply(get_bert_embedding)
    return dataframe

df_with_embeddings = bert_embeddings(data, "chunks")

Calculating embeddings: 100%|██████████| 85329/85329 [1:33:45<00:00, 15.17it/s]  


In [13]:
df_with_embeddings

Unnamed: 0,chunks,CompanyName,StockName,Year,PDF_Path,tables,type,bert_embeddings
0,certification chief executive officer ceo chie...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt,Financial Statements,"[-0.044540554, 0.24380706, 0.49626133, -0.0890..."
1,maintaining internal control system evaluated ...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt,Financial Statements,"[-0.22312321, 0.15393618, 0.3204804, -0.157337..."
2,may managing director chief financial officera...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_2.txt,Financial Statements,"[-0.199803, -0.18062566, -0.0701496, -0.323280..."
3,chairman message group continue consolidate bu...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_3.txt,Financial Statements,"[-0.01682909, -0.102615915, 0.4548606, -0.1362..."
4,energy utility business new industry end end g...,Adani Enterprises Limited,ADANIENT,2022-2023,extracted_pdfs/AR_22076_ADANIENT_2022_2023_270...,output_data/data_4.txt,Financial Statements,"[0.48414773, 0.23148488, 0.3422429, -0.1390451..."
...,...,...,...,...,...,...,...,...
85324,sl noabbreviation expansion erm enterprise ris...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements,"[-0.15394269, 0.021005256, 0.3600451, -0.12657..."
85325,gas goi government india gptw great place work...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements,"[-0.030427141, 0.28292853, 0.29721686, -0.1603..."
85326,reporting council iisc indian institute scienc...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements,"[-0.0649204, 0.11672379, 0.33168477, -0.126112..."
85327,leed leadership energy environmental design li...,Wipro Limited,WIPRO,2022-2023,extracted_pdfs/AR_22029_WIPRO_2022_2023_170620...,output_data/data_24733.txt,Financial Statements,"[-0.012953644, -0.110052794, 0.33166024, -0.12..."


In [14]:
df_with_embeddings.to_pickle("Bert_Embeddings.pkl")

In [11]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle


symbols = data['StockName'].unique()
for symbol in tqdm(symbols):

    symbol_data = data[data['StockName'] == symbol]
    
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(symbol_data['chunks'])
    tfidf_vectors = tfidf_vectorizer.transform(symbol_data['chunks']).toarray()

    symbol_data['TF-IDF'] = tfidf_vectors.tolist()

    # Save the fitted vectorizer to a file
    vectorizer_file = os.path.join('vectorizer', f'{symbol}_vectorizer.pkl')
    with open(vectorizer_file, 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    
    # Save the TF-IDF vectors to a file
    symbol_data.to_csv(f"stocks_data/{symbol}_data.csv")


  0%|          | 0/40 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symbol_data['TF-IDF'] = tfidf_vectors.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symbol_data['TF-IDF'] = tfidf_vectors.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symbol_data['TF-IDF'] = tfidf_vectors.tolist()
A value is trying to be set on a copy of a slice from a DataFram