In [1]:
import pandas as pd

# CSV listing the stocks to ingest (relative path, resolved against the notebook's working directory).
stocks_data_path = "DataIngestion/src/data/stocklist.csv"
# Load the list; later cells read its 'company_code' and 'company_name' columns.
stocks_list = pd.read_csv(stocks_data_path)
# Print column names, dtypes and non-null counts as a quick sanity check.
stocks_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   company_code  40 non-null     object
 1   company_name  40 non-null     object
dtypes: object(2)
memory usage: 768.0+ bytes


In [2]:
stocks_list.head()

Unnamed: 0,company_code,company_name
0,ADANIPORTS.NS,Adani Ports & SEZ Ltd.
1,ASIANPAINT.NS,Asian Paints Ltd.
2,AXISBANK.NS,Axis Bank Ltd.
3,BAJAJ-AUTO.NS,Bajaj Auto Ltd.
4,BAJFINANCE.NS,Bajaj Finance Ltd.


In [3]:
import aiohttp
import nest_asyncio

In [4]:
len(stocks_list)

40

In [5]:
from datetime import datetime, timedelta

# Reference point: the moment this cell executes.
current_date = datetime.now()

# Same clock time 20 years earlier, counting every year as 365.25 days.
date_20_years_back = current_date - timedelta(days=20 * 365.25)
print(current_date, date_20_years_back)

2023-11-26 19:18:32.787342 2003-11-26 19:18:32.787342


In [6]:
import asyncio
import io
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import aiohttp
import pandas as pd


# Patch asyncio so event loops can be re-entered. The last line of this cell
# calls loop.run_until_complete(), which presumably would otherwise fail inside
# the notebook's already-running loop — TODO confirm.
# nest_asyncio itself is imported in an earlier cell.
nest_asyncio.apply()

def get_timestamp_today():
    """
    Build the timestamp window used for the history download.

    The window opens at 00:00:00 on the date lying 20 years
    (20 * 365.25 days) before the current moment and closes at 23:59:59
    on the current date. Both bounds are printed before being returned.

    :return: Tuple ``(start_ts, end_ts)`` of integer timestamps (whole seconds).
    """
    now = datetime.now()

    # Lower bound: midnight of the day 20 years back.
    window_open = (now - timedelta(days=20 * 365.25)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    # Upper bound: last second of the current day; int() below drops the
    # sub-second part.
    window_close = now.replace(hour=23, minute=59, second=59, microsecond=999)

    start_ts, end_ts = int(window_open.timestamp()), int(window_close.timestamp())

    for label, value in (("Start timestamp:", start_ts), ("End timestamp:", end_ts)):
        print(label, value)
    return start_ts, end_ts


# Define URL template for the Yahoo Finance history download.
# Placeholders are filled positionally by download_stock_data():
#   {0} = company code, {1} = period1 (start timestamp), {2} = period2 (end timestamp).
# interval=1d presumably requests one row per trading day — TODO confirm against the API.
URL = "https://query1.finance.yahoo.com/v7/finance/download/{0}?period1={1}&period2={2}&interval=1d&events=history"

async def download_stock_data(
    session: aiohttp.ClientSession,
    company_code: str,
    company_name: str,
    start_date: int,
    end_date: int,
) -> Optional[pd.DataFrame]:
    """
    Asynchronously downloads stock data for a given company_code between start and end dates.

    :param session: aiohttp.ClientSession object for making HTTP requests.
    :param company_code: Stock company_code as placed in the download URL (e.g. 'ADANIPORTS.NS').
    :param company_name: Company name, copied into the 'company_name' column of the result.
    :param start_date: Start date as a timestamp.
    :param end_date: End date as a timestamp.
    :return: DataFrame with the downloaded rows plus 'company_name' and
        'company_code' columns, or None when the server answers with a
        status other than 200.
    :raises Exception: If the request or the CSV parsing fails; the underlying
        error is chained as ``__cause__``.
    """
    url = URL.format(company_code, start_date, end_date)
    try:
        async with session.get(url) as response:
            if response.status != 200:
                # A failed symbol must not abort the whole batch: report it
                # and let the caller drop the None.
                print(f"Failed to download data for {company_code}. HTTP status: {response.status}")
                return None

            data = await response.text()
            df = pd.read_csv(io.StringIO(data))
            df['company_name'] = company_name
            # Keep only the part of the code that precedes '.NS'.
            df['company_code'] = company_code.split('.NS')[0]
            return df
    except Exception as e:
        # Chain the original error so its traceback is not lost.
        raise Exception(f"An error occurred while downloading data for {company_code}: {e}") from e


async def download_job(stocks_list: pd.DataFrame) -> pd.DataFrame:
    """
    Main function to download stock data for multiple company_codes.

    The download window is not a parameter; it is obtained from
    get_timestamp_today().

    :param stocks_list: DataFrame with 'company_code' and 'company_name' columns.
    :return: DataFrame with combined stock data, where 'Date' is parsed to
        datetime and 'year', 'month' and 'day' columns are derived from it.
    :raises Exception: If no company yielded any data ("There is no data"),
        or if any individual download raises.
    """
    start_ts, end_ts = get_timestamp_today()

    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(
                download_stock_data(session, company_code, company_name, start_ts, end_ts)
            )
            for company_code, company_name in zip(
                stocks_list['company_code'], stocks_list['company_name']
            )
        ]
        results = await asyncio.gather(*tasks)

    # The session is only needed while requests are in flight; everything
    # below works on the already-downloaded frames.
    # Failed downloads come back as None and are dropped here (once).
    nifty_data_list = [item for item in results if item is not None]

    if not nifty_data_list:
        raise Exception("There is no data")

    nifty_data_df = pd.concat(nifty_data_list, ignore_index=True, axis=0)
    nifty_data_df['Date'] = pd.to_datetime(nifty_data_df['Date'])
    nifty_data_df['year'] = nifty_data_df['Date'].dt.year
    nifty_data_df['month'] = nifty_data_df['Date'].dt.month
    nifty_data_df['day'] = nifty_data_df['Date'].dt.day

    return nifty_data_df

# # Example usage
# stocks_list = pd.DataFrame({
#     'Symbol': ['ADANIPORTS.NS', 'BAJAJ-AUTO.NS'],
#     'Company Name': ['Adani Ports', 'Bajaj Auto Ltd.']
# })

# start_date = '2023-11-19'
# end_date = '2023-11-19'

# Run the asynchronous main function
# nifty_data_list = asyncio.run(main(stocks_list, start_date, end_date))

# Obtain the event loop — presumably the one the notebook kernel already runs; TODO confirm.
loop = asyncio.get_event_loop()

# Run the main function within the existing event loop.
# Blocks until every download has finished; relies on the nest_asyncio.apply()
# call earlier in this cell to permit the nested run_until_complete().
final_nifty_data_df = loop.run_until_complete(download_job(stocks_list))


Start timestamp: 1069785000
End timestamp: 1701023399
Failed to download data for INFRATEL.NS. HTTP status: 404


In [7]:
final_nifty_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182108 entries, 0 to 182107
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Date          182108 non-null  datetime64[ns]
 1   Open          181797 non-null  float64       
 2   High          181797 non-null  float64       
 3   Low           181797 non-null  float64       
 4   Close         181797 non-null  float64       
 5   Adj Close     181797 non-null  float64       
 6   Volume        181797 non-null  float64       
 7   company_name  182108 non-null  object        
 8   company_code  182108 non-null  object        
 9   year          182108 non-null  int32         
 10  month         182108 non-null  int32         
 11  day           182108 non-null  int32         
dtypes: datetime64[ns](1), float64(6), int32(3), object(2)
memory usage: 14.6+ MB


In [8]:
final_nifty_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182108 entries, 0 to 182107
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Date          182108 non-null  datetime64[ns]
 1   Open          181797 non-null  float64       
 2   High          181797 non-null  float64       
 3   Low           181797 non-null  float64       
 4   Close         181797 non-null  float64       
 5   Adj Close     181797 non-null  float64       
 6   Volume        181797 non-null  float64       
 7   company_name  182108 non-null  object        
 8   company_code  182108 non-null  object        
 9   year          182108 non-null  int32         
 10  month         182108 non-null  int32         
 11  day           182108 non-null  int32         
dtypes: datetime64[ns](1), float64(6), int32(3), object(2)
memory usage: 14.6+ MB


In [9]:
# Persist the combined frame as CSV without the index column.
# NOTE(review): the 'stock_data' directory presumably has to exist beforehand, and
# to_csv() returns None when given a path, so `hello` should always be None — confirm it is unused.
hello = final_nifty_data_df.to_csv('stock_data/stocks.csv',index=False)

In [10]:
# Company codes that have at least one row with a missing Volume value.
final_nifty_data_df[final_nifty_data_df.Volume.isna()].company_code.unique()

array(['ADANIPORTS', 'ASIANPAINT', 'AXISBANK', 'BAJAJ-AUTO', 'BAJFINANCE',
       'BAJAJFINSV', 'BPCL', 'BHARTIARTL', 'CIPLA', 'COALINDIA',
       'DRREDDY', 'EICHERMOT', 'GAIL', 'GRASIM', 'HCLTECH', 'HDFCBANK',
       'HEROMOTOCO', 'HINDALCO', 'HINDPETRO', 'HINDUNILVR', 'ITC',
       'ICICIBANK', 'IBULHSGFIN', 'IOC', 'INDUSINDBK', 'INFY', 'JSWSTEEL',
       'KOTAKBANK', 'LT', 'M&M', 'MARUTI', 'NTPC', 'ONGC', 'POWERGRID',
       'RELIANCE', 'SBIN', 'SUNPHARMA', 'TCS'], dtype=object)

In [11]:
# Every company code present in the combined frame (companies whose download failed are absent).
final_nifty_data_df.company_code.unique()

array(['ADANIPORTS', 'ASIANPAINT', 'AXISBANK', 'BAJAJ-AUTO', 'BAJFINANCE',
       'BAJAJFINSV', 'BPCL', 'BHARTIARTL', 'CIPLA', 'COALINDIA',
       'DRREDDY', 'EICHERMOT', 'GAIL', 'GRASIM', 'HCLTECH', 'HDFCBANK',
       'HEROMOTOCO', 'HINDALCO', 'HINDPETRO', 'HINDUNILVR', 'HDFC', 'ITC',
       'ICICIBANK', 'IBULHSGFIN', 'IOC', 'INDUSINDBK', 'INFY', 'JSWSTEEL',
       'KOTAKBANK', 'LT', 'M&M', 'MARUTI', 'NTPC', 'ONGC', 'POWERGRID',
       'RELIANCE', 'SBIN', 'SUNPHARMA', 'TCS'], dtype=object)

In [3]:
import requests
from bs4 import BeautifulSoup



In [4]:
def fetch_financial_news(company, timeout=10):
    """
    Fetch the Google Finance quote page of an NSE-listed company and print
    the news headlines found on it.

    :param company: Company symbol placed in the quote URL, e.g. 'CIPLA'.
    :param timeout: Seconds to wait for the server before requests gives up.
        Without a timeout a stalled connection would block the notebook
        indefinitely.
    :return: BeautifulSoup object of the page, or None when the response
        status is not 200.
    :raises requests.RequestException: On connection errors or timeouts.
    """
    # Construct the URL for the company's financial news
    url = f"https://www.google.com/finance/quote/{company}:NSE"

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error fetching financial news for {company}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Print every headline div; the class name is tied to the page markup
    # and may need updating if the site changes.
    for headline in soup.find_all('div', class_='b3g-news-headline'):
        print(headline.text)
    return soup

In [5]:
check = fetch_financial_news('CIPLA')

In [6]:
# Collect the href of the first <a> inside every div with class 'z4rs2b'.
# Guards: `check` is None when the fetch failed, and a div may lack an <a>
# tag or an href. Such entries are skipped instead of raising AttributeError
# or putting None into the URL list.
m = [
    anchor.get('href')
    for div in (check.find_all('div', class_='z4rs2b') if check is not None else [])
    for anchor in (div.find('a'),)
    if anchor is not None and anchor.get('href')
]

In [7]:
m

['https://www.livemint.com/market/live-blog/cipla-share-price-live-blog-for-30-nov-2023-11701311741801.html',

In [None]:
# check

In [21]:
def fetch_financial_news_by_url(url, timeout=10):
    """
    Fetch an arbitrary news URL and return its parsed HTML.

    This definition must stay active: a later cell calls it for every
    collected article link, and a fresh kernel would otherwise fail with
    NameError.

    :param url: Absolute URL of the page to download.
    :param timeout: Seconds to wait for the server before requests gives up.
    :return: BeautifulSoup object of the page, or None when the response
        status is not 200.
    :raises requests.RequestException: On connection errors or timeouts.
    """
    # Send an HTTP GET request to the URL
    response = requests.get(url, timeout=timeout)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the response
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the financial news headlines
        # NOTE(review): this class name was carried over from the Google Finance
        # fetcher; it presumably matches nothing on third-party article pages.
        financial_news_headlines = soup.find_all('div', class_='b3g-news-headline')

        # Print the financial news headlines
        for headline in financial_news_headlines:
            print(headline.text)
        return soup
    else:
        print(f"Error fetching financial news for {url}: {response.status_code}")
        return None

In [22]:
# check1 = fetch_financial_news_by_url(m[0])

def get_texts_soup_json(soup):
    """
    Collect the text of every <script type="application/json"> tag.

    :param soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).
    :return: List with the ``.text`` of each matching tag, in the order
        ``find_all`` yields them; empty when nothing matches.
    """
    found = soup.find_all('script', {'type': 'application/json'})
    return [tag.text for tag in found] if found else []
    

def get_texts_soup_json_lrd(soup):
    """
    Collect the text of every <script type="application/ld+json"> tag.

    :param soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).
    :return: List with the ``.text`` of each matching tag, in the order
        ``find_all`` yields them; empty when nothing matches.
    """
    found = soup.find_all('script', {'type': 'application/ld+json'})
    return [tag.text for tag in found] if found else []


def get_texts_soup_from_p(soup):
    """
    Collect the text of every <p> tag.

    :param soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).
    :return: List with the ``.text`` of each paragraph, in the order
        ``find_all`` yields them; empty when there are none.
    """
    paragraphs = soup.find_all('p')
    return [paragraph.text for paragraph in paragraphs] if paragraphs else []



# Scrape every collected article URL. A failed fetch (None) still yields an
# entry, with empty text lists, so all_texts keeps one record per URL in m.
all_texts = []
for url in m:
    url_soup = fetch_financial_news_by_url(url)
    all_texts.append({
        'url': url,
        'p_texts': [] if url_soup is None else get_texts_soup_from_p(url_soup),
        'json_texts': [] if url_soup is None else get_texts_soup_json(url_soup),
        'json_lrd_texts': [] if url_soup is None else get_texts_soup_json_lrd(url_soup),
    })


# all_texts = [
#     {
#         'url':url,
#         'texts': [p_tag.text for p_tag in fetch_financial_news_by_url(url).find_all('p')]
#     }
#     for url in m if 
# ]

In [None]:
# help(check1)

In [None]:
# texts = [p_tag.text for p_tag in check1.find_all('p')]

In [50]:
all_texts

  'p_texts': ['My Account',
   'Follow us on:',
   'Powered By ',
   'Find & Invest in bonds issued by top corporates, PSU Banks, NBFCs, and much more. Invest as low as 10,000 and earn better returns than FD',
   'Invest Now',
   'Powered By ',
   "Unlock Your Trading Potential: Trade like Experts with SEBI registered creators, Learn from Courses & Webinars by India's Finest Finance Experts.",
   'Invest Now',
   '',
   'AMBAREESH BALIGA',
   'Fundamental, Stock Ideas, Multibaggers & Insights',
   'Subscribe',
   'CK NARAYAN',
   'Stock & Index F&O Trading Calls & Market Analysis',
   'Subscribe',
   'SUDARSHAN SUKHANI',
   'Technical Call, Trading Calls & Insights',
   'Subscribe',
   'T GNANASEKAR',
   'Commodity Trading Calls & Market Analysis',
   'Subscribe',
   'MECKLAI FINANCIALS',
   'Currency Derivatives Trading Calls & Insights',
   'Subscribe',
   'SHUBHAM AGARWAL',
   'Options Trading Advice and Market Analysis',
   'Subscribe',
   'MARKET SMITH INDIA',
   'Model portfolios

In [None]:
# texts
import json

In [None]:
all_texts[0]['json_lrd_texts'][1]
# Use script_tag = soup.find('script', {'type': 'application/json'})
# <script type="application/ld+json">

In [None]:
mm = '''
"2023-11-17T14:42:18+05:30", "articleBody": " Top active call options for Cipla at 17 Nov 14:42 were at strike price of ₹1250.0 (Expiry : 30 NOV 2023) & ₹1240.0 (Expiry : 30 NOV 2023) with prices ₹18.45 (+43.02%) & ₹25.0 (+44.51%) respectively.Top active put options for Cipla at 17 Nov 14:42 were at strike price of ₹1250.0 (Expiry : 30 NOV 2023) & ₹1200.0 (Expiry : 30 NOV 2023) with prices ₹14.0 (-38.46%) & ₹3.1 (-37.37%) respectively.Disclaimer: The Futures & Options data is at a delay of 15 minutes. ", "image": {
'''

In [None]:
import re

In [None]:
# Regular expression to extract the articleBody text
# (first attempt, not used below: the greedy `.*` can run past the end of the field).
regex = r'"articleBody"\s*:\s*"(.*),\s*"\w+"\s*:'

# Captures the string value of "articleBody". Backslash-escaped characters
# (such as \") are accepted inside the value so an escaped quote does not cut
# the capture short.
regex1 = r'"articleBody"\s*:\s*"((?:[^"\\]|\\.)*)"'

# Extracting the articleBody text: group(1) is the captured value only, whereas
# group(0) would also include the '"articleBody": "' prefix and the closing quote.
match = re.search(regex1, all_texts[0]['json_lrd_texts'][1], flags=re.M)
extracted_text = match.group(1) if match else "No match found"
extracted_text

In [None]:
# help(re)

In [None]:
# Try the same pattern on the sample string `mm`. group(1) is the captured
# articleBody value, without the key and the surrounding quotes that
# group(0) would include.
match = re.search(regex1, mm, flags=re.M)
extracted_text = match.group(1) if match else "No match found"
extracted_text