# Set up the environment

In [5]:
# pip install pandas

In [6]:
# pip install selenium

In [9]:
# pip install bs4

In [1]:
import datetime
import logging
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# One-time logging setup for the script: INFO level, records formatted as
# "<timestamp> - <level> - <message>". The module-level logger is shared by
# everything defined below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Making a GET Request

In [12]:
class Scraper:
    """Scrape comments, authors and timestamps from an infinite-scroll page.

    The page is rendered with headless Chrome (Selenium), scrolled up to
    ``MAX_SCROLLS`` times, and then parsed with BeautifulSoup. Comments are
    located by CSS class name; the author and timestamp of each comment are
    taken from the closest preceding ``div`` carrying the respective class.

    Note that ``show_Comments`` and ``save_Comments`` each trigger a fresh
    scrape of the page.
    """

    DRIVER = 30  # Seconds WebDriverWait waits for the first comment element.
    SCROLL_PAUSE_TIME = 5  # Seconds to sleep after each scroll.
    MAX_SCROLLS = 1  # Upper bound on scroll-to-bottom iterations.
    CHROME_OPTIONS = "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    BASE_PATH = 'if any base path is given'  # Placeholder; not read by any method.
    # Hex-encoded UTF-8 text. Despite the name it is not a path: it decodes to
    # a developer-credit banner that __init__ writes to the log.
    DRIVER_LOCATION = '444556454c4f504544206279203a2054414e414d59204d414e44414c'

    # Layout of absolute timestamps once ordinal suffixes are removed, and the
    # normalised layout every timestamp is converted to.
    _INPUT_FORMAT = "%I:%M %p %b %d %Y"
    _OUTPUT_FORMAT = "%d-%m-%Y %H:%M"
    # Column order of the resulting DataFrame / CSV.
    _COLUMNS = ["Stock", "Comment", "Commented By", "Timestamp"]
    # "<number> <unit>" pairs of a relative timestamp ("5 mins 30 secs", "2 hrs").
    _RELATIVE_UNIT_RE = re.compile(r"(\d+)\s*(day|hour|hr|min|sec)", re.IGNORECASE)
    _RELATIVE_UNITS = {
        "day": "days",
        "hour": "hours",
        "hr": "hours",
        "min": "minutes",
        "sec": "seconds",
    }
    # Ordinal suffix directly after a day number ("12th" -> "12"). Anchoring on
    # the digit avoids mangling other words that merely contain "st"/"nd"/...
    _ORDINAL_SUFFIX_RE = re.compile(r"(?<=\d)(?:st|nd|rd|th)\b", re.IGNORECASE)

    def __init__(self, url, element_selector, ts_selector, author_selector):
        """Store the target URL and the CSS class names used for parsing.

        Args:
            url: Page to scrape.
            element_selector: CSS class of the ``div`` holding a comment's text.
            ts_selector: CSS class of the ``div`` wrapping the timestamp.
            author_selector: CSS class of the ``div`` holding the author name.
        """
        self.url = url
        self.element_selector = element_selector  # This is the comment selector
        self.timeStamp_selector = ts_selector  # This is the timestamp selector
        self.cmd_selector = author_selector  # This is the author selector
        self.driver_loc = bytes.fromhex(Scraper.DRIVER_LOCATION).decode('utf-8')
        logger.info(self.driver_loc)
        # Start the Scraper
        logger.info("Scraper has been started....")

    def __convert_time_to_datetime(self, time_str):
        """Convert a relative or absolute time string to "DD-MM-YYYY HH:MM".

        Relative strings are "<n> <unit>" pairs (days, hours/hrs, mins, secs,
        singular or plural) and are subtracted from the current local time.
        Absolute strings look like "10:12 AM Feb 12th", optionally followed by
        a year. When the year is missing, the current year is assumed unless
        that would place the date in the future (or the date does not exist in
        that year), in which case the previous year is used.

        Returns:
            The formatted timestamp, or None (after logging a warning) when
            the string cannot be interpreted.
        """
        if not isinstance(time_str, str):
            logger.warning(f"Invalid time string: {time_str}")
            return None

        # Surrounding whitespace would make strptime reject the string.
        cleaned = time_str.strip()
        now = datetime.datetime.now()
        try:
            relative_parts = Scraper._RELATIVE_UNIT_RE.findall(cleaned)
            if relative_parts:
                offsets = {}
                for amount, unit in relative_parts:
                    field = Scraper._RELATIVE_UNITS[unit.lower()]
                    offsets[field] = offsets.get(field, 0) + int(amount)
                moment = now - datetime.timedelta(**offsets)
                return moment.strftime(Scraper._OUTPUT_FORMAT)

            # Absolute time (e.g., "10:12 AM Feb 12th" or "7:29 AM Aug 28th")
            cleaned = Scraper._ORDINAL_SUFFIX_RE.sub("", cleaned)
            try:
                moment = datetime.datetime.strptime(cleaned, Scraper._INPUT_FORMAT)
                return moment.strftime(Scraper._OUTPUT_FORMAT)
            except ValueError:
                pass  # No year in the string; infer it below.

            for year in (now.year, now.year - 1):
                try:
                    moment = datetime.datetime.strptime(
                        f"{cleaned} {year}", Scraper._INPUT_FORMAT
                    )
                except ValueError:
                    # Unparseable, or a date such as Feb 29 that does not
                    # exist in this particular year.
                    continue
                if moment <= now:
                    return moment.strftime(Scraper._OUTPUT_FORMAT)
        except (ValueError, OverflowError):
            pass

        logger.warning(f"Invalid time string: {time_str}")
        return None  # Return None if the input string is invalid

    def __scrape_infinite_scroll(self):
        """Load the page in headless Chrome, scroll it and return the parsed HTML.

        Returns:
            A BeautifulSoup object for the final page source, or None when
            loading/waiting/scrolling raised (the error is logged).
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument(Scraper.CHROME_OPTIONS)
        driver = webdriver.Chrome(options=chrome_options)
        logger.info("Fetch the data from the website....")
        try:
            driver.get(self.url)

            # Wait for initial content to load. element_selector is a bare
            # class name, so prefix it with "." to form a CSS selector.
            comment_class = self.element_selector.lstrip(".")
            WebDriverWait(driver, Scraper.DRIVER).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, f".{comment_class}"))
            )

            last_height = driver.execute_script("return document.body.scrollHeight")
            for _ in range(Scraper.MAX_SCROLLS):
                # Scroll to the bottom of the page
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

                # Wait for new content to load
                time.sleep(Scraper.SCROLL_PAUSE_TIME)

                # If the document did not grow, no further content was loaded.
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    logger.info("Reached end of content.")
                    break
                last_height = new_height

            soup = BeautifulSoup(driver.page_source, "html.parser")
            logger.info("The data fetched completed from the website....")
            return soup

        except Exception as e:
            # Boundary handler: log with traceback and signal failure via None.
            logger.exception("%s", e)
            return None
        finally:
            driver.quit()

    def get_the_data(self):
        """Return the scraped page as a BeautifulSoup object (None on failure)."""
        return self.__scrape_infinite_scroll()

    def _extract_comments(self, soup, stock_name):
        """Build one record per comment ``div`` found in ``soup``."""
        rows = []
        comment_class = self.element_selector.lstrip(".")
        for comment_element in soup.find_all("div", class_=comment_class):
            # Get the comment
            comment_text = comment_element.get_text(strip=True)

            # Get the author
            author_element = comment_element.find_previous("div", class_=self.cmd_selector)
            if author_element is not None:
                author = author_element.text.strip()
            else:
                author = "Author not found"

            # Get the timestamp
            ts_container = comment_element.find_previous("div", class_=self.timeStamp_selector)
            ts_node = ts_container.find("div") if ts_container is not None else None
            if ts_node is not None:
                # Drop the "schedule" text first, then strip, so no
                # whitespace is left in front of the time.
                raw_timestamp = ts_node.text.replace("schedule", "").strip()
                timestamp_text = self.__convert_time_to_datetime(raw_timestamp)
            else:
                timestamp_text = "Timestamp not found"

            rows.append({
                "Stock": stock_name,
                "Comment": comment_text,
                'Commented By': author,
                "Timestamp": timestamp_text,
            })
        return rows

    def __find_Comments(self, stock_name):
        """Scrape the page and return a list of comment records (dicts).

        Returns an empty list when the page could not be fetched.
        """
        logger.info("Find all the comments in the page....")
        soup = self.__scrape_infinite_scroll()
        if soup is None:
            logger.warning("No page content was fetched; no comments found.")
            return []

        all_comments = self._extract_comments(soup, stock_name)
        logger.info("Successfully grab the Data!!")
        return all_comments

    @staticmethod
    def _build_sorted_frame(rows):
        """Turn comment records into a DataFrame sorted chronologically.

        Timestamps are "DD-MM-YYYY HH:MM" strings, which do not sort
        chronologically as text, so they are parsed for the sort key only.
        Rows whose timestamp is missing or unparseable are placed last.
        An empty ``rows`` yields an empty frame with the expected columns.
        """
        frame = pd.DataFrame(rows, columns=Scraper._COLUMNS)
        return frame.sort_values(
            by="Timestamp",
            key=lambda column: pd.to_datetime(
                column, format=Scraper._OUTPUT_FORMAT, errors="coerce"
            ),
            kind="mergesort",  # stable: ties keep their on-page order
            na_position="last",
        )

    def __Convert_to_DataFrame(self, stock_name):
        """Scrape the comments and return them as a sorted pandas DataFrame."""
        all_comments = self.__find_Comments(stock_name)
        return Scraper._build_sorted_frame(all_comments)

    def show_Comments(self, stock_name):
        """Scrape the comments and write the resulting DataFrame to the log."""
        data = self.__Convert_to_DataFrame(stock_name)
        logger.info("************* The DATA *************")
        logger.info(data)

    def save_Comments(self, stock_name):
        """Scrape the comments and save them to ``comments_<stock_name>.csv``."""
        data = self.__Convert_to_DataFrame(stock_name)
        data.to_csv(f"comments_{stock_name}.csv", index=False)  # BASE_PATH + "comments.csv"
        logger.info("File has been generated successfully!!")
        


In [3]:
def read_config_file(filename):
    """Read a config file and return a dictionary of stock names and URLs.

    Each non-empty line must have the form ``<stock name>: <url>``; the line
    is split at the first ``': '``. Whitespace around the name and the URL is
    removed. Lines without the separator, or with an empty name or URL, are
    reported with a warning and skipped.

    Args:
        filename: Path of the config file.

    Returns:
        dict mapping stock name to URL. Empty when the file does not exist
        (an error message is printed) or contains no valid lines.
    """
    config_data = {}
    try:
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise end up inside the first stock name.
        with open(filename, 'r', encoding='utf-8-sig') as file:
            for raw_line in file:
                line = raw_line.strip()  # Remove leading/trailing whitespace
                if not line:  # Skip empty lines
                    continue
                stock_name, separator, url = line.partition(': ')  # First ': '
                stock_name = stock_name.strip()
                url = url.strip()
                if not separator or not stock_name or not url:
                    print(f"Warning: Invalid line in config file: {line}")
                    continue
                config_data[stock_name] = url
    except FileNotFoundError:
        print(f"Error: Config file '{filename}' not found.")
    return config_data

In [4]:
# Page to scrape: the Moneycontrol message-board thread for Deepak Nitrite.
url = "https://mmb.moneycontrol.com/forum-topics/stocks/deepak-nitrite-4602.html"

In [13]:
# NOTE: element_selector, ts_selector and author_selector are assigned in the
# "Static Parameters" cell further down; that cell has to be executed before
# this one, otherwise the names are undefined.
obj = Scraper(url, element_selector, ts_selector, author_selector)
# obj.save_Comments(stock_name)
# obj.show_Comments("Deepak Nitrite")
# get_the_data() returns the parsed page (a BeautifulSoup object) or None on
# failure -- despite the variable name, this is not a DataFrame.
df = obj.get_the_data()

2025-03-19 19:11:41,286 - INFO - DEVELOPED by : TANAMY MANDAL
2025-03-19 19:11:41,287 - INFO - Scraper has been started....
2025-03-19 19:11:42,915 - INFO - Fetch the data from the website....
2025-03-19 19:12:08,829 - INFO - The data fetched completed from the website....


In [10]:
# Alias the value returned by get_the_data() under the name that
# Scraper's comment-parsing code uses for the parsed page.
elements = df

In [8]:
## Static Parameters
# Name of the config file that read_config_file() is given in the
# commented-out driver loop below.
config_filename = 'config.conf'
# CSS class names handed to Scraper: the comment text div, the div wrapping
# the timestamp, and the div holding the author name.
element_selector = "postItem_text_paragraph__3XhZQ"
ts_selector = "postItem_price1__3ojct"
author_selector = "postItem_heading__2odZU"
# stock_data = read_config_file(config_filename)

# if stock_data:
#     for stock_name, url in stock_data.items():
#         print(f"Stock: {stock_name}, URL: {url}")
#         obj = Scraper(url, element_selector, ts_selector, author_selector)
#         obj.save_Comments(stock_name)
# else:
#     print("No data read from the config file.")

In [34]:
# import tkinter as tk
# from tkinter import ttk, scrolledtext, filedialog, messagebox
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from bs4 import BeautifulSoup
# import time
# import datetime
# import logging

# # Configure logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

# class Scraper:
#     DRIVER = 30
#     SCROLL_PAUSE_TIME = 5  # seconds
#     MAX_SCROLLS = 1
#     CHROME_OPTIONS = "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
#     BASE_PATH = ''  # Base path for saving files
#     DRIVER_LOCATION = '444556454c4f504544206279203a2054414e414d59204d414e44414c'

#     def __init__(self, url, element_selector, ts_selector, author_selector):
#         self.url = url
#         self.element_selector = element_selector  # This is the comment selector
#         self.timeStamp_selector = ts_selector  # This is the timestamp selector
#         self.cmd_selector = author_selector  # This is the author selector
#         self.driver_loc = bytes.fromhex(Scraper.DRIVER_LOCATION).decode('utf-8')
#         logger.info("Scraper has been started....")
#         logger.info(self.driver_loc)

#     def __convert_time_to_datetime(self, time_str):
#         """Converts relative or absolute time string to "DD-MM-YYYY HH:MM" format."""
#         try:
#             if "mins" in time_str or "secs" in time_str:
#                 parts = time_str.split()
#                 minutes = 0
#                 seconds = 0

#                 if "mins" in time_str:
#                     minutes = int(parts[0])
#                 if "secs" in time_str:
#                     if "mins" in time_str:
#                         seconds = int(parts[2])
#                     else:
#                         seconds = int(parts[0])

#                 now = datetime.datetime.now()
#                 time_delta = datetime.timedelta(minutes=minutes, seconds=seconds)
#                 past_time = now - time_delta
#                 return past_time.strftime("%d-%m-%Y %H:%M")

#             else:
#                 time_str = time_str.replace("th", "").replace("st", "").replace("nd", "").replace("rd", "")
#                 try:
#                     dt_object = datetime.datetime.strptime(time_str, "%I:%M %p %b %d %Y")
#                     return dt_object.strftime("%d-%m-%Y %H:%M")
#                 except ValueError:
#                     now = datetime.datetime.now()
#                     dt_object = datetime.datetime.strptime(time_str + " " + str(now.year), "%I:%M %p %b %d %Y")
#                     if dt_object > now:
#                         dt_object = datetime.datetime.strptime(time_str + " " + str(now.year - 1), "%I:%M %p %b %d %Y")
#                     return dt_object.strftime("%d-%m-%Y %H:%M")

#         except ValueError:
#             logger.warning(f"Invalid time string: {time_str}")
#             return None

#     def __scrape_infinite_scroll(self):
#         """Scrapes data from a page with infinite scrolling."""
#         chrome_options = Options()
#         chrome_options.add_argument("--headless")
#         chrome_options.add_argument(Scraper.CHROME_OPTIONS)
#         driver = webdriver.Chrome(options=chrome_options)
#         logger.info("Fetch the data from the website....")
#         try:
#             driver.get(self.url)
#             WebDriverWait(driver, Scraper.DRIVER).until(
#                 EC.presence_of_element_located((By.CSS_SELECTOR, self.element_selector))
#             )

#             scroll_count = 0
#             while scroll_count < Scraper.MAX_SCROLLS:
#                 driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#                 time.sleep(Scraper.SCROLL_PAUSE_TIME)
#                 try:
#                     if not driver.find_elements(By.CSS_SELECTOR, "/html"):
#                         logger.info("Reached end of content.")
#                         break
#                 except:
#                     pass
#                 scroll_count += 1

#             html = driver.page_source
#             soup = BeautifulSoup(html, "html.parser")
#             logger.info("The data fetched completed from the website....")
#             return soup

#         except Exception as e:
#             logger.error(f"{e}")
#             return None
#         finally:
#             driver.quit()

#     def get_the_data(self):
#         ''' Get the data from the website'''
#         return self.__scrape_infinite_scroll()

#     def __find_Comments(self, stock_name):
#         ''' Find All the comments in the page'''
#         logger.info("Find all the comments in the page....")
#         elements = self.__scrape_infinite_scroll()
#         comments = []
#         all_comments = []

#         for comment_element in elements.find_all("div", class_=self.element_selector):
#             comment_text = comment_element.get_text(strip=True)
#             get_ts = comment_element.find_previous("div", class_=self.timeStamp_selector)
#             author = comment_element.find_previous("div", class_=self.cmd_selector).text.strip()
#             if get_ts:
#                 timestamp_text = self.__convert_time_to_datetime(get_ts.find("div").text.strip().replace("schedule", ""))
#             else:
#                 timestamp_text = "Timestamp not found"

#             all_comments.append({"Stock": stock_name, "Comment": comment_text, 'Commented By': author, "Timestamp": timestamp_text})

#         logger.info("Successfully grab the Data!!")
#         return all_comments

#     def __Convert_to_DataFrame(self, stock_name):
#         ''' Convert the comments to a pandas dataframe'''
#         all_comments = self.__find_Comments(stock_name)
#         df = pd.DataFrame(all_comments)
#         df = df.sort_values(by='Timestamp', ascending=True)
#         return df

#     def show_Comments(self, stock_name, text_area):
#         ''' Display all the comments in the page'''
#         data = self.__Convert_to_DataFrame(stock_name)
#         logger.info("************* The DATA *************")
#         logger.info(data)
#         text_area.delete(1.0, tk.END)
#         text_area.insert(tk.END, data.to_string())

#     def save_Comments(self, stock_name):
#         ''' Save the comments to a csv file'''
#         data = self.__Convert_to_DataFrame(stock_name)
#         filepath = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV files", "*.csv")])
#         if filepath:
#             data.to_csv(filepath, index=False)
#             logger.info(f"File has been generated successfully!!")
#             messagebox.showinfo("Success", "File saved successfully!")