In [None]:
import os
from datetime import datetime, timedelta  # For working with dates
import time
import pandas as pd  # For working with DataFrames
from dotenv import load_dotenv
from openpyxl import load_workbook
from openpyxl.styles import NamedStyle, PatternFill, Border, Side
from openpyxl.utils import get_column_letter
from sqlalchemy import create_engine  # For creating the database engine

##################### LOADING IMPORTANT DATA ######################
# Load environment variables from the .env file.
# NOTE(review): this is an absolute, machine-specific path.
env_file_path = r'D:/Projects/.env'
load_dotenv(env_file_path)
# Output file name
output_file_path = 'MONTHLY.xlsx'
# The lookup tables live on separate sheets of 'promotion.xlsx'. Passing a
# list of sheet names makes pandas open and parse the workbook once and return
# a {sheet_name: DataFrame} dict, instead of re-reading the file per sheet.
# NOTE(review): this is an absolute, machine-specific path as well.
promotion_path = r'D:\Projects\promotion.xlsx'
promotion_sheets = pd.read_excel(
    promotion_path,
    sheet_name=['Region', 'Aksiya', 'Paket', 'TYPES'],
)
region_df = promotion_sheets['Region']
aksiya_df = promotion_sheets['Aksiya']
paket_df = promotion_sheets['Paket']
types_df = promotion_sheets['TYPES']
##################### ACCESS ENV VARIABLES ######################
# Database connection settings. Each value is None when its variable is unset.
db_server = os.environ.get("DB_SERVER")
db_database = os.environ.get("DB_DATABASE_ASKGLOBAL")
db_user = os.environ.get("DB_USER")
db_password = os.environ.get("DB_PASSWORD")
db_port = os.environ.get("DB_PORT")
db_driver_name = os.environ.get("DB_DRIVER_NAME")

##################### PROCEDURE NAME ######################
# Name of the procedure to execute, read from the MONTHLY environment variable
procedure_name = os.environ.get("MONTHLY")

# Date range passed to the procedure, rendered as YYYYMMDD strings.
# NOTE: despite its name, `tomorrow_date` is a hard-coded 2024-03-31.
start_date = f"{datetime(2024, 2, 1):%Y%m%d}"
tomorrow_date = f"{datetime(2024, 3, 31):%Y%m%d}"
##################### CONNECTION STRING AND SQL QUERY ######################
# Fail fast when a required setting is missing. os.getenv() returns None for an
# unset variable, and None would otherwise be formatted below as the literal
# text "None" (inside the connection URL, or as `EXEC None`).
required_settings = {
    "DB_SERVER": db_server,
    "DB_DATABASE_ASKGLOBAL": db_database,
    "DB_USER": db_user,
    "DB_PASSWORD": db_password,
    "DB_PORT": db_port,
    "DB_DRIVER_NAME": db_driver_name,
    "MONTHLY": procedure_name,
}
missing_settings = [name for name, value in required_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Construct the connection string.
# NOTE(review): the values are inserted verbatim. Credentials or a driver name
# containing URL-reserved characters (e.g. '@', ':', '/') must already be
# URL-encoded in the .env file -- confirm, or build the URL with
# sqlalchemy.engine.URL.create instead.
conn_str = f"mssql+pyodbc://{db_user}:{db_password}@{db_server}:{db_port}/{db_database}?driver={db_driver_name}"
engine = create_engine(conn_str)

# The procedure name comes from configuration and is formatted into the
# statement text; the two dates are bound through the `?` placeholders.
# NOTE(review): the procedure arguments are spelled @DataBegin/@DataEnd while
# the local variables are @DateBegin/@DateEnd -- confirm this matches the
# procedure's signature.
sql_query: str = f"""
DECLARE @DateBegin DATE = ?;
DECLARE @DateEnd DATE = ?;

EXEC {procedure_name}
@DataBegin = @DateBegin,
@DataEnd = @DateEnd;
"""

#####################  EXECUTION  ######################
# Run the procedure for [start_date, tomorrow_date] and load the rows into `df`
df = pd.read_sql_query(sql_query, engine, params=(start_date, tomorrow_date))

In [None]:

# Keep only the rows whose DocName is one of the three listed document types
wanted_doc_names = ['Оптовая реализация', 'Финансовая скидка', 'Возврат товара от покупателя']
doc_name_mask = df['DocName'].isin(wanted_doc_names)
df = df[doc_name_mask]

In [None]:
# Remove the 'Postavshik' column by reassignment instead of inplace=True:
# `df` is a filtered selection produced by an earlier cell, and mutating such a
# frame in place can trigger pandas' SettingWithCopyWarning.
# errors='ignore' keeps the cell re-runnable once the column is already gone
# (a second in-place drop would raise KeyError).
df = df.drop(columns='Postavshik', errors='ignore')

In [None]:
# Restrict to rows whose DataEntered falls in February 2024
entered_on = df['DataEntered'].dt
in_feb_2024 = (entered_on.year == 2024) & (entered_on.month == 2)
df = df[in_feb_2024]

In [None]:
# Attach lookup columns with left joins, so every row of `df` is kept.
# Region: df.ClientManager is matched against region_df.ClientMan.
df = df.merge(
    region_df[['ClientMan', 'Region']],
    how='left',
    left_on='ClientManager',
    right_on='ClientMan',
)

# Aksiya and Paket are both looked up by Goodid.
for lookup_df, value_column in ((aksiya_df, 'Aksiya'), (paket_df, 'Paket')):
    df = df.merge(lookup_df[['Goodid', value_column]], how='left', on='Goodid')



In [None]:
# Distinct INN values present in `df` (an array of values, not a DataFrame)
dfx = df['INN'].unique()
dfx

In [None]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# Search endpoint read (as a global) by the requests-based `process_inn` below.
# NOTE: `url` is rebound to a different site by the Selenium cell later in this
# notebook, so the value seen at call time depends on which cells have run.
url = "https://orginfo.uz/en/search/organizations/"
# inns = [inn for inn in df.INN.unique()[:30]]

def _append_output(text):
    """Append ``text`` to output.txt (UTF-8) using a single write call."""
    with open("output.txt", "a", encoding="utf-8") as output_file:
        output_file.write(text)


def _format_organization(organization_soup):
    """Render one parsed organization page as the text record kept in output.txt."""
    name_element = organization_soup.select_one('.h1-seo')
    # select_one() returns None when the heading is absent; record a
    # placeholder instead of failing with AttributeError.
    organization_name = name_element.text.strip() if name_element else "(name not found)"

    record_lines = [f"Organization Name: {organization_name}", "Details:"]
    for row in organization_soup.select('.row.border-bottom.py-3'):
        span_elements = row.find_all('span')
        # A detail row needs at least a label span and a value span
        if len(span_elements) >= 2:
            key = span_elements[0].text.strip()
            value = span_elements[1].text.strip()
            record_lines.append(f"{key} | {value}")

    return "\n".join(record_lines) + "\n\n====================\n"


def process_inn(inn, timeout=15):
    """Search orginfo.uz for one INN and append the findings to ``output.txt``.

    The search page at the global ``url`` is requested with ``q=<inn>``. Every
    ``a.og-card`` link in the response is followed, and the organization name
    plus its key/value detail rows are appended to ``output.txt`` as one text
    record. Failures (network errors, timeouts, non-200 responses) are written
    to the same file rather than raised, so one bad lookup does not get lost
    or abort the others when this runs through ``ThreadPoolExecutor.map``.

    NOTE(review): a later cell of this notebook defines another
    ``process_inn`` and rebinds ``url``; once that cell has run, this version
    is no longer reachable under this name.

    Args:
        inn: Value sent as the ``q`` search parameter.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. All results are written to ``output.txt``.
    """
    params = {'q': inn}
    try:
        # Without a timeout a stalled connection would block the worker forever
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        _append_output(f"Failed to retrieve information for INN {inn}. Error: {exc}\n")
        return

    if response.status_code != 200:
        _append_output(f"Failed to retrieve information for INN {inn}. Status Code: {response.status_code}\n")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for result in soup.find_all('a', class_='og-card'):
        organization_link = result.get('href')
        if not organization_link:
            # Card without a link target: nothing to follow
            continue
        organization_url = f"https://orginfo.uz{organization_link}"

        try:
            organization_response = requests.get(organization_url, timeout=timeout)
        except requests.RequestException as exc:
            _append_output(f"Failed to retrieve information for organization at {organization_url}. Error: {exc}\n")
            continue

        if organization_response.status_code == 200:
            organization_soup = BeautifulSoup(organization_response.text, 'html.parser')
            _append_output(_format_organization(organization_soup))
        else:
            _append_output(f"Failed to retrieve information for organization at {organization_url}. Status Code: {organization_response.status_code}\n")

# Use ThreadPoolExecutor to run multiple threads concurrently
# with ThreadPoolExecutor() as executor:
#     executor.map(process_inn, inns)

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# `dfx` holds the distinct INN values collected in an earlier cell;
# materialise it as a plain list for the worker pool.
inn_list = list(dfx)

# Entry form, and the URL the browser is expected to reach after submitting it
url = "https://registr.stat.uz/ru/"
result_url = "https://registr.stat.uz/ru/result/"

# Headless Chrome configuration used by process_inn
chrome_options = Options()
for chrome_flag in (
    '--headless',               # run Chrome without a GUI
    '--disable-gpu',            # disable GPU acceleration
    '--window-size=1920x1080',  # window size for the headless session
):
    chrome_options.add_argument(chrome_flag)

# Starts empty; reassigned with the scraped rows when the worker pool runs
result_data = []

def _parse_result_text(result_text):
    """Turn the result block's text into a dict of ``key: value`` pairs.

    Each line is split at its FIRST colon only, so a value that itself
    contains ':' is kept whole. Lines without a colon are skipped.
    """
    parsed = {}
    for line in result_text.split('\n'):
        key, separator, value = line.partition(':')
        if separator:
            parsed[key.strip()] = value.strip()
    return parsed


# Function to process a single inn value
def process_inn(inn):
    """Look up one INN on registr.stat.uz with headless Chrome.

    Opens the form at the global ``url``, types the INN into the field named
    "OKPO", ticks the checkbox, submits, waits up to 10 seconds for the browser
    to reach ``result_url``, and parses the ``div#demo2`` block of the result
    page into key/value pairs.

    Args:
        inn: The INN to look up. It is converted with ``str()`` before typing.

    Returns:
        dict: Always contains ``'INN'``. On success it also holds the parsed
        key/value pairs; otherwise a ``'Result'`` entry describes what went
        wrong. Errors are reported in the dict, not raised, so one failing
        lookup does not abort a batch run.
    """
    result_dict = {'INN': inn}
    # Stays None when the browser cannot be started, so `finally` has nothing
    # to close (previously this raised UnboundLocalError and hid the real error).
    driver = None

    try:
        # Set up the browser with headless mode
        driver = webdriver.Chrome(options=chrome_options)  # You need to have chromedriver installed

        driver.get(url)

        # Fill the form on the main page
        inn_input = driver.find_element(By.NAME, "OKPO")  # it is written in the site as OKPO
        # Pass text explicitly: values taken from a NumPy array may be NumPy
        # scalars rather than str.
        inn_input.send_keys(str(inn))

        checkbox = driver.find_element(By.XPATH, "//input[@type='checkbox']")
        checkbox.click()

        # Submit the form
        submit_button = driver.find_element(By.NAME, "submit")
        submit_button.click()

        # Wait for the result page to load
        WebDriverWait(driver, 10).until(EC.url_to_be(result_url))

        # Process the result page content using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        result_div = soup.find('div', {'id': 'demo2'})
        if result_div:
            result_dict.update(_parse_result_text(result_div.text))
        else:
            result_dict['Result'] = 'Result div not found on the page'

    except Exception as e:
        result_dict['Result'] = f"Error processing INN {inn}: {str(e)}"

    finally:
        if driver is not None:
            driver.quit()  # Close the browser window

    return result_dict

# Look up every INN on a thread pool; map() returns results in input order
with ThreadPoolExecutor() as pool:
    result_data = [*pool.map(process_inn, inn_list)]

# Build a table from the list of result dictionaries
result_df = pd.DataFrame(result_data)

# Write the table to an Excel workbook without the index column
result_df.to_excel(excel_writer='result_output.xlsx', index=False)