In [28]:
import os
import psycopg2

# Database configuration
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

def get_db_connection():
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    return conn

def get_tables_and_fields():
    tables_and_fields = {}
    conn = get_db_connection()
    cur = conn.cursor()
    
    # Get the list of tables
    cur.execute("""
        SELECT table_name 
        FROM information_schema.tables 
        WHERE table_schema = 'public'
    """)
    tables = cur.fetchall()
    
    for table in tables:
        table_name = table[0]
        
        # Get the list of fields for the current table
        cur.execute(f"""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_name = '{table_name}'
        """)
        fields = cur.fetchall()
        field_names = [field[0] for field in fields]
        
        tables_and_fields[table_name] = field_names
    
    cur.close()
    conn.close()
    
    return tables_and_fields

if __name__ == '__main__':
    tables_and_fields = get_tables_and_fields()
    for table, fields in tables_and_fields.items():
        print(f"Table: {table}")
        print(f"Fields: {fields}\n")


Table: monthly_asset_data
Fields: ['month', 'approximated_value', 'average_price', 'asset_name']

Table: monthly_asset_analytics
Fields: ['min_delta_price', 'min_month', 'max_month', 'data_points', 'avg_number', 'median_number', 'min_number', 'max_number', 'slope_number', 'intercept_number', 'max_delta_number', 'min_delta_number', 'avg_price', 'median_price', 'min_price', 'max_price', 'slope_price', 'intercept_price', 'max_delta_price', 'asset_name']

Table: vedomost_ostatkov_na_3_0
Fields: ['balance_statement_os_nma_npa', 'fixed_asset', 'inventory_number', 'okof', 'depreciation_group', 'depreciation_method', 'accounting_date', 'condition', 'useful_life', 'monthly_wear_norm', 'wear', 'balance_value', 'quantity', 'depreciation_amount', 'residual_value', 'account', 'doc_date', 'doc_account']

Table: oborotnaia_vedomost_po_sch_105_za__1
Fields: ['pp', 'inventory_item_number', 'reference_code', 'non_financial_asset_name', 'unit_of_measurement', 'quarter_balance_number_begin', 'quarter_bala

In [26]:
import os
import psycopg2
import pandas as pd
from datetime import datetime, timedelta

# Database configuration
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

def get_db_connection():
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    return conn

def fetch_quarterly_data(table_name):
    conn = get_db_connection()
    query = f"SELECT * FROM {table_name}"
    df = pd.read_sql(query, conn)
    conn.close()
    return df

def create_monthly_table():
    conn = get_db_connection()
    cur = conn.cursor()
    create_table_query = """
    CREATE TABLE IF NOT EXISTS monthly_asset_data (
        asset_name TEXT,
        month DATE,
        approximated_value INT,
        average_price FLOAT
    )
    """
    cur.execute(create_table_query)
    conn.commit()
    cur.close()
    conn.close()

def clean_monthly_table():
    conn = get_db_connection()
    cur = conn.cursor()
    clean_table_query = "TRUNCATE TABLE monthly_asset_data"
    cur.execute(clean_table_query)
    conn.commit()
    cur.close()
    conn.close()

def insert_monthly_data(data):
    conn = get_db_connection()
    cur = conn.cursor()
    insert_query = """
    INSERT INTO monthly_asset_data (asset_name, month, approximated_value, average_price) 
    VALUES (%s, %s, %s, %s)
    """
    cur.executemany(insert_query, data)
    conn.commit()
    cur.close()
    conn.close()

def sum_similar_assets(df, name_column, numeric_columns):
    df[name_column] = df[name_column].str.strip().str.lower()
    # Convert 'NaN' strings to 0 before summing
    for col in numeric_columns:
        df[col] = df[col].replace('NaN', 0).astype(float)
    df_grouped = df.groupby([name_column, 'doc_quarter'])[numeric_columns].sum().reset_index()
    return df_grouped

def calculate_monthly_data(df, name_column, debit_column, credit_column, balance_column, quarter_column, amount_column):
    monthly_data = []
    
    # Map quarter strings to starting months
    quarter_to_month = {
        '1': (1, 3),   # January, February, March
        '2': (4, 6),   # April, May, June
        '3': (7, 9),   # July, August, September
        '4': (10, 12)  # October, November, December
    }
    
    for index, row in df.iterrows():
        asset_name = row[name_column]
        year_quarter = row[quarter_column]
        try:
            year, quarter = year_quarter.split('-Q')
            year = int(year)
            start_month, end_month = quarter_to_month[quarter]
        except (ValueError, KeyError):
            print(f"Invalid quarter format or value: {year_quarter}")
            continue
        
        # Handle NaN values by replacing them with 0
        monthly_debit = (row[debit_column] if row[debit_column] not in [None, ''] else 0) / 3
        monthly_credit = (row[credit_column] if row[credit_column] not in [None, ''] else 0) / 3
        balance_begin = row[balance_column] if row[balance_column] not in [None, ''] else 0
        
        monthly_amount = row[amount_column] / 3 if row[amount_column] not in [None, ''] else 0
        
        for month in range(start_month, end_month + 1):
            month_value = balance_begin + monthly_debit - monthly_credit
            balance_begin = month_value  # update for the next month
            month_date = datetime(year, month, 1)
            average_price = monthly_amount / month_value if month_value != 0 else 0
            monthly_data.append((asset_name, month_date, int(month_value), average_price))
    
    return monthly_data

def main():
    # Fetch quarterly data from both tables
    df_105 = fetch_quarterly_data('oborotnaia_vedomost_po_sch_105_za__1')
    df_3 = fetch_quarterly_data('oborotnosaldovaia_vedomost_po_sch__3')
    
    # Sum similar asset names for both dataframes
    numeric_columns_105 = ['quarter_turnover_debit_number', 'quarter_turnover_credit_number', 'quarter_balance_number_begin', 'quarter_balance_amount_begin']
    df_105 = sum_similar_assets(df_105, 'non_financial_asset_name', numeric_columns_105)
    
    numeric_columns_3 = ['turnover_for_period_number', 'turnover_credit_number', 'balance_start_period_number', 'turnover_for_period_amount']
    df_3 = sum_similar_assets(df_3, 'name', numeric_columns_3)
    
    # Create and clean the monthly data table
    create_monthly_table()
    clean_monthly_table()
    
    # Calculate monthly data for both dataframes
    monthly_data_105 = calculate_monthly_data(df_105, 'non_financial_asset_name', 'quarter_turnover_debit_number', 'quarter_turnover_credit_number', 'quarter_balance_number_begin', 'doc_quarter', 'quarter_balance_amount_begin')
    monthly_data_3 = calculate_monthly_data(df_3, 'name', 'turnover_for_period_number', 'turnover_credit_number', 'balance_start_period_number', 'doc_quarter', 'turnover_for_period_amount')
    
    # Combine monthly data
    combined_monthly_data = monthly_data_105 + monthly_data_3
    
    # Insert monthly data into the database
    insert_monthly_data(combined_monthly_data)

if __name__ == '__main__':
    main()

print("Monthly asset data approximation and insertion complete!")


  df = pd.read_sql(query, conn)


Monthly asset data approximation and insertion complete!


extract analytical data

In [30]:
import os
import numpy as np
import pandas as pd
import psycopg2
from scipy.stats import linregress
from dotenv import load_dotenv
from datetime import datetime
from statistics import median

# Load environment variables from .env file
load_dotenv()

# Database connection details from environment variables
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

# Connect to the database and fetch the data
def fetch_monthly_asset_data():
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    query = "SELECT month, approximated_value, average_price, asset_name FROM monthly_asset_data"
    df = pd.read_sql(query, conn)
    conn.close()
    return df

def calculate_wave_turns(values):
    wave_turns = 0
    if len(values) > 1:
        direction = None
        for i in range(1, len(values)):
            if values[i] > values[i-1] and direction != 'up':
                wave_turns += 1
                direction = 'up'
            elif values[i] < values[i-1] and direction != 'down':
                wave_turns += 1
                direction = 'down'
    return wave_turns

def calculate_analytics(df):
    analytics_data = []
    unique_assets = df['asset_name'].unique()

    for asset in unique_assets:
        asset_data = df[df['asset_name'] == asset]
        months = asset_data['month'].values
        numbers = asset_data['approximated_value'].values
        prices = asset_data['average_price'].values

        if len(months) > 0:
            min_month = min(months)
            max_month = max(months)
            data_points = len(numbers)
            avg_number = float(np.mean(numbers))
            med_number = float(median(numbers))
            min_number = float(np.min(numbers))
            max_number = float(np.max(numbers))
            slope_number, intercept_number, _, _, _ = linregress(range(data_points), numbers)
            max_delta_number = float(np.max(np.abs(np.diff(numbers))))
            min_delta_number = float(np.min(np.abs(np.diff(numbers))))

            avg_price = float(np.mean(prices))
            med_price = float(median(prices))
            min_price = float(np.min(prices))
            max_price = float(np.max(prices))
            slope_price, intercept_price, _, _, _ = linregress(range(data_points), prices)
            max_delta_price = float(np.max(np.abs(np.diff(prices))))
            min_delta_price = float(np.min(np.abs(np.diff(prices))))

            wave_turns_number = calculate_wave_turns(numbers)
            wave_turns_price = calculate_wave_turns(prices)

            analytics_data.append((
                asset, min_month, max_month, data_points,
                avg_number, med_number, min_number, max_number,
                slope_number, intercept_number, max_delta_number, min_delta_number, wave_turns_number,
                avg_price, med_price, min_price, max_price,
                slope_price, intercept_price, max_delta_price, min_delta_price, wave_turns_price
            ))

    return analytics_data

def create_analytics_table():
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    create_table_query = """
    CREATE TABLE IF NOT EXISTS monthly_asset_analytics (
        asset_name TEXT,
        min_month DATE,
        max_month DATE,
        data_points INT,
        avg_number FLOAT, median_number FLOAT, min_number FLOAT, max_number FLOAT,
        slope_number FLOAT, intercept_number FLOAT, max_delta_number FLOAT, min_delta_number FLOAT, wave_turns_number INT,
        avg_price FLOAT, median_price FLOAT, min_price FLOAT, max_price FLOAT,
        slope_price FLOAT, intercept_price FLOAT, max_delta_price FLOAT, min_delta_price FLOAT, wave_turns_price INT
    );
    """
    with conn.cursor() as cur:
        cur.execute(create_table_query)
        cur.execute('TRUNCATE TABLE monthly_asset_analytics;')
    conn.commit()
    conn.close()

def insert_analytics_data(data):
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    cur = conn.cursor()
    columns = [
        'asset_name', 'min_month', 'max_month', 'data_points', 
        'avg_number', 'median_number', 'min_number', 'max_number', 
        'slope_number', 'intercept_number', 'max_delta_number', 'min_delta_number', 'wave_turns_number',
        'avg_price', 'median_price', 'min_price', 'max_price', 
        'slope_price', 'intercept_price', 'max_delta_price', 'min_delta_price', 'wave_turns_price'
    ]
    insert_query = f"""
    INSERT INTO monthly_asset_analytics ({', '.join(columns)})
    VALUES ({', '.join(['%s'] * len(columns))})
    """
    data = [[value if not isinstance(value, (np.int64, np.float64)) else value.item() for value in row] for row in data]
    cur.executemany(insert_query, data)
    conn.commit()
    cur.close()
    conn.close()

def main():
    df = fetch_monthly_asset_data()
    create_analytics_table()
    analytics_data = calculate_analytics(df)
    insert_analytics_data(analytics_data)

if __name__ == '__main__':
    main()

print("Monthly asset analytics calculation and insertion complete!")


  df = pd.read_sql(query, conn)


Monthly asset analytics calculation and insertion complete!


add embeddings

In [49]:
# Ensure we're in the correct environment
import sys
print("Python executable being used:", sys.executable)
print("Python version:", sys.version)

# Your original code with PyTorch and ruBERT model
import os
import pandas as pd
import psycopg2
from transformers import AutoTokenizer, AutoModel
import torch
from dotenv import load_dotenv
from psycopg2.extensions import register_adapter, AsIs
import numpy as np

# Register numpy types to psycopg2
def addapt_numpy_float64(numpy_float64):
    return AsIs(numpy_float64)
def addapt_numpy_int64(numpy_int64):
    return AsIs(numpy_int64)
register_adapter(np.float64, addapt_numpy_float64)
register_adapter(np.int64, addapt_numpy_int64)

# Load environment variables from .env file
load_dotenv()

# Database connection details from environment variables
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

# Verify if PyTorch is available
print("Checking PyTorch availability...")
print(f"PyTorch version: {torch.__version__}")
print(f"Is CUDA available: {torch.cuda.is_available()}")

# Function to load model and tokenizer
def load_model_and_tokenizer():
    try:
        print("Initializing ruBert model and tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
        model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
        print("Model and tokenizer initialized.")
        return tokenizer, model
    except Exception as e:
        print(f"An error occurred while loading the model and tokenizer: {e}")
        return None, None

tokenizer, model = load_model_and_tokenizer()

if tokenizer is None or model is None:
    print("Failed to initialize model and tokenizer. Exiting...")
    exit(1)

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

def add_embedding_column(conn):
    with conn.cursor() as cur:
        cur.execute("ALTER TABLE dictionary ADD COLUMN IF NOT EXISTS embedding BYTEA;")
    conn.commit()

def get_primary_key_column(conn, table_name):
    query = f"""
    SELECT a.attname
    FROM   pg_index i
    JOIN   pg_attribute a ON a.attrelid = i.indrelid
                         AND a.attnum = ANY(i.indkey)
    WHERE  i.indrelid = '{table_name}'::regclass
    AND    i.indisprimary;
    """
    with conn.cursor() as cur:
        cur.execute(query)
        result = cur.fetchone()
    return result[0] if result else None

def update_embeddings(conn):
    primary_key_column = get_primary_key_column(conn, 'dictionary')
    if not primary_key_column:
        print("No primary key found for table 'dictionary'. Exiting...")
        return

    query = f"SELECT {primary_key_column}, nazvanie_ste FROM dictionary;"
    df = pd.read_sql(query, conn)

    for index, row in df.iterrows():
        embedding = get_embedding(row['nazvanie_ste'])
        with conn.cursor() as cur:
            cur.execute(
                f"UPDATE dictionary SET embedding = %s WHERE {primary_key_column} = %s;",
                (psycopg2.Binary(embedding.tobytes()), row[primary_key_column])
            )
    conn.commit()

def main():
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )

    # Add the embedding column
    add_embedding_column(conn)

    # Update embeddings in the dictionary table
    update_embeddings(conn)

    conn.close()

if __name__ == '__main__':
    main()

print("Data upload to PostgreSQL complete!")


Python executable being used: /Users/admin/.pyenv/versions/3.9.9/bin/python
Python version: 3.9.9 (main, Mar  3 2024, 15:43:18) 
[Clang 15.0.0 (clang-1500.1.0.2.5)]
Checking PyTorch availability...
PyTorch version: 2.3.1
Is CUDA available: False
Initializing ruBert model and tokenizer...
An error occurred while loading the model and tokenizer: 
AutoModel requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.

Failed to initialize model and tokenizer. Exiting...
No primary key found for table 'dictionary'. Exiting...
Data upload to PostgreSQL complete!


In [1]:
import os
import time
import pandas as pd
import psycopg2
import torch
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
import numpy as np
import pickle

# Load environment variables from .env file
load_dotenv()

# Database connection details from environment variables
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

# Check if PyTorch is available
def check_pytorch():
    print("Checking PyTorch availability...")
    print("PyTorch version:", torch.__version__)
    print("Is CUDA available:", torch.cuda.is_available())

check_pytorch()

# Initialize the ruBert model and tokenizer
print("Initializing ruBert model and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
    model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    print("ruBert model and tokenizer initialized successfully.")
except Exception as e:
    print("An error occurred while loading the model and tokenizer:", e)
    print("Failed to initialize model and tokenizer. Exiting...")
    exit()

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

def add_embedding_column(conn):
    with conn.cursor() as cur:
        cur.execute("""
            ALTER TABLE dictionary 
            ADD COLUMN IF NOT EXISTS embedding BYTEA;
        """)
    conn.commit()

def update_embeddings(conn):
    df = pd.read_sql("SELECT nazvanie_ste FROM dictionary WHERE nazvanie_ste IS NOT NULL AND nazvanie_ste != '';", conn)
    total_rows = len(df)
    start_time = time.time()

    for index, row in df.iterrows():
        embedding = get_embedding(row['nazvanie_ste'])
        embedding_bytes = pickle.dumps(embedding)
        with conn.cursor() as cur:
            cur.execute("UPDATE dictionary SET embedding = %s WHERE nazvanie_ste = %s;", (psycopg2.Binary(embedding_bytes), row['nazvanie_ste']))
        
        elapsed_time = time.time() - start_time
        rows_processed = index + 1
        remaining_time = (elapsed_time / rows_processed) * (total_rows - rows_processed) if rows_processed > 0 else 0

        print(f"Processed {rows_processed}/{total_rows} rows. Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

    conn.commit()

def main():
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )

    add_embedding_column(conn)
    update_embeddings(conn)

    conn.close()

if __name__ == '__main__':
    main()

print("Data upload to PostgreSQL complete!")


  from .autonotebook import tqdm as notebook_tqdm


Checking PyTorch availability...
PyTorch version: 2.3.1
Is CUDA available: False
Initializing ruBert model and tokenizer...


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  df = pd.read_sql("SELECT nazvanie_ste FROM dictionary WHERE nazvanie_ste IS NOT NULL AND nazvanie_ste != '';",

ruBert model and tokenizer initialized successfully.
Processed 1/1889 rows. Elapsed time: 0.31s, Estimated remaining time: 590.38s
Processed 2/1889 rows. Elapsed time: 0.35s, Estimated remaining time: 328.89s
Processed 3/1889 rows. Elapsed time: 0.39s, Estimated remaining time: 247.01s
Processed 4/1889 rows. Elapsed time: 0.43s, Estimated remaining time: 204.52s
Processed 5/1889 rows. Elapsed time: 0.47s, Estimated remaining time: 178.57s
Processed 6/1889 rows. Elapsed time: 0.51s, Estimated remaining time: 161.19s
Processed 7/1889 rows. Elapsed time: 0.55s, Estimated remaining time: 148.83s
Processed 8/1889 rows. Elapsed time: 0.59s, Estimated remaining time: 139.05s
Processed 9/1889 rows. Elapsed time: 0.63s, Estimated remaining time: 131.16s
Processed 10/1889 rows. Elapsed time: 0.67s, Estimated remaining time: 125.73s
Processed 11/1889 rows. Elapsed time: 0.71s, Estimated remaining time: 121.54s
Processed 12/1889 rows. Elapsed time: 0.75s, Estimated remaining time: 117.94s
Processe

Find closest

In [3]:
import os
import re
import pandas as pd
import psycopg2
import torch
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode

# Load environment variables from .env file
load_dotenv()

# Database connection details from environment variables
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

# Check if PyTorch is available
def check_pytorch():
    print("Checking PyTorch availability...")
    print("PyTorch version:", torch.__version__)
    print("Is CUDA available:", torch.cuda.is_available())

check_pytorch()

# Initialize the ruBert model and tokenizer
print("Initializing ruBert model and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
    model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    print("ruBert model and tokenizer initialized successfully.")
except Exception as e:
    print("An error occurred while loading the model and tokenizer:", e)
    print("Failed to initialize model and tokenizer. Exiting...")
    exit()

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

def find_most_similar(text, top_n=5):
    input_embedding = get_embedding(text)

    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )

    df = pd.read_sql("SELECT nazvanie_ste, embedding FROM dictionary WHERE embedding IS NOT NULL;", conn)

    embeddings = []
    for index, row in df.iterrows():
        stored_embedding = pickle.loads(row['embedding'])
        embeddings.append(stored_embedding)

    # Calculate embedding-based distances
    distances = []
    for stored_embedding in embeddings:
        distance = np.linalg.norm(input_embedding - stored_embedding)
        distances.append(distance)

    df['embedding_distance'] = distances
    top_embedding_similar = df.nsmallest(top_n, 'embedding_distance')

    # Text-based search using TF-IDF and cosine similarity
    vectorizer = TfidfVectorizer().fit(df['nazvanie_ste'])
    tfidf_matrix = vectorizer.transform(df['nazvanie_ste'])
    query_vector = vectorizer.transform([text])

    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    df['text_similarity'] = cosine_similarities
    top_text_similar = df.nlargest(top_n, 'text_similarity')

    # Combine results from both methods and get unique top N
    combined_top = pd.concat([top_embedding_similar, top_text_similar]).drop_duplicates().nlargest(top_n, ['embedding_distance', 'text_similarity'])

    conn.close()

    return combined_top['nazvanie_ste'].tolist()

# Example usage
text_input = "Магнитная доска"
most_similar_rows = find_most_similar(text_input)
print(most_similar_rows)

Checking PyTorch availability...
PyTorch version: 2.3.1
Is CUDA available: False
Initializing ruBert model and tokenizer...


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum lengt

ruBert model and tokenizer initialized successfully.
['Доска магнитно-маркерная настенная 120х180', 'Доска магнитно-маркерная 60х90см', 'Доска магнитно-маркерная 100х150 см', 'Доска\xa0100Х150 поворотная', 'Brauberg Доска пробковая 60 х 90 см']


Adding similar fields from dictionary

In [2]:
import os
import re
import pandas as pd
import psycopg2
import torch
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode
import time
from sqlalchemy import create_engine

# Load environment variables from .env file
load_dotenv()

# Database connection details from environment variables
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

# SQLAlchemy engine
db_url = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(db_url)

# Check if PyTorch is available
def check_pytorch():
    print("Checking PyTorch availability...")
    print("PyTorch version:", torch.__version__)
    print("Is CUDA available:", torch.cuda.is_available())

check_pytorch()

# Initialize the ruBert model and tokenizer
print("Initializing ruBert model and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
    model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    print("ruBert model and tokenizer initialized successfully.")
except Exception as e:
    print("An error occurred while loading the model and tokenizer:", e)
    print("Failed to initialize model and tokenizer. Exiting...")
    exit()

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

def find_most_similar(text, top_n=5):
    input_embedding = get_embedding(text)

    df = pd.read_sql("SELECT nazvanie_ste, embedding FROM dictionary WHERE embedding IS NOT NULL;", engine)

    embeddings = []
    for index, row in df.iterrows():
        stored_embedding = pickle.loads(row['embedding'])
        embeddings.append(stored_embedding)

    # Calculate embedding-based distances
    distances = []
    for stored_embedding in embeddings:
        distance = np.linalg.norm(input_embedding - stored_embedding)
        distances.append(distance)

    df['embedding_distance'] = distances
    top_embedding_similar = df.nsmallest(top_n, 'embedding_distance')

    # Text-based search using TF-IDF and cosine similarity
    vectorizer = TfidfVectorizer().fit(df['nazvanie_ste'])
    tfidf_matrix = vectorizer.transform(df['nazvanie_ste'])
    query_vector = vectorizer.transform([text])

    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    df['text_similarity'] = cosine_similarities
    top_text_similar = df.nlargest(top_n, 'text_similarity')

    # Combine results from both methods and get unique top N
    combined_top = pd.concat([top_embedding_similar, top_text_similar]).drop_duplicates().nlargest(top_n, ['embedding_distance', 'text_similarity'])

    return combined_top['nazvanie_ste'].tolist()

def update_monthly_asset_analytics():
    # Connect to the database using psycopg2
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    cur = conn.cursor()

    # Add new column 'dictionary_name' to the table if it doesn't exist
    cur.execute("ALTER TABLE monthly_asset_analytics ADD COLUMN IF NOT EXISTS dictionary_name TEXT")
    conn.commit()

    # Fetch asset names from monthly_asset_analytics
    cur.execute("SELECT asset_name FROM monthly_asset_analytics")
    rows = cur.fetchall()
    total_rows = len(rows)
    start_time = time.time()

    for index, row in enumerate(rows):
        asset_name = row[0]
        most_similar = find_most_similar(asset_name, top_n=1)
        dictionary_name = most_similar[0] if most_similar else None
        
        if dictionary_name:
            cur.execute(
                "UPDATE monthly_asset_analytics SET dictionary_name = %s WHERE asset_name = %s",
                (dictionary_name, asset_name)
            )
            conn.commit()

        elapsed_time = time.time() - start_time
        rows_processed = index + 1
        remaining_time = (elapsed_time / rows_processed) * (total_rows - rows_processed) if rows_processed > 0 else 0

        print(f"Processed {rows_processed}/{total_rows} rows. Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

    cur.close()
    conn.close()

# Example usage
text_input = "Магнитная доска"
most_similar_rows = find_most_similar(text_input)
print(most_similar_rows)

update_monthly_asset_analytics()
print("Monthly asset analytics table updated with dictionary names.")


Checking PyTorch availability...
PyTorch version: 2.3.1
Is CUDA available: False
Initializing ruBert model and tokenizer...


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum lengt

ruBert model and tokenizer initialized successfully.
['Доска магнитно-маркерная настенная 120х180', 'Доска магнитно-маркерная 60х90см', 'Доска магнитно-маркерная 100х150 см', 'Доска\xa0100Х150 поворотная', 'Brauberg Доска пробковая 60 х 90 см']
Processed 1/611 rows. Elapsed time: 0.15s, Estimated remaining time: 94.55s
Processed 2/611 rows. Elapsed time: 0.29s, Estimated remaining time: 89.09s
Processed 3/611 rows. Elapsed time: 0.43s, Estimated remaining time: 87.24s
Processed 4/611 rows. Elapsed time: 0.57s, Estimated remaining time: 86.00s
Processed 5/611 rows. Elapsed time: 0.70s, Estimated remaining time: 85.13s
Processed 6/611 rows. Elapsed time: 0.84s, Estimated remaining time: 84.49s
Processed 7/611 rows. Elapsed time: 0.98s, Estimated remaining time: 84.46s
Processed 8/611 rows. Elapsed time: 1.19s, Estimated remaining time: 89.97s
Processed 9/611 rows. Elapsed time: 1.33s, Estimated remaining time: 89.12s
Processed 10/611 rows. Elapsed time: 1.47s, Estimated remaining time: 8

Getting better results using LLM

In [4]:
import os
import re
import pandas as pd
import psycopg2
import torch
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode
import time
import requests
from sqlalchemy import create_engine

# Load environment variables from .env file
load_dotenv()

# Database connection details from environment variables
db_name = os.getenv('DB_NAME', 'moshack')
db_user = os.getenv('DB_USER', 'postgres')
db_password = os.getenv('DB_PASSWORD', 'postgres')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')

# SQLAlchemy engine
db_url = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(db_url)

# Check if PyTorch is available
def check_pytorch():
    print("Checking PyTorch availability...")
    print("PyTorch version:", torch.__version__)
    print("Is CUDA available:", torch.cuda.is_available())

check_pytorch()

# Initialize the ruBert model and tokenizer
print("Initializing ruBert model and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
    model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    print("ruBert model and tokenizer initialized successfully.")
except Exception as e:
    print("An error occurred while loading the model and tokenizer:", e)
    print("Failed to initialize model and tokenizer. Exiting...")
    exit()

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

def find_most_similar(text, top_n=20):
    input_embedding = get_embedding(text)

    df = pd.read_sql("SELECT nazvanie_ste, embedding FROM dictionary WHERE embedding IS NOT NULL;", engine)

    embeddings = []
    for index, row in df.iterrows():
        stored_embedding = pickle.loads(row['embedding'])
        embeddings.append(stored_embedding)

    # Calculate embedding-based distances
    distances = []
    for stored_embedding in embeddings:
        distance = np.linalg.norm(input_embedding - stored_embedding)
        distances.append(distance)

    df['embedding_distance'] = distances
    top_embedding_similar = df.nsmallest(top_n, 'embedding_distance')

    # Text-based search using TF-IDF and cosine similarity
    vectorizer = TfidfVectorizer().fit(df['nazvanie_ste'])
    tfidf_matrix = vectorizer.transform(df['nazvanie_ste'])
    query_vector = vectorizer.transform([text])

    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    df['text_similarity'] = cosine_similarities
    top_text_similar = df.nlargest(top_n, 'text_similarity')

    # Combine results from both methods and get unique top N
    combined_top = pd.concat([top_embedding_similar, top_text_similar]).drop_duplicates().nlargest(top_n, ['embedding_distance', 'text_similarity'])

    return combined_top['nazvanie_ste'].tolist()

def get_best_match_llm(text, candidates):
    url = os.getenv('LLAMA_API_URL')
    token = os.getenv('LLAMA_API_ACCESS_TOKEN')
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    prompt = f"""Given the item '{text}', select the most similar item from the following list and return only the item name without any additional text:
{candidates}
"""

    payload = {"text": prompt}
    response = requests.post(f"{url}/interact", json=payload, headers=headers)

    print(f"Request URL: {url}/interact")
    print(f"Request Headers: {headers}")
    print(f"Request Payload: {payload}")

    if response.status_code == 200:
        result = response.json().get('response')
        # Extract the item name from the response (assuming it is in the last line)
        best_match = result.strip().split('\n')[-1].strip()
        return best_match
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

def update_monthly_asset_analytics():
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    cur = conn.cursor()

    # Add new columns to the table if they don't exist
    cur.execute("ALTER TABLE monthly_asset_analytics ADD COLUMN IF NOT EXISTS dictionary_name TEXT")
    cur.execute("ALTER TABLE monthly_asset_analytics ADD COLUMN IF NOT EXISTS dictionary_llm_output TEXT")
    conn.commit()

    # Fetch asset names from monthly_asset_analytics where dictionary_llm_output is empty
    cur.execute("SELECT asset_name FROM monthly_asset_analytics WHERE dictionary_llm_output IS NULL OR dictionary_llm_output = ''")
    rows = cur.fetchall()
    total_rows = len(rows)
    start_time = time.time()

    mismatches = []

    for index, row in enumerate(rows):
        asset_name = row[0]
        most_similar_list = find_most_similar(asset_name, top_n=20)
        best_match = get_best_match_llm(asset_name, most_similar_list)
        
        if best_match:
            best_match_lower = best_match.lower()
            most_similar_lower = [item.lower() for item in most_similar_list]
            
            # Find the exact match in the original list
            matched_item = None
            for item in most_similar_list:
                if item.lower() == best_match_lower:
                    matched_item = item
                    break

            if matched_item:
                cur.execute(
                    "UPDATE monthly_asset_analytics SET dictionary_name = %s, dictionary_llm_output = %s WHERE asset_name = %s",
                    (matched_item, best_match, asset_name)
                )
                conn.commit()
            else:
                mismatches.append((asset_name, best_match))
                cur.execute(
                    "UPDATE monthly_asset_analytics SET dictionary_llm_output = %s WHERE asset_name = %s",
                    (best_match, asset_name)
                )
                conn.commit()

        elapsed_time = time.time() - start_time
        rows_processed = index + 1
        remaining_time = (elapsed_time / rows_processed) * (total_rows - rows_processed) if rows_processed > 0 else 0

        print(f"Processed {rows_processed}/{total_rows} rows. Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")
        print(f"Asset Name: {asset_name}, LLM Output: {best_match}, Matched: {matched_item}")

    cur.close()
    conn.close()

    print("Mismatches found:")
    for mismatch in mismatches:
        print(f"Asset Name: {mismatch[0]}, LLM Output: {mismatch[1]}")

# Example usage
update_monthly_asset_analytics()
print("Monthly asset analytics table updated with dictionary names.")


Checking PyTorch availability...
PyTorch version: 2.3.1
Is CUDA available: False
Initializing ruBert model and tokenizer...


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum lengt

ruBert model and tokenizer initialized successfully.
Request URL: http://127.0.0.1:5001/interact
Request Headers: {'Authorization': 'Bearer your_secure_token', 'Content-Type': 'application/json'}
Request Payload: {'text': "Given the item 'фонарь фпс 4/6 пм в комплекте с азу', select the most similar item from the following list and return only the item name without any additional text:\n['ПЛАТФОРМЕННАЯ ТЕЛЕЖКА С РЕЗИНОВЫМ ПОКРЫТИЕМ ТПРН 2', 'Настенный кронштейн Digis DSM-8043', 'Аккумуляторный фонарь 3W LED ТРОФИ TSP3W', 'Трудовая книжка', 'Замок', 'Сейф мебельный Сейф (к1) VALBERG ASM-120TEL мебельный, эл.замок', 'Холодильник INDESIT ST 167, общий объем 300 л, верхняя морозильная камера 53 л, 60x66,5x167 см, белый', 'Налобный светодиодный фонарь Light Hot с магнитом, регулировка угла свечения и диммированием COB светильника, встроенный аккумулятор, заряд USB', 'Сетевой фильтр DEFENDER ES, 5 розеток, 5 м, белый', 'Кресло офисное Бюрократ CH-868AXSN экокожа черное', 'Удлинитель РАДИСТ П