In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from database_test import Item, TrueItem, Seller
from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras import regularizers

In [28]:
DATABASE_URL = 'postgresql://postgres:postgres@localhost:5432/OZON_parse'
engine = create_engine(DATABASE_URL)
Session = sessionmaker(bind=engine)
session = Session()

In [29]:
true_items = session.query(TrueItem).all()
all_items = session.query(Item).all()

In [30]:
def orm_to_df(items):
    records = []
    for item in items:
        records.append({
            "id": item.id,
            "title": item.title,
            "url": item.url,
            "price": item.price,
            "description": item.description,
            "year": item.year,
            "pages_count": item.pages_count,
            "circulation": item.circulation,
            "seller_id": item.seller_id,
            "seller_orders": item.seller.orders if item.seller else None,
            "seller_avg_item_rate": item.seller.avg_item_rate if item.seller else None,
            "seller_region": item.seller.region if item.seller else None,
            "days_to_deliver": item.delivery_days if hasattr(item, 'delivery_days') else None,
            "seller_age": (datetime.now() - item.seller.reg_date).days if item.seller and item.seller.reg_date else None
        })
    return pd.DataFrame(records)

In [31]:
df_true = orm_to_df(true_items)
df_all = orm_to_df(all_items)

In [32]:
text_features = ["title", "description"]
numeric_features = ["price", "year", "pages_count", "circulation", "seller_orders", "seller_avg_item_rate", "days_to_deliver", "seller_age"]
categorical_features = ["seller_region"]

In [33]:
for col in numeric_features:
    df_true[col] = df_true[col].fillna(df_true[col].mean())
    df_all[col] = df_all[col].fillna(df_true[col].mean())
for col in categorical_features:
    df_true[col] = df_true[col].fillna("unknown")
    df_all[col] = df_all[col].fillna("unknown")
    

  df_true[col] = df_true[col].fillna(df_true[col].mean())
  df_all[col] = df_all[col].fillna(df_true[col].mean())


In [34]:
df_true["title"].fillna("", inplace=True)
df_true["description"].fillna("", inplace=True)
df_all["title"].fillna("", inplace=True)
df_all["description"].fillna("", inplace=True)

text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=100))
])

preprocessor = ColumnTransformer([
    ("num", RobustScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

X_true_text = text_pipeline.fit_transform(df_true["title"] + " " + df_true["description"])
X_all_text = text_pipeline.transform(df_all["title"] + " " + df_all["description"])

X_true = preprocessor.fit_transform(df_true)
X_all = preprocessor.transform(df_all)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_true["title"].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_true["description"].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [35]:
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=100))
])

preprocessor = ColumnTransformer([
    ("num", RobustScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

X_true_text = text_pipeline.fit_transform(df_true["title"] + " " + df_true["description"])
X_all_text = text_pipeline.transform(df_all["title"] + " " + df_all["description"])

X_true = preprocessor.fit_transform(df_true)
X_all = preprocessor.transform(df_all)

  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(


In [36]:
from scipy.sparse import hstack
X_true_combined = hstack([X_true, X_true_text])
X_all_combined = hstack([X_all, X_all_text])

In [37]:
X_true_combined = X_true_combined.toarray()
X_all_combined = X_all_combined.toarray()

# Replace any remaining NaN values with 0
X_true_combined = np.nan_to_num(X_true_combined, nan=0)
X_all_combined = np.nan_to_num(X_all_combined, nan=0)

# Build and train autoencoder
input_dim = X_all_combined.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
encoded = Dense(encoding_dim, activation='relu')(x)

x = Dense(128, activation='relu')(encoded)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
decoded = Dense(input_dim, activation='linear')(x)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train the model
autoencoder.fit(
    X_all_combined, 
    X_all_combined,
    epochs=50,
    batch_size=32,
    shuffle=True,
    validation_split=0.2
)

Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 11687853056.0000 - val_loss: 12935940096.0000
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12240025600.0000 - val_loss: 12933917696.0000
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 11820708864.0000 - val_loss: 12931887104.0000
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12007697408.0000 - val_loss: 12929532928.0000
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 11839026176.0000 - val_loss: 12926611456.0000
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 11864115200.0000 - val_loss: 12923287552.0000
Epoch 7/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 11747193856.0000 - val_loss: 12919537664.0000
Epoch 8/50
[1m125/1

<keras.src.callbacks.history.History at 0x1ee8bffb8c0>

In [38]:
encoder = Model(inputs=input_layer, outputs=encoded)
X_all_encoded = encoder.predict(X_all_combined)
X_true_encoded = encoder.predict(X_true_combined)

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 936us/step
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734us/step


In [39]:
kmeans = KMeans(n_clusters=10, random_state=42)
labels = kmeans.fit_predict(X_all_encoded)

df_all["cluster"] = labels

In [40]:
distances = euclidean_distances(X_all_encoded, X_true_encoded)
df_all["min_distance_to_true"] = distances.min(axis=1)

threshold = df_all["min_distance_to_true"].quantile(0.9)
df_all["is_suspicious"] = df_all["min_distance_to_true"] > threshold

In [41]:
output_cols = ["id", "title", "url", "price", "seller_id", "cluster", "min_distance_to_true", "is_suspicious"]
df_all[output_cols].to_csv("suspicious_items.csv", index=False)
print("Готово! Файл сохранён как suspicious_items.csv")

Готово! Файл сохранён как suspicious_items.csv
