In [46]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from database_test import Item, TrueItem, Seller
from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras import regularizers

import re

import random
from datetime import datetime, timedelta

In [47]:
DATABASE_URL = 'postgresql://postgres:postgres@localhost:5432/OZON_parse'
engine = create_engine(DATABASE_URL)
Session = sessionmaker(bind=engine)
session = Session()

In [48]:
true_items = session.query(TrueItem).all()
all_items = session.query(Item).all()
fake_items = session.query(TrueItem).limit(200).all()

In [49]:
def orm_to_df(items):
    records = []
    for item in items:
        records.append({
            "id": item.id,
            "title": item.title,
            "url": item.url,
            "price": item.price,
            "description": item.description,
            "year": item.year,
            "pages_count": item.pages_count,
            "circulation": item.circulation,
            "seller_id": item.seller_id,
            "seller_orders": item.seller.orders if item.seller else None,
            "seller_avg_item_rate": item.seller.avg_item_rate if item.seller else None,
            "seller_region": item.seller.region if item.seller else None,
            "days_to_deliver": item.delivery_days if hasattr(item, 'delivery_days') else None,
            "seller_age": (datetime.now() - item.seller.reg_date).days if item.seller and item.seller.reg_date else None,
            "warehouse_type": item.warehouse_type
        })
    return pd.DataFrame(records)

In [120]:
df_true = orm_to_df(true_items)
df_unknown = orm_to_df(all_items)
df_fake = orm_to_df(fake_items)

In [121]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import joblib

model = BertForSequenceClassification.from_pretrained("saved_model")
tokenizer = BertTokenizer.from_pretrained("saved_model")

label_encoder = joblib.load("saved_model/label_encoder.pkl")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def predict_book_type(title: str):
    inputs = tokenizer(title, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
    
    return label_encoder.inverse_transform([predicted_class_id])[0]

In [122]:
df_true["book_type"] = df_true["title"].apply(lambda x: predict_book_type(x))
df_unknown["book_type"] = df_unknown["title"].apply(lambda x: predict_book_type(x))

In [123]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v1') 

# Encode titles to embeddings
train_embeddings = model.encode(df_true['title'], convert_to_tensor=True).cpu().numpy()
test_embeddings = model.encode(df_unknown['title'], convert_to_tensor=True).cpu().numpy()

# Store embeddings as lists in one column
df_true["embedding"] = list(train_embeddings)
df_unknown["embedding"] = list(test_embeddings)

df_true["class"] = df_true["title"].apply(lambda x: re.findall(r"\d+", x)[0] if len(re.findall(r"\d+", x)) > 0 else 0)
df_unknown["class"] = df_unknown["title"].apply(lambda x: re.findall(r"\d+", x)[0] if len(re.findall(r"\d+", x)) > 0 else 0)

embeddings_array = np.stack(df_true['embedding'].values)  # shape: (N, 512)
embedding_dim = embeddings_array.shape[1]
embedding_df = pd.DataFrame(embeddings_array, columns=[f'emb_{i}' for i in range(embedding_dim)])

df_true = pd.concat([
    df_true.drop(columns=['embedding']),
    embedding_df
], axis=1)

embeddings_array = np.stack(df_unknown['embedding'].values)  # shape: (N, 512)
embedding_dim = embeddings_array.shape[1]
embedding_df = pd.DataFrame(embeddings_array, columns=[f'emb_{i}' for i in range(embedding_dim)])

df_unknown = pd.concat([
    df_unknown.drop(columns=['embedding']),
    embedding_df
], axis=1)

In [124]:
# Добавляем one-hot encoding к тренировочным данным
df_true = pd.get_dummies(df_true, columns=["book_type"])

# Добавляем one-hot encoding к тестовым данным
df_unknown = pd.get_dummies(df_unknown, columns=["book_type"])

# (Опционально) Выровнять колонки теста и трейна, если у них разные классы:
df_unknown = df_unknown.reindex(columns=df_true.columns, fill_value=0)


In [125]:
n_clusters = 3000
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df_true['cluster'] = kmeans.fit_predict(df_true.drop(columns=[
    "id", "title", "url", "price", "description", "year", "pages_count",
    "circulation", "seller_id", "seller_orders", "seller_avg_item_rate",
    "seller_region", "days_to_deliver", "seller_age", "warehouse_type",
]))

df_true[['title', 'cluster', "price"]][df_true['cluster'] == 1]

Unnamed: 0,title,cluster,price
2732,История. Всеобщая история. 1914 - 1945 годы. 1...,1,745
2956,История. Всеобщая история. 1914 - 1945 годы. 1...,1,737


In [126]:
avg_cluster = {}

df_unknown['cluster'] = kmeans.predict(df_unknown.drop(columns=[
    "id", "title", "url", "price", "description", "year", "pages_count",
    "circulation", "seller_id", "seller_orders", "seller_avg_item_rate",
    "seller_region", "days_to_deliver", "seller_age", "warehouse_type",
]))

In [127]:
import joblib

joblib.dump(kmeans, "saved_model/clustering_model.joblib")

['saved_model/clustering_model.joblib']

In [128]:
avg_book_type = {}

for i in df_true.columns:
    if "book_type" in i:
        avg_book_type[i.replace("book_type_", "")] = df_true[df_true[i] == 1]["price"].mean()

pd.DataFrame([avg_book_type]).to_csv("saved_model/avg_book_type.csv", index=False)

In [129]:
avg_book_type

{'диагностические_материалы': np.float64(414.91210613598673),
 'контурные_карты': np.float64(307.1574074074074),
 'прописи': np.float64(325.6666666666667),
 'рабочая_тетрадь': np.float64(480.71149897330594),
 'учебник': np.float64(1585.161995898838)}

In [130]:
# df_unknown["true_price"] = df_unknown["cluster"].apply(lambda x: df_true[df_true["cluster"] == x]["price"].mean())
# df_true["true_price"] = df_true["cluster"].apply(lambda x: df_true[df_true["cluster"] == x]["price"].mean())

for i in df_true.columns:
    if "book_type" in i:
        df_unknown["true_price"] = df_unknown[i].apply(lambda x: avg_book_type[i.replace("book_type_", "")])
        df_true["true_price"] = df_true[i].apply(lambda x: avg_book_type[i.replace("book_type_", "")])

In [131]:
df_unknown["diff"] = (df_unknown["true_price"] - df_unknown["price"]) / df_unknown["true_price"]
df_true["diff"] = (df_true["true_price"] - df_true["price"]) / df_true["true_price"]

In [132]:
columns_to_fill = [
    "year", "pages_count", "seller_orders", "seller_avg_item_rate",
    "days_to_deliver", "seller_age", "class"
]

for c in columns_to_fill:
    df_true[c] = df_true[c].fillna(0)
    df_unknown[c] = df_unknown[c].fillna(0)

  df_true[c] = df_true[c].fillna(0)
  df_unknown[c] = df_unknown[c].fillna(0)


In [133]:
from sklearn.preprocessing import StandardScaler

df_true = pd.get_dummies(df_true, columns=["warehouse_type"])
df_unknown = pd.get_dummies(df_unknown, columns=["warehouse_type"])
df_unknown = df_unknown.reindex(columns=df_true.columns, fill_value=0)


df_true = df_true.drop(
    columns=[
        "id", "title", "url", "price", "description",
        "circulation", "seller_id", "seller_region"
    ]
)

df_unknown = df_unknown.drop(
    columns=[
        "id", "title", "url", "price", "description",
        "circulation", "seller_id", "seller_region"
    ]
)

df_true.columns = df_true.columns.astype(str)
df_unknown.columns = df_unknown.columns.astype(str)

In [134]:
# Масштабирование
scaler = StandardScaler()
X_real_scaled = scaler.fit_transform(df_true)
X_all_scaled = scaler.transform(df_unknown)

In [135]:
from sklearn.svm import OneClassSVM

ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.05)  # nu — уровень "аномалий"
ocsvm.fit(X_real_scaled)

In [28]:
for i in list(df_true.columns):
    print(i, end=", ")

id, title, url, price, description, year, pages_count, circulation, seller_id, seller_orders, seller_avg_item_rate, seller_region, days_to_deliver, seller_age, warehouse_type, book_type, cluster, true_price, diff, 

In [136]:
preds = ocsvm.predict(X_all_scaled)  # 1 = норм, -1 = фейк/аномалия

df_unknown["anomaly_label"] = preds

In [138]:
df_unknown.to_csv('test.csv')

In [139]:
df_unknown[df_unknown["anomaly_label"] == -1]

Unnamed: 0,year,pages_count,seller_orders,seller_avg_item_rate,days_to_deliver,seller_age,class,emb_0,emb_1,emb_2,...,book_type_контурные_карты,book_type_прописи,book_type_рабочая_тетрадь,book_type_учебник,cluster,true_price,diff,warehouse_type_fbs,warehouse_type_ozon,anomaly_label
5,0.0,368.0,212.0,3.0,0,213.0,0,0.046510,-0.076799,-0.008660,...,False,False,True,False,1197,1585.161996,-0.226373,True,False,-1
7,2023.0,320.0,150000.0,4.9,0,1126.0,0,0.007544,0.024672,0.079122,...,False,False,False,False,526,1585.161996,0.743244,False,True,-1
8,0.0,96.0,460.0,5.0,0,61.0,9,0.025206,0.008362,0.047997,...,False,False,False,True,954,1585.161996,0.659341,True,False,-1
11,0.0,80.0,460.0,5.0,0,61.0,2,0.024475,0.033238,0.009539,...,False,False,True,False,331,1585.161996,0.241087,True,False,-1
19,2019.0,192.0,706.0,4.5,0,335.0,7,-0.021964,0.051056,0.049067,...,False,False,False,True,896,1585.161996,-0.539275,True,False,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13067,0.0,95.0,460.0,5.0,0,61.0,3,0.034182,0.079745,0.052261,...,False,False,False,False,1824,1585.161996,0.515507,True,False,-1
13069,0.0,80.0,89.0,0.0,0,61.0,6,0.011050,0.028481,0.014603,...,False,False,True,False,710,1585.161996,-0.190415,True,False,-1
13070,1973.0,0.0,48300.0,4.9,0,761.0,0,-0.050001,0.078269,-0.022265,...,False,False,False,True,465,1585.161996,0.900326,True,False,-1
13073,2024.0,47.0,0.0,0.0,0,0.0,0,0.018535,0.041612,-0.034527,...,False,False,False,False,711,1585.161996,0.764693,False,True,-1


In [140]:
import joblib
joblib.dump(ocsvm, "saved_model/one_class_svm_test.joblib")
joblib.dump(scaler, "saved_model/standard_scaler.joblib")

['saved_model/standard_scaler.joblib']

In [81]:
df_unknown_old = df_unknown.copy()

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import pandas as pd

# Выбор только настоящих книг (например, те, где 'book_type' известен)
train_X = df_true.copy()

# Векторизация заголовков
model = SentenceTransformer("ai-forever/sbert_large_nlu_ru")
train_X['title_vector'] = model.encode(train_X['title'].tolist()).tolist()

# Извлекаем числовые и категориальные признаки
features = pd.DataFrame(train_X['title_vector'].tolist())
numeric = train_X[['price', 'year', 'pages_count', 'circulation', 'seller_orders', 'seller_avg_item_rate', 'days_to_deliver', 'seller_age', 'true_price']]
features = pd.concat([features, numeric.reset_index(drop=True)], axis=1)

# Масштабирование
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Обучение Isolation Forest
clf = IsolationForest(contamination=0.1, random_state=42)
clf.fit(X)

In [82]:
df_fake_items = df_fake.copy()

df_fake_items["price"] = df_fake_items["price"].apply(lambda x: x * random.randint(50, 95) / 100)
df_fake_items["seller_age"] = df_fake_items["seller_age"].apply(lambda x: random.randint(5, 365))
df_fake_items["seller_orders"] = df_fake_items["seller_orders"].apply(lambda x: random.randint(200, 100000))
df_fake_items["seller_avg_item_rate"] = df_fake_items["seller_avg_item_rate"].apply(lambda x: round(random.uniform(2, 5), 2))

In [83]:
text_features = ["title", "description"]
numeric_features = ["price", "year", "pages_count", "seller_orders", "seller_avg_item_rate", "days_to_deliver", "seller_age"]
categorical_features = ["seller_region", "warehouse_type"]

In [84]:
df_true["target"] = 1
df_fake_items["target"] = 0

df_all = pd.concat([df_true, df_fake_items])

In [85]:
df_all = df_all.drop(columns=["id", "url", "circulation", "seller_id"])
df_unknown = df_unknown.drop(columns=["id", "url", "circulation", "seller_id"])

In [86]:
for col in numeric_features:
    df_unknown[col] = df_unknown[col].fillna(df_unknown[col].mean())
    df_all[col] = df_all[col].fillna(df_all[col].mean())
for col in categorical_features:
    df_unknown[col] = df_unknown[col].fillna("unknown")
    df_all[col] = df_all[col].fillna("unknown")
    

  df_unknown[col] = df_unknown[col].fillna(df_unknown[col].mean())
  df_all[col] = df_all[col].fillna(df_all[col].mean())


In [87]:
df_unknown["title"].fillna("", inplace=True)
df_unknown["description"].fillna("", inplace=True)
df_all["title"].fillna("", inplace=True)
df_all["description"].fillna("", inplace=True)

text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=100))
])

preprocessor = ColumnTransformer([
    ("num", RobustScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

X_unknown_text = text_pipeline.fit_transform(df_unknown["title"] + " " + df_unknown["description"])
X_all_text = text_pipeline.transform(df_all["title"] + " " + df_all["description"])

X_unknown = preprocessor.fit_transform(df_unknown)
X_all = preprocessor.transform(df_all)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_unknown["title"].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_unknown["description"].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [88]:
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=100))
])

preprocessor = ColumnTransformer([
    ("num", RobustScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

X_unknown_text = text_pipeline.fit_transform(df_unknown["title"] + " " + df_unknown["description"])
X_all_text = text_pipeline.transform(df_all["title"] + " " + df_all["description"])

X_unknown = preprocessor.fit_transform(df_unknown.drop(columns=["target"]))
X_all = preprocessor.transform(df_all.drop(columns=["target"]))

KeyError: "['target'] not found in axis"

In [None]:
from scipy.sparse import hstack
X_unknown_combined = hstack([X_unknown, X_unknown_text])
X_all_combined = hstack([X_all, X_all_text])

In [89]:
X_unknown_combined = X_unknown_combined.toarray()
X_all_combined = X_all_combined.toarray()

# Replace any remaining NaN values with 0
X_unknown_combined = np.nan_to_num(X_unknown_combined, nan=0)
X_all_combined = np.nan_to_num(X_all_combined, nan=0)

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [93]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model = SVC()

model.fit(X_all_combined, df_all["target"])

predicted = model.predict(X_all_combined)

print(accuracy_score(df_all["target"], predicted))

predicted = model.predict(X_unknown_combined)

for index, target in enumerate(predicted):
    if target == 0:
        print(df_unknown_old["title"].values[index], df_unknown_old["url"].values[index])

0.9959525874530211
Вдох-выдох: Восстановите здоровье, перезагрузите разум и станьте счастливыми с помощью дыхания /product/vdoh-vydoh-vosstanovite-zdorove-perezagruzite-razum-i-stante-schastlivymi-s-pomoshchyu-dyhaniya-1888310768/
Информатика. 3 класс. Учебник. В двух частях. Часть 2 /product/informatika-3-klass-uchebnik-v-dvuh-chastyah-chast-2-1782413825/
Технология. 7 класс. Учебник /product/tehnologiya-7-klass-uchebnik-1783595633/
Примерные рабочие программы по учебным предметам и коррекционным курсам НОО слабослышащих и позднооглохших /product/primernye-rabochie-programmy-po-uchebnym-predmetam-i-korrektsionnym-kursam-noo-slaboslyshashchih-1782414159/
Новая история. 1800-1918. 8 класс -арт.65754 /product/novaya-istoriya-1800-1918-8-klass-art-65754-1851071268/
Окружающий мир. Народы России: дорога дружбы. Ярмарка мастеров России. 3 класс. Учебник /product/okruzhayushchiy-mir-narody-rossii-doroga-druzhby-yarmarka-masterov-rossii-3-klass-uchebnik-1782414109/
Литературное чтение. 4 клас

In [31]:
X_true_combined = X_true_combined.toarray()
X_all_combined = X_all_combined.toarray()

# Replace any remaining NaN values with 0
X_true_combined = np.nan_to_num(X_true_combined, nan=0)
X_all_combined = np.nan_to_num(X_all_combined, nan=0)

# Build and train autoencoder
input_dim = X_true_combined.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
encoded = Dense(encoding_dim, activation='relu')(x)

x = Dense(128, activation='relu')(encoded)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
decoded = Dense(input_dim, activation='linear')(x)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train the model
autoencoder.fit(
    X_true_combined, 
    X_true_combined,
    epochs=50,
    batch_size=32,
    shuffle=True,
    validation_split=0.2
)

Epoch 1/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 10179529728.0000 - val_loss: 1.9150
Epoch 2/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9772133376.0000 - val_loss: 8.4933
Epoch 3/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9596265472.0000 - val_loss: 15.1635
Epoch 4/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9842004992.0000 - val_loss: 5.5289
Epoch 5/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9721989120.0000 - val_loss: 0.4088
Epoch 6/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9587584000.0000 - val_loss: 0.1188
Epoch 7/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 10465498112.0000 - val_loss: 1.1246
Epoch 8/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9503616000.0000 

<keras.src.callbacks.history.History at 0x1711d00bd40>

In [32]:
encoder = Model(inputs=input_layer, outputs=encoded)
X_all_encoded = encoder.predict(X_all_combined)
X_true_encoded = encoder.predict(X_true_combined)

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 693us/step


In [33]:
kmeans = KMeans(n_clusters=10, random_state=42)
labels = kmeans.fit_predict(X_all_encoded)

df_all["cluster"] = labels

In [34]:
distances = euclidean_distances(X_all_encoded, X_true_encoded)
df_all["min_distance_to_true"] = distances.min(axis=1)

threshold = df_all["min_distance_to_true"].quantile(0.9)
df_all["is_suspicious"] = df_all["min_distance_to_true"] > threshold

In [37]:
output_cols = ["title", "price", "cluster", "min_distance_to_true", "is_suspicious"]
df_all[output_cols].to_csv("suspicious_items.csv", index=False)
print("Готово! Файл сохранён как suspicious_items.csv")

Готово! Файл сохранён как suspicious_items.csv
