In [62]:
from sqlalchemy import create_engine, String, ARRAY, ForeignKey
from sqlalchemy.orm import declarative_base, sessionmaker, Mapped, mapped_column, relationship

import spacy
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from datetime import datetime

In [2]:
# installation
# python -m spacy download ru_core_news_md

# nlp = spacy.load("ru_core_news_lg") # 489 mb
nlp = spacy.load("ru_core_news_md") # 39 mb

In [3]:
Base = declarative_base()

In [4]:
class TrueItem(Base):
    __tablename__ = "education_seller"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    paper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    seller_id: Mapped[int] = mapped_column(
        ForeignKey("seller.id")
    )
    days_to_deliver: Mapped[int]
    
    seller = relationship("Seller", back_populates="true_items")
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "paper_type": self.paper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author,
            "seller_id": self.seller_id,
            "seller_reg_date": self.seller.reg_date,
            "seller_orders": self.seller.orders,
            "seller_avg_item_rate": self.seller.avg_item_rate,
            "seller_region": self.seller.region,
        }


class Item(Base):
    __tablename__ = "item"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    paper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    seller_id: Mapped[int] = mapped_column(
        ForeignKey("seller.id")
    )
    days_to_deliver: Mapped[int]
    
    seller = relationship("Seller", back_populates="items")
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "paper_type": self.paper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author,
            "seller_id": self.seller_id,
            "seller_reg_date": self.seller.reg_date,
            "seller_orders": self.seller.orders,
            "seller_avg_item_rate": self.seller.avg_item_rate,
            "seller_region": self.seller.region,
        }
    

class Seller(Base):
    __tablename__ = "seller"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    reg_date: Mapped[datetime]
    orders: Mapped[int]
    avg_item_rate: Mapped[float]
    region: Mapped[str]
    
    items = relationship("Item", back_populates="seller")
    true_items = relationship("TrueItem", back_populates="seller")

In [5]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/OZON_parse')
Session = sessionmaker(engine, expire_on_commit=True)

In [6]:
def preprocessing_text(string: str) -> str:
    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
        )
    )
    
    return string

In [7]:
def preprocessing_description(string: str) -> str:
    string = re.sub(r'\d+', '', string)
    string = re.sub(r'\b[^а-яА-ЯёЁ\s]+\b', '', string)
    
    string_list = string.split("Автор на обложке")
    if len(string_list) > 1:
        string = string_list[0]
    else:
        string = ""

    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
            and (len(str(token)) > 3)
        )
    )
    
    return string

In [None]:
"""
maybe fake sellers

2146622
2146631
2146646

"""

''

In [18]:
with Session() as session:
    true_items = session.query(TrueItem).filter(TrueItem.seller_id != None).all()
    df_true_items = pd.DataFrame([item.dict() for item in true_items])    

    unknown_items = session.query(Item).filter(Item.seller_id != None).all()
    df_unknown_items = pd.DataFrame([item.dict() for item in unknown_items])  

In [20]:
df_unknown_items.columns

Index(['id', 'title', 'url', 'price', 'image', 'description', 'year',
       'paper_type', 'preview_type', 'book_type', 'pages_count', 'circulation',
       'isbn', 'class_', 'subject', 'original_name', 'author', 'seller_id',
       'seller_reg_date', 'seller_orders', 'seller_avg_item_rate',
       'seller_region'],
      dtype='object')

In [None]:
all_authors = []
all_authors_s = set()

for authors_list in df_true_items["author"]:
    if authors_list:
        for author in authors_list:
            if author not in all_authors_s:
                all_authors.append(author)
                all_authors_s.add(author)

del all_authors_s

In [24]:
df_true_items["description"] = df_true_items["description"].fillna("")
df_unknown_items["description"] = df_unknown_items["description"].fillna("")

df_true_items["class_"] = df_true_items["class_"].fillna(0)
df_unknown_items["class_"] = df_unknown_items["class_"].fillna(0)

median_year = df_true_items["year"].median()
df_true_items["year"] = df_true_items["year"].fillna(median_year)
median_year = df_unknown_items["year"].median()
df_unknown_items["year"] = df_unknown_items["year"].fillna(median_year)

df_true_items["paper_type"] = df_true_items["paper_type"].fillna("")
df_unknown_items["paper_type"] = df_unknown_items["paper_type"].fillna("")

df_true_items["preview_type"] = df_true_items["preview_type"].fillna("")
df_unknown_items["preview_type"] = df_unknown_items["preview_type"].fillna("")

df_true_items["book_type"] = df_true_items["book_type"].fillna("")
df_unknown_items["book_type"] = df_unknown_items["book_type"].fillna("")

df_true_items["seller_orders"] = df_true_items["seller_orders"].fillna(0)
df_unknown_items["seller_orders"] = df_unknown_items["seller_orders"].fillna(0)

df_true_items["seller_avg_item_rate"] = df_true_items["seller_avg_item_rate"].fillna(0)
df_unknown_items["seller_avg_item_rate"] = df_unknown_items["seller_avg_item_rate"].fillna(0)

df_true_items["days_reg_ago"] = df_true_items["seller_reg_date"].apply(lambda date: (datetime.now() - date).days)
df_unknown_items["days_reg_ago"] = df_unknown_items["seller_reg_date"].apply(lambda date: (datetime.now() - date).days)

avg_pages_count = df_true_items["pages_count"].mean()
df_true_items["pages_count"] = df_true_items["pages_count"].fillna(avg_pages_count)
avg_pages_count = df_unknown_items["pages_count"].mean()
df_unknown_items["pages_count"] = df_unknown_items["pages_count"].fillna(avg_pages_count)

In [25]:
df_true_items["title"] = df_true_items["title"].apply(preprocessing_text)
df_unknown_items["title"] = df_unknown_items["title"].apply(preprocessing_text)

In [26]:
df_true_items["description"] = df_true_items["description"].apply(preprocessing_description)
df_unknown_items["description"] = df_unknown_items["description"].apply(preprocessing_description)

In [27]:
df_true_items_1 = df_true_items.copy()
df_unknown_items_1 = df_unknown_items.copy()

In [28]:
df_true_items = df_true_items_1.copy()
df_unknown_items = df_unknown_items_1.copy()

In [30]:
# TF-IDF for titles

all_true_titles = ' '.join(df_true_items["title"])
all_unknown_titles = ' '.join(df_unknown_items["title"])

tfidf_titles = TfidfVectorizer(max_features=50)
transformed = tfidf_titles.fit_transform([all_true_titles, all_unknown_titles])

In [31]:
df = pd.DataFrame(
  transformed[0].T.todense(),
  index=tfidf_titles.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_titles = df

In [32]:
# TF-IDF for descriptions

all_true_descriptions = ' '.join(df_true_items["description"])
all_unknown_descriptions = ' '.join(df_unknown_items["description"])

tfidf_descriptions = TfidfVectorizer(max_features=200)
transformed_descriptions = tfidf_descriptions.fit_transform([all_true_descriptions, all_unknown_descriptions])

In [33]:
df = pd.DataFrame(
  transformed_descriptions[0].T.todense(),
  index=tfidf_descriptions.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_descriptions =  df

In [34]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_true_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_true_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "title"] = " ".join(final_words)

In [35]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_unknown_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_unknown_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "title"] = " ".join(final_words)

In [36]:
# True items
# Векторизация всех описаний сразу
tfidf_matrix = tfidf_descriptions.transform(df_true_items["description"])

# Преобразование разреженной матрицы в плотную (если необходимо)
tfidf_dense = tfidf_matrix.toarray()

# Добавление векторов в DataFrame
df_true_items["description_vector"] = list(tfidf_dense)


# Unknown items
tfidf_matrix = tfidf_descriptions.transform(df_unknown_items["description"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["description_vector"] = list(tfidf_dense)

In [37]:
# True items
tfidf_matrix = tfidf_titles.transform(df_true_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_true_items["title_vector"] = list(tfidf_dense)

# Unknown items
tfidf_matrix = tfidf_titles.transform(df_unknown_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["title_vector"] = list(tfidf_dense)

In [50]:
paper_types = list(set(np.concatenate([df_true_items["paper_type"].unique(), df_unknown_items["paper_type"].unique()])))
book_types = list(set(np.concatenate([df_true_items["book_type"].unique(), df_unknown_items["book_type"].unique()])))
preview_types = list(set(np.concatenate([df_true_items["preview_type"].unique(), df_unknown_items["preview_type"].unique()])))
regions = list(set(np.concatenate([df_true_items["seller_region"].unique(), df_unknown_items["seller_region"].unique()])))

In [51]:
paper_types_label_encoder = LabelEncoder()
encoded_paper_types = paper_types_label_encoder.fit_transform(paper_types)

book_types_label_encoder = LabelEncoder()
encoded_book_types = book_types_label_encoder.fit_transform(book_types)

preview_types_label_encoder = LabelEncoder()
encoded_preview_types = preview_types_label_encoder.fit_transform(preview_types)

regions_label_encoder = LabelEncoder()
encoded_regions = regions_label_encoder.fit_transform(regions)

In [52]:
df_true_items["paper_type"] = paper_types_label_encoder.transform(df_true_items["paper_type"])
df_true_items["book_type"] = book_types_label_encoder.transform(df_true_items["book_type"])
df_true_items["preview_type"] = preview_types_label_encoder.transform(df_true_items["preview_type"])
df_true_items["seller_region"] = regions_label_encoder.transform(df_true_items["seller_region"])

In [53]:
df_unknown_items["paper_type"] = paper_types_label_encoder.transform(df_unknown_items["paper_type"])
df_unknown_items["book_type"] = book_types_label_encoder.transform(df_unknown_items["book_type"])
df_unknown_items["preview_type"] = preview_types_label_encoder.transform(df_unknown_items["preview_type"])
df_unknown_items["seller_region"] = regions_label_encoder.transform(df_unknown_items["seller_region"])

In [54]:
df_true_items.head()

Unnamed: 0,id,title,url,price,image,description,year,paper_type,preview_type,book_type,...,original_name,author,seller_id,seller_reg_date,seller_orders,seller_avg_item_rate,seller_region,days_reg_ago,description_vector,title_vector
0,8,русский язык задание класс школа россия,/product/russkiy-yazyk-letnie-zadaniya-perehod...,185,15416e32-b929-4e7a-98df-67061d0d2231,,2024.0,10,4,4,...,Русский язык. Летние задания. Переходим во 2-й...,[Никишенкова Александра Викторовна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,10,математика рабочий тетрадь класс часть фгос,/product/matematika-rabochaya-tetrad-1-klass-c...,227,544f263e-0d55-439f-b24d-f7cd6660ff4e,,2025.0,10,4,4,...,Математика. Рабочая тетрадь. 1 класс. В 2-х ч....,"[Моро Мария Игнатьевна, Волкова Светлана Ивано...",207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,11,мир атлас,/product/okruzhayushchiy-mir-priroda-i-chelove...,233,0ed91455-16ae-4bdc-89b6-b97a6a9ac3ee,,2025.0,7,4,4,...,,[Сивоглазов Владислав Иванович],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.7071067811865475, ..."
3,12,английский язык грамматический класс английски...,/product/angliyskiy-yazyk-grammaticheskiy-tren...,204,80fdc53b-6431-4ba4-a6e9-3a396b7c4eda,,2025.0,10,4,4,...,,[Юшина Дарья Геннадьевна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.8164965809277261, 0.0, ..."
4,14,русский язык рабочий тетрадь класс фгос школа ...,/product/russkiy-yazyk-rabochaya-tetrad-1-klas...,272,9732a425-31e4-4dc5-9220-f843838fc56c,,2025.0,10,4,4,...,Русский язык. Рабочая тетрадь. 1 класс,[Канакина Валентина Павловна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [110]:
X_positive = df_true_items.drop(columns=[
    "id", "title", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn", "seller_reg_date", "seller_id", "url"
])

X_unlabeled = df_unknown_items.drop(columns=[
    "id", "title", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn", "seller_reg_date", "seller_id", "url"
])

In [112]:
X_positive

Unnamed: 0,price,year,paper_type,preview_type,book_type,pages_count,class_,seller_orders,seller_avg_item_rate,seller_region,days_reg_ago,description_vector,title_vector
0,185,2024.0,10,4,4,80.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,227,2025.0,10,4,4,48.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,233,2025.0,7,4,4,40.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.7071067811865475, ..."
3,204,2025.0,10,4,4,80.0,2.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.8164965809277261, 0.0, ..."
4,272,2025.0,10,4,4,64.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,1203,2025.0,10,4,4,160.0,4.0,1200000,4.9,0,1096,"[0.2182178902359924, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2186,872,2025.0,10,2,4,96.0,4.0,1200000,4.9,0,1096,"[0.0, 0.20412414523193154, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.4472135954999579, 0.0, ..."
2187,892,2025.0,10,4,4,176.0,2.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2188,231,2024.0,10,4,4,64.0,4.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16439898...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [113]:
for i in range(200):
    X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["description_vector"])

for i in range(50):
    X_positive[f"title_vector_{i}"] = X_positive["title_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["title_vector"])

for i in range(200):
    X_unlabeled[f"description_vector_{i}"] = X_unlabeled["description_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["description_vector"])

for i in range(50):
    X_unlabeled[f"title_vector_{i}"] = X_unlabeled["title_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["title_vector"])

  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vec

In [38]:
X_positive

Unnamed: 0,price,year,papper_type,preview_type,book_type,pages_count,class_,description_vector_0,description_vector_1,description_vector_2,...,title_vector_40,title_vector_41,title_vector_42,title_vector_43,title_vector_44,title_vector_45,title_vector_46,title_vector_47,title_vector_48,title_vector_49
0,289,2024.0,11,4,4,112.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.377964,0.000000,0.000000,0.0,0.377964,0.377964
1,119,2024.0,11,4,4,16.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,164,2024.0,11,4,4,48.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.447214,0.000000
3,185,2024.0,11,4,4,80.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.408248,0.408248
4,227,2025.0,11,4,4,48.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.408248,0.408248,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,3932,2023.0,11,0,4,208.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.447214,0.000000,0.447214,0.0,0.000000,0.000000
2186,755,2025.0,0,2,4,120.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.377964,0.0,0.377964,0.000000,0.000000,0.0,0.000000,0.755929
2187,3932,2023.0,11,0,4,208.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.447214,0.000000,0.447214,0.0,0.000000,0.000000
2188,393,2023.0,11,4,4,144.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.408248,0.0,0.000000,0.408248,0.000000,0.0,0.000000,0.000000


In [39]:
# X_positive = pd.concat([X_positive, X_unlabeled], ignore_index=True)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(
    X_positive, [1] * len(X_positive), test_size=0.2, random_state=324
)

In [59]:
from sklearn.ensemble import IsolationForest

In [121]:
from sklearn.neighbors import LocalOutlierFactor

model = LocalOutlierFactor(novelty=True)
y = model.fit(X_train)

y = model.predict(X_unlabeled)

for i in range(len(y)):
    if y[i] == -1:
        print("www.ozon.ru" + df_unknown_items.loc[df_unknown_items.index[i], "url"])

# y = model.predict(X_test)

www.ozon.ru/product/angliyskiy-yazyk-sbornik-uprazhneniy-2-klass-fgos-angliyskiy-v-fokuse-bykova-nadezhda-879638932/?at=DqtDYORL2I6xRowWiVDZp9nCnJw4NlFzNGXQqfrgzONN
www.ozon.ru/product/chitayu-i-pishu-rabochaya-tetrad-k-knige-azbuka-moy-pervyy-uchebnik-chast-2-fgos-do-ignateva-489440165/?at=28t05EN2qT2YovVKHW0X916F0G9o6NikEGQqySDMmOAK
www.ozon.ru/product/volshebnye-linii-rabochaya-tetrad-dlya-podgotovki-k-shkole-chast-2-ilyuhina-vera-alekseevna-654253290/?at=qQtJY7pyAc577XzhJprn8AIA687REizNMQrWUR0Jg60
www.ozon.ru/product/matematika-rabochaya-tetrad-1-klass-chast-2-fgos-moro-mariya-ignatevna-volkova-svetlana-ivanovna-801993274/?at=PjtJz571YcApqpZktvn06NfrY3QV3HoM0wAPTV78kxo
www.ozon.ru/product/russkiy-yazyk-rabochaya-tetrad-1-klass-fgos-shkola-rossii-kanakina-valentina-pavlovna-802040542/?at=gpt4EYL1XF5NvpVwuAK3775cj44lEvS1YMq5giGJox9X
www.ozon.ru/product/koty-voiteli-tsikl-voiteli-stan-dikim-hanter-erin-285879805/?at=K8tZ7rjygTo3JMwRfQoGBgjU6ZyP8MSZnqQnWtE4rQxO
www.ozon.ru/product/russ



In [None]:
# Случайный вектор
# Кластеризация
# Перекрёстная энтропия

# Векторизация данных
# Анализ отклонений


In [117]:
def analyze_with_isolation_forest(X_positive, X_unlabeled):
    """
    Perform anomaly detection using Isolation Forest on the dataset.
    
    :param X_positive: DataFrame of known legitimate items (positive class).
    :param X_unlabeled: DataFrame of unknown items to analyze.
    :return: Tuple of (model, predictions on unlabeled data).
    """
    # Create labels for positive data
    y_positive = [1] * len(X_positive)
    
    # Split positive data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_positive, y_positive, test_size=0.2, random_state=42
    )
    
    # Initialize Isolation Forest model
    model = IsolationForest(contamination=0.1, random_state=42)
    
    # Fit model on training data
    model.fit(X_train)
    
    # Predict on test data
    y_pred_test = model.predict(X_test)
    
    # Calculate accuracy on test data
    accuracy = accuracy_score(y_test, y_pred_test)
    print(f"Isolation Forest accuracy on test data: {accuracy}")
    
    # Predict anomalies on unlabeled data
    y_pred_unlabeled = model.predict(X_unlabeled)
    
    return model, y_pred_unlabeled


In [120]:
model, y = analyze_with_isolation_forest(X_positive, X_unlabeled)

for i in range(len(y)):
    if y[i] == -1:
        print("www.ozon.ru" + df_unknown_items.loc[df_unknown_items.index[i], "url"])

Isolation Forest accuracy on test data: 0.8926940639269406
www.ozon.ru/product/istoriya-drevnego-mira-konturnye-karty-5-klass-drubachevskaya-irina-leonidovna-ukolova-555697073/?at=ywtAOylPvsRWYoqLuEP9y5PsVlpBY2tMNZVDjIJqn896
www.ozon.ru/product/istoriya-novogo-vremeni-konturnye-karty-9-klass-1875435192/?at=w0tglRDEMcx0yAD0UKnK9vlCJvgKgWFVG2L0kI7AXgER
www.ozon.ru/product/russkiy-yazyk-3-y-klass-v-2-h-chastyah-ch-1-1783886995/?at=6WtZLGn62TE4loYXHOW4BgLtkpgjB8CKZAxrxsqMY8x8
www.ozon.ru/product/okruzhayushchiy-mir-3-klass-uchebnik-chast-2-808847824/?at=z6tOWm0Ylc4GMEMQHNWO8oBi8oOLOGTA5LNxWiPL1jM
www.ozon.ru/product/literaturnoe-chtenie-dnevnik-chitatelya-3-klass-fgos-boykina-m-v-bubnova-i-a-1271624244/?at=DqtDYORL2IvB0jEmcEBEojmikQJpYWIRWn52RTjJZRNZ
www.ozon.ru/product/igry-i-razvlecheniya-v-gruppe-prodlennogo-dnya-1784136517/?at=vQtrwyDnmuY2PXoRiPo9vX9uyrPO5ghkxnJ6rh8w0kME
www.ozon.ru/product/tablichnoe-umnozhenie-i-delenie-2-3-klass-840833625/?at=jYtZoy74LTZ23qgVC4JRNOzh79lJn2fvnx3vqiYg

In [69]:
model = IsolationForest()
model.fit(X_train, y_train)
y = model.predict(X_test)

accuracy_score(y, y_test)


1.0

In [76]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score

model = OneClassSVM(kernel="poly")
model.fit(X_train, y_train)

predicted = model.predict(X_test)

accuracy_score(y_test, predicted)

0.4703196347031963

In [108]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

X_summary = pd.concat([X_positive, X_unlabeled], ignore_index=True)

linked = linkage(X_summary.drop(columns=["url"]), method='ward')

num_clusters = 2
clusters = fcluster(linked, num_clusters, criterion='maxclust')

# 5. Вывод результатов
print("Принадлежность объектов к кластерам:")
for i, cluster in enumerate(clusters):
    if cluster == 1:
        print("www.ozon.ru" + X_summary.loc[X_summary.index[i], "url"])

Принадлежность объектов к кластерам:
www.ozon.ru/product/chitayu-i-pishu-rabochaya-tetrad-k-knige-azbuka-moy-pervyy-uchebnik-chast-2-fgos-do-ignateva-489440165/?at=28t05EN2qT2YovVKHW0X916F0G9o6NikEGQqySDMmOAK
www.ozon.ru/product/volshebnye-linii-rabochaya-tetrad-dlya-podgotovki-k-shkole-chast-2-ilyuhina-vera-alekseevna-654253290/?at=qQtJY7pyAc577XzhJprn8AIA687REizNMQrWUR0Jg60
www.ozon.ru/product/koty-voiteli-tsikl-voiteli-stan-dikim-hanter-erin-285879805/?at=K8tZ7rjygTo3JMwRfQoGBgjU6ZyP8MSZnqQnWtE4rQxO
www.ozon.ru/product/russkiy-yazyk-6-klass-uchebnik-chast-1-1914498312/?at=08tYNrRXOc4wlnELHRnPLj1uQLRO7NFyB3DPQIJv4jG3
www.ozon.ru/product/koty-voiteli-tsikl-nachalo-plemen-pervaya-bitva-hanter-erin-474850004/?at=ywtAOylPvskKpnNyhKrgBAqHZq432ASVJVKrRf9YOrx1
www.ozon.ru/product/matematika-1-klass-rabochaya-tetrad-uglublennyy-uroven-chast-3-peterson-lyudmila-georgievna-1762948075/?at=WPtNryAL7h1ZWYoqS5yPVAzhP2zxVltJgwyEKCXrRpXX
www.ozon.ru/product/informatika-3-klass-uchebnik-v-dvuh-chasty

In [105]:
X_summary.loc[X_summary.index[2191], "url"]

'/product/chitayu-i-pishu-rabochaya-tetrad-k-knige-azbuka-moy-pervyy-uchebnik-chast-2-fgos-do-ignateva-489440165/?at=28t05EN2qT2YovVKHW0X916F0G9o6NikEGQqySDMmOAK'

In [None]:
from sklearn.cluster import DBSCAN

X_summary = pd.concat([X_positive, X_unlabeled], ignore_index=True)

model = DBSCAN(eps=0.5, min_samples=2)
X_summary["cluster"] = model.fit_predict(X_summary)

X_summary.cluster.unique()

TypeError: DBSCAN.__init__() got an unexpected keyword argument 'clusters'

In [None]:
from sqlalchemy import create_engine, String, ARRAY, ForeignKey
from sqlalchemy.orm import declarative_base, sessionmaker, Mapped, mapped_column, relationship

import spacy
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from datetime import datetime

In [None]:
from sqlalchemy import create_engine, String, ARRAY, ForeignKey
from sqlalchemy.orm import declarative_base, sessionmaker, Mapped, mapped_column, relationship

import spacy
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from datetime import datetime

In [None]:
# installation
# python -m spacy download ru_core_news_md

# nlp = spacy.load("ru_core_news_lg") # 489 mb
nlp = spacy.load("ru_core_news_md") # 39 mb

In [None]:
Base = declarative_base()

In [None]:
class TrueItem(Base):
    __tablename__ = "education_seller"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    paper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    seller_id: Mapped[int] = mapped_column(
        ForeignKey("seller.id")
    )
    days_to_deliver: Mapped[int]
    
    seller = relationship("Seller", back_populates="true_items")
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "paper_type": self.paper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author,
            "seller_id": self.seller_id,
            "seller_reg_date": self.seller.reg_date,
            "seller_orders": self.seller.orders,
            "seller_avg_item_rate": self.seller.avg_item_rate,
            "seller_region": self.seller.region,
        }


class Item(Base):
    __tablename__ = "item"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    paper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    seller_id: Mapped[int] = mapped_column(
        ForeignKey("seller.id")
    )
    days_to_deliver: Mapped[int]
    
    seller = relationship("Seller", back_populates="items")
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "paper_type": self.paper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author,
            "seller_id": self.seller_id,
            "seller_reg_date": self.seller.reg_date,
            "seller_orders": self.seller.orders,
            "seller_avg_item_rate": self.seller.avg_item_rate,
            "seller_region": self.seller.region,
        }
    

class Seller(Base):
    __tablename__ = "seller"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    reg_date: Mapped[datetime]
    orders: Mapped[int]
    avg_item_rate: Mapped[float]
    region: Mapped[str]
    
    items = relationship("Item", back_populates="seller")
    true_items = relationship("TrueItem", back_populates="seller")

In [None]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/OZON_parse')
Session = sessionmaker(engine, expire_on_commit=True)

In [None]:
def preprocessing_text(string: str) -> str:
    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
        )
    )
    
    return string

In [None]:
def preprocessing_description(string: str) -> str:
    string = re.sub(r'\d+', '', string)
    string = re.sub(r'\b[^а-яА-ЯёЁ\s]+\b', '', string)
    
    string_list = string.split("Автор на обложке")
    if len(string_list) > 1:
        string = string_list[0]
    else:
        string = ""

    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
            and (len(str(token)) > 3)
        )
    )
    
    return string

In [None]:
"""
maybe fake sellers

2146622
2146631
2146646

"""

''

In [None]:
with Session() as session:
    true_items = session.query(TrueItem).filter(TrueItem.seller_id != None).all()
    df_true_items = pd.DataFrame([item.dict() for item in true_items])    

    unknown_items = session.query(Item).filter(Item.seller_id != None).all()
    df_unknown_items = pd.DataFrame([item.dict() for item in unknown_items])  

In [None]:
df_unknown_items.columns

Index(['id', 'title', 'url', 'price', 'image', 'description', 'year',
       'paper_type', 'preview_type', 'book_type', 'pages_count', 'circulation',
       'isbn', 'class_', 'subject', 'original_name', 'author', 'seller_id',
       'seller_reg_date', 'seller_orders', 'seller_avg_item_rate',
       'seller_region'],
      dtype='object')

In [None]:
all_authors = []
all_authors_s = set()

for authors_list in df_true_items["author"]:
    if authors_list:
        for author in authors_list:
            if author not in all_authors_s:
                all_authors.append(author)
                all_authors_s.add(author)

del all_authors_s

In [None]:
df_true_items["description"] = df_true_items["description"].fillna("")
df_unknown_items["description"] = df_unknown_items["description"].fillna("")

df_true_items["class_"] = df_true_items["class_"].fillna(0)
df_unknown_items["class_"] = df_unknown_items["class_"].fillna(0)

median_year = df_true_items["year"].median()
df_true_items["year"] = df_true_items["year"].fillna(median_year)
median_year = df_unknown_items["year"].median()
df_unknown_items["year"] = df_unknown_items["year"].fillna(median_year)

df_true_items["paper_type"] = df_true_items["paper_type"].fillna("")
df_unknown_items["paper_type"] = df_unknown_items["paper_type"].fillna("")

df_true_items["preview_type"] = df_true_items["preview_type"].fillna("")
df_unknown_items["preview_type"] = df_unknown_items["preview_type"].fillna("")

df_true_items["book_type"] = df_true_items["book_type"].fillna("")
df_unknown_items["book_type"] = df_unknown_items["book_type"].fillna("")

df_true_items["seller_orders"] = df_true_items["seller_orders"].fillna(0)
df_unknown_items["seller_orders"] = df_unknown_items["seller_orders"].fillna(0)

df_true_items["seller_avg_item_rate"] = df_true_items["seller_avg_item_rate"].fillna(0)
df_unknown_items["seller_avg_item_rate"] = df_unknown_items["seller_avg_item_rate"].fillna(0)

df_true_items["days_reg_ago"] = df_true_items["seller_reg_date"].apply(lambda date: (datetime.now() - date).days)
df_unknown_items["days_reg_ago"] = df_unknown_items["seller_reg_date"].apply(lambda date: (datetime.now() - date).days)

avg_pages_count = df_true_items["pages_count"].mean()
df_true_items["pages_count"] = df_true_items["pages_count"].fillna(avg_pages_count)
avg_pages_count = df_unknown_items["pages_count"].mean()
df_unknown_items["pages_count"] = df_unknown_items["pages_count"].fillna(avg_pages_count)

In [None]:
df_true_items["title"] = df_true_items["title"].apply(preprocessing_text)
df_unknown_items["title"] = df_unknown_items["title"].apply(preprocessing_text)

In [None]:
df_true_items["description"] = df_true_items["description"].apply(preprocessing_description)
df_unknown_items["description"] = df_unknown_items["description"].apply(preprocessing_description)

In [None]:
df_true_items_1 = df_true_items.copy()
df_unknown_items_1 = df_unknown_items.copy()

In [None]:
df_true_items = df_true_items_1.copy()
df_unknown_items = df_unknown_items_1.copy()

In [None]:
# TF-IDF for titles

all_true_titles = ' '.join(df_true_items["title"])
all_unknown_titles = ' '.join(df_unknown_items["title"])

tfidf_titles = TfidfVectorizer(max_features=50)
transformed = tfidf_titles.fit_transform([all_true_titles, all_unknown_titles])

In [None]:
df = pd.DataFrame(
  transformed[0].T.todense(),
  index=tfidf_titles.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_titles = df

In [None]:
# TF-IDF for descriptions

all_true_descriptions = ' '.join(df_true_items["description"])
all_unknown_descriptions = ' '.join(df_unknown_items["description"])

tfidf_descriptions = TfidfVectorizer(max_features=200)
transformed_descriptions = tfidf_descriptions.fit_transform([all_true_descriptions, all_unknown_descriptions])

In [None]:
df = pd.DataFrame(
  transformed_descriptions[0].T.todense(),
  index=tfidf_descriptions.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_descriptions =  df

In [None]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_true_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_true_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "title"] = " ".join(final_words)

In [None]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_unknown_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_unknown_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "title"] = " ".join(final_words)

In [None]:
# True items
# Векторизация всех описаний сразу
tfidf_matrix = tfidf_descriptions.transform(df_true_items["description"])

# Преобразование разреженной матрицы в плотную (если необходимо)
tfidf_dense = tfidf_matrix.toarray()

# Добавление векторов в DataFrame
df_true_items["description_vector"] = list(tfidf_dense)


# Unknown items
tfidf_matrix = tfidf_descriptions.transform(df_unknown_items["description"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["description_vector"] = list(tfidf_dense)

In [None]:
# True items
tfidf_matrix = tfidf_titles.transform(df_true_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_true_items["title_vector"] = list(tfidf_dense)

# Unknown items
tfidf_matrix = tfidf_titles.transform(df_unknown_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["title_vector"] = list(tfidf_dense)

In [None]:
paper_types = list(set(np.concatenate([df_true_items["paper_type"].unique(), df_unknown_items["paper_type"].unique()])))
book_types = list(set(np.concatenate([df_true_items["book_type"].unique(), df_unknown_items["book_type"].unique()])))
preview_types = list(set(np.concatenate([df_true_items["preview_type"].unique(), df_unknown_items["preview_type"].unique()])))
regions = list(set(np.concatenate([df_true_items["seller_region"].unique(), df_unknown_items["seller_region"].unique()])))

In [None]:
paper_types_label_encoder = LabelEncoder()
encoded_paper_types = paper_types_label_encoder.fit_transform(paper_types)

book_types_label_encoder = LabelEncoder()
encoded_book_types = book_types_label_encoder.fit_transform(book_types)

preview_types_label_encoder = LabelEncoder()
encoded_preview_types = preview_types_label_encoder.fit_transform(preview_types)

regions_label_encoder = LabelEncoder()
encoded_regions = regions_label_encoder.fit_transform(regions)

In [None]:
df_true_items["paper_type"] = paper_types_label_encoder.transform(df_true_items["paper_type"])
df_true_items["book_type"] = book_types_label_encoder.transform(df_true_items["book_type"])
df_true_items["preview_type"] = preview_types_label_encoder.transform(df_true_items["preview_type"])
df_true_items["seller_region"] = regions_label_encoder.transform(df_true_items["seller_region"])

In [None]:
df_unknown_items["paper_type"] = paper_types_label_encoder.transform(df_unknown_items["paper_type"])
df_unknown_items["book_type"] = book_types_label_encoder.transform(df_unknown_items["book_type"])
df_unknown_items["preview_type"] = preview_types_label_encoder.transform(df_unknown_items["preview_type"])
df_unknown_items["seller_region"] = regions_label_encoder.transform(df_unknown_items["seller_region"])

In [None]:
df_true_items.head()

Unnamed: 0,id,title,url,price,image,description,year,paper_type,preview_type,book_type,...,original_name,author,seller_id,seller_reg_date,seller_orders,seller_avg_item_rate,seller_region,days_reg_ago,description_vector,title_vector
0,8,русский язык задание класс школа россия,/product/russkiy-yazyk-letnie-zadaniya-perehod...,185,15416e32-b929-4e7a-98df-67061d0d2231,,2024.0,10,4,4,...,Русский язык. Летние задания. Переходим во 2-й...,[Никишенкова Александра Викторовна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,10,математика рабочий тетрадь класс часть фгос,/product/matematika-rabochaya-tetrad-1-klass-c...,227,544f263e-0d55-439f-b24d-f7cd6660ff4e,,2025.0,10,4,4,...,Математика. Рабочая тетрадь. 1 класс. В 2-х ч....,"[Моро Мария Игнатьевна, Волкова Светлана Ивано...",207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,11,мир атлас,/product/okruzhayushchiy-mir-priroda-i-chelove...,233,0ed91455-16ae-4bdc-89b6-b97a6a9ac3ee,,2025.0,7,4,4,...,,[Сивоглазов Владислав Иванович],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.7071067811865475, ..."
3,12,английский язык грамматический класс английски...,/product/angliyskiy-yazyk-grammaticheskiy-tren...,204,80fdc53b-6431-4ba4-a6e9-3a396b7c4eda,,2025.0,10,4,4,...,,[Юшина Дарья Геннадьевна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.8164965809277261, 0.0, ..."
4,14,русский язык рабочий тетрадь класс фгос школа ...,/product/russkiy-yazyk-rabochaya-tetrad-1-klas...,272,9732a425-31e4-4dc5-9220-f843838fc56c,,2025.0,10,4,4,...,Русский язык. Рабочая тетрадь. 1 класс,[Канакина Валентина Павловна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
X_positive = df_true_items.drop(columns=[
    "id", "title", "url", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn", "seller_reg_date", "seller_id"
])

X_unlabeled = df_unknown_items.drop(columns=[
    "id", "title", "url", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn", "seller_reg_date", "seller_id"
])

In [None]:
X_positive

Unnamed: 0,price,year,paper_type,preview_type,book_type,pages_count,class_,seller_orders,seller_avg_item_rate,seller_region,days_reg_ago,description_vector,title_vector
0,185,2024.0,10,4,4,80.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,227,2025.0,10,4,4,48.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,233,2025.0,7,4,4,40.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.7071067811865475, ..."
3,204,2025.0,10,4,4,80.0,2.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.8164965809277261, 0.0, ..."
4,272,2025.0,10,4,4,64.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,1203,2025.0,10,4,4,160.0,4.0,1200000,4.9,0,1096,"[0.2182178902359924, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2186,872,2025.0,10,2,4,96.0,4.0,1200000,4.9,0,1096,"[0.0, 0.20412414523193154, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.4472135954999579, 0.0, ..."
2187,892,2025.0,10,4,4,176.0,2.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2188,231,2024.0,10,4,4,64.0,4.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16439898...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
for i in range(200):
    X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["description_vector"])

for i in range(50):
    X_positive[f"title_vector_{i}"] = X_positive["title_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["title_vector"])

for i in range(200):
    X_unlabeled[f"description_vector_{i}"] = X_unlabeled["description_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["description_vector"])

for i in range(50):
    X_unlabeled[f"title_vector_{i}"] = X_unlabeled["title_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["title_vector"])

  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vec

In [None]:
X_positive

Unnamed: 0,price,year,papper_type,preview_type,book_type,pages_count,class_,description_vector_0,description_vector_1,description_vector_2,...,title_vector_40,title_vector_41,title_vector_42,title_vector_43,title_vector_44,title_vector_45,title_vector_46,title_vector_47,title_vector_48,title_vector_49
0,289,2024.0,11,4,4,112.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.377964,0.000000,0.000000,0.0,0.377964,0.377964
1,119,2024.0,11,4,4,16.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,164,2024.0,11,4,4,48.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.447214,0.000000
3,185,2024.0,11,4,4,80.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.408248,0.408248
4,227,2025.0,11,4,4,48.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.408248,0.408248,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,3932,2023.0,11,0,4,208.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.447214,0.000000,0.447214,0.0,0.000000,0.000000
2186,755,2025.0,0,2,4,120.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.377964,0.0,0.377964,0.000000,0.000000,0.0,0.000000,0.755929
2187,3932,2023.0,11,0,4,208.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.447214,0.000000,0.447214,0.0,0.000000,0.000000
2188,393,2023.0,11,4,4,144.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.408248,0.0,0.000000,0.408248,0.000000,0.0,0.000000,0.000000


In [None]:
# X_positive = pd.concat([X_positive, X_unlabeled], ignore_index=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_positive, [1] * len(X_positive), test_size=0.2, random_state=324
)

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
from sklearn.neighbors import LocalOutlierFactor

model = LocalOutlierFactor(novelty=True)
y = model.fit(X_train)

y = model.predict(X_test)

accuracy_score(y, y_test)

# y = model.predict(X_test)



0.954337899543379

In [None]:
# Случайный вектор
# Кластеризация
# Перекрёстная энтропия

# Векторизация данных
# Анализ отклонений


In [None]:
model = IsolationForest()
model.fit(X_train, y_train)
y = model.predict(X_test)

accuracy_score(y, y_test)


1.0

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score

model = OneClassSVM(kernel="poly")
model.fit(X_train, y_train)

predicted = model.predict(X_test)

accuracy_score(y_test, predicted)

0.4703196347031963

In [None]:
from sklearn.cluster import DBSCAN

X_summary = pd.concat([X_positive, X_unlabeled], ignore_index=True)

model = DBSCAN(eps=0.5, min_samples=2)
X_summary["cluster"] = model.fit_predict(X_summary)

X_summary.cluster.unique()

TypeError: DBSCAN.__init__() got an unexpected keyword argument 'clusters'

In [None]:
y_predicted = model.predict(X_test)

AttributeError: This 'LocalOutlierFactor' has no attribute 'predict'

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predicted)

ValueError: Found input variables with inconsistent numbers of samples: [5736, 438]

In [None]:
predicted = model.predict(X_unlabeled)

In [None]:
df_unknown_items.loc[df_unknown_items.index[2191]]

id                                                                   3488
title                                 материал класс базовый уровень фгос
url                     /product/algebra-i-nachala-matematicheskogo-an...
price                                                                 387
image                                11e29511-60a8-4762-9058-8c3afb504b62
description                                                              
year                                                               2025.0
paper_type                                                             10
preview_type                                                            2
book_type                                                               4
pages_count                                                         192.0
circulation                                                          None
isbn                                                      [9785091133622]
class_                                

In [None]:
for i, item in enumerate(predicted):
    if item == -1 :
        print("www.ozon.ru" + df_unknown_items.loc[df_unknown_items.index[i], "url"])

www.ozon.ru/product/russkiy-yazyk-5-klass-uchebnik-chast-2-fgos-ladyzhenskaya-taisa-alekseevna-baranov-863108838/?at=w0tglRDEMcVm3nGLsOJ7KmI89JO4GtNpBq8GI23nARx
www.ozon.ru/product/koty-voiteli-tsikl-voiteli-stan-dikim-hanter-erin-285879805/?at=K8tZ7rjygTo3JMwRfQoGBgjU6ZyP8MSZnqQnWtE4rQxO
www.ozon.ru/product/okruzhayushchiy-mir-1-klass-uchebnik-chast-1-fgos-shkola-rossii-pleshakov-andrey-anatolevich-862242789/?at=08tYNrRXOcrJoABLUknOVR5CM3j4OXfNAK7ArhkqD3Ml
www.ozon.ru/product/russkiy-yazyk-letnie-zadaniya-perehodim-vo-2-y-klass-shkola-rossii-nikishenkova-599158573/?at=QktJvN51GcrrGXOzhZZ1Y52SOm3XzXhrogB9ES9Rn7KJ
www.ozon.ru/product/matematika-1-klass-rabochaya-tetrad-uglublennyy-uroven-chast-3-peterson-lyudmila-georgievna-1762948075/?at=WPtNryAL7h1ZWYoqS5yPVAzhP2zxVltJgwyEKCXrRpXX
www.ozon.ru/product/informatika-3-klass-uchebnik-v-dvuh-chastyah-chast-2-1782413825/?at=79tnGWOX5t5BV899uK5KG8rc3VGN3wupz5R5GCX6on8L
www.ozon.ru/product/igralochka-stupenka-k-shkole-matematika-dlya-detey-5-6

In [None]:
from PIL import Image

In [None]:
def resize_image(image_path: str, target_size=(224, 224)):
    img = Image.open(image_path)
    img_resized = img.resize(target_size)
    img_resized_rgb = img.convert('RGB')

In [None]:
import numpy as np
from sklearn.ensemble import IsolationForest
from transformers import BertTokenizer, BertModel
import torch

# 1. Загрузка предобученного трансформера BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [None]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Функция для преобразования текста в эмбеддинги с помощью BERT
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        # Токенизация текста
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
        # Получение эмбеддингов из последнего слоя BERT
        with torch.no_grad():
            outputs = model(**inputs)
        # Используем среднее по всем токенам для получения фиксированного вектора
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return np.array(embeddings)

In [None]:
# 2. Подготовка данных
# Легальные описания книг (пример данных)
legit_descriptions = [
    "This is a textbook on algebra for 9th grade students.",
    "A comprehensive guide to geometry for high school students.",
    "Learn the basics of physics with this easy-to-understand book.",
    "An introduction to chemistry for beginners."
]

# Преобразование текстовых данных в эмбеддинги
legit_embeddings = get_bert_embeddings(legit_descriptions)

print(2)

# 3. Обучение модели обнаружения аномалий
# Используем Isolation Forest для обучения только на легальных данных
anomaly_detector = IsolationForest(contamination=0.1, random_state=42)
anomaly_detector.fit(legit_embeddings)

print(3)

# 4. Проверка новых данных
# Новые описания книг (некоторые из них могут быть аномалиями)
new_descriptions = [
    "This is a textbook on algebra for 9th grade students.",  # Легальный
    "Fake book description with random words xyz123.",        # Аномалия
    "A detailed guide to biology for college students.",      # Легальный
    "Cheap replica of a famous novel."                        # Аномалия
]

# Преобразование новых описаний в эмбеддинги
new_embeddings = get_bert_embeddings(new_descriptions)

# Предсказание: 1 — нормальный объект, -1 — аномалия
predictions = anomaly_detector.predict(new_embeddings)

# 5. Вывод результатов
for i, (description, prediction) in enumerate(zip(new_descriptions, predictions)):
    status = "Legitimate" if prediction == 1 else "Anomaly"
    print(f"Description {i+1}: {description} -> {status}")

2
3
Description 1: This is a textbook on algebra for 9th grade students. -> Legitimate
Description 2: Fake book description with random words xyz123. -> Legitimate
Description 3: A detailed guide to biology for college students. -> Anomaly
Description 4: Cheap replica of a famous novel. -> Legitimate


In [None]:
# installation
# python -m spacy download ru_core_news_md

# nlp = spacy.load("ru_core_news_lg") # 489 mb
nlp = spacy.load("ru_core_news_md") # 39 mb

In [None]:
Base = declarative_base()

In [None]:
class TrueItem(Base):
    __tablename__ = "education_seller"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    paper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    seller_id: Mapped[int] = mapped_column(
        ForeignKey("seller.id")
    )
    days_to_deliver: Mapped[int]
    
    seller = relationship("Seller", back_populates="true_items")
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "paper_type": self.paper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author,
            "seller_id": self.seller_id,
            "seller_reg_date": self.seller.reg_date,
            "seller_orders": self.seller.orders,
            "seller_avg_item_rate": self.seller.avg_item_rate,
            "seller_region": self.seller.region,
        }


class Item(Base):
    __tablename__ = "item"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    paper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    seller_id: Mapped[int] = mapped_column(
        ForeignKey("seller.id")
    )
    days_to_deliver: Mapped[int]
    
    seller = relationship("Seller", back_populates="items")
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "paper_type": self.paper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author,
            "seller_id": self.seller_id,
            "seller_reg_date": self.seller.reg_date,
            "seller_orders": self.seller.orders,
            "seller_avg_item_rate": self.seller.avg_item_rate,
            "seller_region": self.seller.region,
        }
    

class Seller(Base):
    __tablename__ = "seller"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    reg_date: Mapped[datetime]
    orders: Mapped[int]
    avg_item_rate: Mapped[float]
    region: Mapped[str]
    
    items = relationship("Item", back_populates="seller")
    true_items = relationship("TrueItem", back_populates="seller")

In [None]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/OZON_parse')
Session = sessionmaker(engine, expire_on_commit=True)

In [None]:
def preprocessing_text(string: str) -> str:
    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
        )
    )
    
    return string

In [None]:
def preprocessing_description(string: str) -> str:
    string = re.sub(r'\d+', '', string)
    string = re.sub(r'\b[^а-яА-ЯёЁ\s]+\b', '', string)
    
    string_list = string.split("Автор на обложке")
    if len(string_list) > 1:
        string = string_list[0]
    else:
        string = ""

    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
            and (len(str(token)) > 3)
        )
    )
    
    return string

In [None]:
"""
maybe fake sellers

2146622
2146631
2146646

"""

''

In [None]:
with Session() as session:
    true_items = session.query(TrueItem).filter(TrueItem.seller_id != None).all()
    df_true_items = pd.DataFrame([item.dict() for item in true_items])    

    unknown_items = session.query(Item).filter(Item.seller_id != None).all()
    df_unknown_items = pd.DataFrame([item.dict() for item in unknown_items])  

In [None]:
df_unknown_items.columns

Index(['id', 'title', 'url', 'price', 'image', 'description', 'year',
       'paper_type', 'preview_type', 'book_type', 'pages_count', 'circulation',
       'isbn', 'class_', 'subject', 'original_name', 'author', 'seller_id',
       'seller_reg_date', 'seller_orders', 'seller_avg_item_rate',
       'seller_region'],
      dtype='object')

In [None]:
all_authors = []
all_authors_s = set()

for authors_list in df_true_items["author"]:
    if authors_list:
        for author in authors_list:
            if author not in all_authors_s:
                all_authors.append(author)
                all_authors_s.add(author)

del all_authors_s

In [None]:
df_true_items["description"] = df_true_items["description"].fillna("")
df_unknown_items["description"] = df_unknown_items["description"].fillna("")

df_true_items["class_"] = df_true_items["class_"].fillna(0)
df_unknown_items["class_"] = df_unknown_items["class_"].fillna(0)

median_year = df_true_items["year"].median()
df_true_items["year"] = df_true_items["year"].fillna(median_year)
median_year = df_unknown_items["year"].median()
df_unknown_items["year"] = df_unknown_items["year"].fillna(median_year)

df_true_items["paper_type"] = df_true_items["paper_type"].fillna("")
df_unknown_items["paper_type"] = df_unknown_items["paper_type"].fillna("")

df_true_items["preview_type"] = df_true_items["preview_type"].fillna("")
df_unknown_items["preview_type"] = df_unknown_items["preview_type"].fillna("")

df_true_items["book_type"] = df_true_items["book_type"].fillna("")
df_unknown_items["book_type"] = df_unknown_items["book_type"].fillna("")

df_true_items["seller_orders"] = df_true_items["seller_orders"].fillna(0)
df_unknown_items["seller_orders"] = df_unknown_items["seller_orders"].fillna(0)

df_true_items["seller_avg_item_rate"] = df_true_items["seller_avg_item_rate"].fillna(0)
df_unknown_items["seller_avg_item_rate"] = df_unknown_items["seller_avg_item_rate"].fillna(0)

df_true_items["days_reg_ago"] = df_true_items["seller_reg_date"].apply(lambda date: (datetime.now() - date).days)
df_unknown_items["days_reg_ago"] = df_unknown_items["seller_reg_date"].apply(lambda date: (datetime.now() - date).days)

avg_pages_count = df_true_items["pages_count"].mean()
df_true_items["pages_count"] = df_true_items["pages_count"].fillna(avg_pages_count)
avg_pages_count = df_unknown_items["pages_count"].mean()
df_unknown_items["pages_count"] = df_unknown_items["pages_count"].fillna(avg_pages_count)

In [None]:
df_true_items["title"] = df_true_items["title"].apply(preprocessing_text)
df_unknown_items["title"] = df_unknown_items["title"].apply(preprocessing_text)

In [None]:
df_true_items["description"] = df_true_items["description"].apply(preprocessing_description)
df_unknown_items["description"] = df_unknown_items["description"].apply(preprocessing_description)

In [None]:
df_true_items_1 = df_true_items.copy()
df_unknown_items_1 = df_unknown_items.copy()

In [None]:
df_true_items = df_true_items_1.copy()
df_unknown_items = df_unknown_items_1.copy()

In [None]:
# TF-IDF for titles

all_true_titles = ' '.join(df_true_items["title"])
all_unknown_titles = ' '.join(df_unknown_items["title"])

tfidf_titles = TfidfVectorizer(max_features=50)
transformed = tfidf_titles.fit_transform([all_true_titles, all_unknown_titles])

In [None]:
df = pd.DataFrame(
  transformed[0].T.todense(),
  index=tfidf_titles.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_titles = df

In [None]:
# TF-IDF for descriptions

all_true_descriptions = ' '.join(df_true_items["description"])
all_unknown_descriptions = ' '.join(df_unknown_items["description"])

tfidf_descriptions = TfidfVectorizer(max_features=200)
transformed_descriptions = tfidf_descriptions.fit_transform([all_true_descriptions, all_unknown_descriptions])

In [None]:
df = pd.DataFrame(
  transformed_descriptions[0].T.todense(),
  index=tfidf_descriptions.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_descriptions =  df

In [None]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_true_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_true_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "title"] = " ".join(final_words)

In [None]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_unknown_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_unknown_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "title"] = " ".join(final_words)

In [None]:
# True items
# Векторизация всех описаний сразу
tfidf_matrix = tfidf_descriptions.transform(df_true_items["description"])

# Преобразование разреженной матрицы в плотную (если необходимо)
tfidf_dense = tfidf_matrix.toarray()

# Добавление векторов в DataFrame
df_true_items["description_vector"] = list(tfidf_dense)


# Unknown items
tfidf_matrix = tfidf_descriptions.transform(df_unknown_items["description"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["description_vector"] = list(tfidf_dense)

In [None]:
# True items
tfidf_matrix = tfidf_titles.transform(df_true_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_true_items["title_vector"] = list(tfidf_dense)

# Unknown items
tfidf_matrix = tfidf_titles.transform(df_unknown_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["title_vector"] = list(tfidf_dense)

In [None]:
paper_types = list(set(np.concatenate([df_true_items["paper_type"].unique(), df_unknown_items["paper_type"].unique()])))
book_types = list(set(np.concatenate([df_true_items["book_type"].unique(), df_unknown_items["book_type"].unique()])))
preview_types = list(set(np.concatenate([df_true_items["preview_type"].unique(), df_unknown_items["preview_type"].unique()])))
regions = list(set(np.concatenate([df_true_items["seller_region"].unique(), df_unknown_items["seller_region"].unique()])))

In [None]:
paper_types_label_encoder = LabelEncoder()
encoded_paper_types = paper_types_label_encoder.fit_transform(paper_types)

book_types_label_encoder = LabelEncoder()
encoded_book_types = book_types_label_encoder.fit_transform(book_types)

preview_types_label_encoder = LabelEncoder()
encoded_preview_types = preview_types_label_encoder.fit_transform(preview_types)

regions_label_encoder = LabelEncoder()
encoded_regions = regions_label_encoder.fit_transform(regions)

In [None]:
df_true_items["paper_type"] = paper_types_label_encoder.transform(df_true_items["paper_type"])
df_true_items["book_type"] = book_types_label_encoder.transform(df_true_items["book_type"])
df_true_items["preview_type"] = preview_types_label_encoder.transform(df_true_items["preview_type"])
df_true_items["seller_region"] = regions_label_encoder.transform(df_true_items["seller_region"])

In [None]:
df_unknown_items["paper_type"] = paper_types_label_encoder.transform(df_unknown_items["paper_type"])
df_unknown_items["book_type"] = book_types_label_encoder.transform(df_unknown_items["book_type"])
df_unknown_items["preview_type"] = preview_types_label_encoder.transform(df_unknown_items["preview_type"])
df_unknown_items["seller_region"] = regions_label_encoder.transform(df_unknown_items["seller_region"])

In [None]:
df_true_items.head()

Unnamed: 0,id,title,url,price,image,description,year,paper_type,preview_type,book_type,...,original_name,author,seller_id,seller_reg_date,seller_orders,seller_avg_item_rate,seller_region,days_reg_ago,description_vector,title_vector
0,8,русский язык задание класс школа россия,/product/russkiy-yazyk-letnie-zadaniya-perehod...,185,15416e32-b929-4e7a-98df-67061d0d2231,,2024.0,10,4,4,...,Русский язык. Летние задания. Переходим во 2-й...,[Никишенкова Александра Викторовна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,10,математика рабочий тетрадь класс часть фгос,/product/matematika-rabochaya-tetrad-1-klass-c...,227,544f263e-0d55-439f-b24d-f7cd6660ff4e,,2025.0,10,4,4,...,Математика. Рабочая тетрадь. 1 класс. В 2-х ч....,"[Моро Мария Игнатьевна, Волкова Светлана Ивано...",207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,11,мир атлас,/product/okruzhayushchiy-mir-priroda-i-chelove...,233,0ed91455-16ae-4bdc-89b6-b97a6a9ac3ee,,2025.0,7,4,4,...,,[Сивоглазов Владислав Иванович],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.7071067811865475, ..."
3,12,английский язык грамматический класс английски...,/product/angliyskiy-yazyk-grammaticheskiy-tren...,204,80fdc53b-6431-4ba4-a6e9-3a396b7c4eda,,2025.0,10,4,4,...,,[Юшина Дарья Геннадьевна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.8164965809277261, 0.0, ..."
4,14,русский язык рабочий тетрадь класс фгос школа ...,/product/russkiy-yazyk-rabochaya-tetrad-1-klas...,272,9732a425-31e4-4dc5-9220-f843838fc56c,,2025.0,10,4,4,...,Русский язык. Рабочая тетрадь. 1 класс,[Канакина Валентина Павловна],207249,2022-04-14 15:13:31.561572,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
X_positive = df_true_items.drop(columns=[
    "id", "title", "url", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn", "seller_reg_date", "seller_id"
])

X_unlabeled = df_unknown_items.drop(columns=[
    "id", "title", "url", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn", "seller_reg_date", "seller_id"
])

In [None]:
X_positive

Unnamed: 0,price,year,paper_type,preview_type,book_type,pages_count,class_,seller_orders,seller_avg_item_rate,seller_region,days_reg_ago,description_vector,title_vector
0,185,2024.0,10,4,4,80.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,227,2025.0,10,4,4,48.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,233,2025.0,7,4,4,40.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.7071067811865475, ..."
3,204,2025.0,10,4,4,80.0,2.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.8164965809277261, 0.0, ..."
4,272,2025.0,10,4,4,64.0,1.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,1203,2025.0,10,4,4,160.0,4.0,1200000,4.9,0,1096,"[0.2182178902359924, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2186,872,2025.0,10,2,4,96.0,4.0,1200000,4.9,0,1096,"[0.0, 0.20412414523193154, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.4472135954999579, 0.0, ..."
2187,892,2025.0,10,4,4,176.0,2.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2188,231,2024.0,10,4,4,64.0,4.0,1200000,4.9,0,1096,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16439898...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
for i in range(200):
    X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["description_vector"])

for i in range(50):
    X_positive[f"title_vector_{i}"] = X_positive["title_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["title_vector"])

for i in range(200):
    X_unlabeled[f"description_vector_{i}"] = X_unlabeled["description_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["description_vector"])

for i in range(50):
    X_unlabeled[f"title_vector_{i}"] = X_unlabeled["title_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["title_vector"])

  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vec

In [None]:
X_positive

Unnamed: 0,price,year,papper_type,preview_type,book_type,pages_count,class_,description_vector_0,description_vector_1,description_vector_2,...,title_vector_40,title_vector_41,title_vector_42,title_vector_43,title_vector_44,title_vector_45,title_vector_46,title_vector_47,title_vector_48,title_vector_49
0,289,2024.0,11,4,4,112.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.377964,0.000000,0.000000,0.0,0.377964,0.377964
1,119,2024.0,11,4,4,16.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,164,2024.0,11,4,4,48.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.447214,0.000000
3,185,2024.0,11,4,4,80.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.408248,0.408248
4,227,2025.0,11,4,4,48.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.408248,0.408248,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,3932,2023.0,11,0,4,208.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.447214,0.000000,0.447214,0.0,0.000000,0.000000
2186,755,2025.0,0,2,4,120.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.377964,0.0,0.377964,0.000000,0.000000,0.0,0.000000,0.755929
2187,3932,2023.0,11,0,4,208.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.447214,0.000000,0.447214,0.0,0.000000,0.000000
2188,393,2023.0,11,4,4,144.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.408248,0.0,0.000000,0.408248,0.000000,0.0,0.000000,0.000000


In [None]:
# X_positive = pd.concat([X_positive, X_unlabeled], ignore_index=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_positive, [1] * len(X_positive), test_size=0.2, random_state=324
)

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
from sklearn.neighbors import LocalOutlierFactor

model = LocalOutlierFactor(novelty=True)
y = model.fit(X_train)

y = model.predict(X_test)

accuracy_score(y, y_test)

# y = model.predict(X_test)



0.954337899543379

In [None]:
# Случайный вектор
# Кластеризация
# Перекрёстная энтропия

# Векторизация данных
# Анализ отклонений


In [None]:
model = IsolationForest()
model.fit(X_train, y_train)
y = model.predict(X_test)

accuracy_score(y, y_test)


1.0

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score

model = OneClassSVM(kernel="poly")
model.fit(X_train, y_train)

predicted = model.predict(X_test)

accuracy_score(y_test, predicted)

0.4703196347031963

In [None]:
from sklearn.cluster import DBSCAN

X_summary = pd.concat([X_positive, X_unlabeled], ignore_index=True)

model = DBSCAN(eps=0.5, min_samples=2)
X_summary["cluster"] = model.fit_predict(X_summary)

X_summary.cluster.unique()

TypeError: DBSCAN.__init__() got an unexpected keyword argument 'clusters'

In [None]:
y_predicted = model.predict(X_test)

AttributeError: This 'LocalOutlierFactor' has no attribute 'predict'

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predicted)

ValueError: Found input variables with inconsistent numbers of samples: [5736, 438]

In [None]:
predicted = model.predict(X_unlabeled)

In [None]:
df_unknown_items.loc[df_unknown_items.index[2191]]

id                                                                   3488
title                                 материал класс базовый уровень фгос
url                     /product/algebra-i-nachala-matematicheskogo-an...
price                                                                 387
image                                11e29511-60a8-4762-9058-8c3afb504b62
description                                                              
year                                                               2025.0
paper_type                                                             10
preview_type                                                            2
book_type                                                               4
pages_count                                                         192.0
circulation                                                          None
isbn                                                      [9785091133622]
class_                                

In [None]:
for i, item in enumerate(predicted):
    if item == -1 :
        print("www.ozon.ru" + df_unknown_items.loc[df_unknown_items.index[i], "url"])

www.ozon.ru/product/russkiy-yazyk-5-klass-uchebnik-chast-2-fgos-ladyzhenskaya-taisa-alekseevna-baranov-863108838/?at=w0tglRDEMcVm3nGLsOJ7KmI89JO4GtNpBq8GI23nARx
www.ozon.ru/product/koty-voiteli-tsikl-voiteli-stan-dikim-hanter-erin-285879805/?at=K8tZ7rjygTo3JMwRfQoGBgjU6ZyP8MSZnqQnWtE4rQxO
www.ozon.ru/product/okruzhayushchiy-mir-1-klass-uchebnik-chast-1-fgos-shkola-rossii-pleshakov-andrey-anatolevich-862242789/?at=08tYNrRXOcrJoABLUknOVR5CM3j4OXfNAK7ArhkqD3Ml
www.ozon.ru/product/russkiy-yazyk-letnie-zadaniya-perehodim-vo-2-y-klass-shkola-rossii-nikishenkova-599158573/?at=QktJvN51GcrrGXOzhZZ1Y52SOm3XzXhrogB9ES9Rn7KJ
www.ozon.ru/product/matematika-1-klass-rabochaya-tetrad-uglublennyy-uroven-chast-3-peterson-lyudmila-georgievna-1762948075/?at=WPtNryAL7h1ZWYoqS5yPVAzhP2zxVltJgwyEKCXrRpXX
www.ozon.ru/product/informatika-3-klass-uchebnik-v-dvuh-chastyah-chast-2-1782413825/?at=79tnGWOX5t5BV899uK5KG8rc3VGN3wupz5R5GCX6on8L
www.ozon.ru/product/igralochka-stupenka-k-shkole-matematika-dlya-detey-5-6

In [None]:
from PIL import Image

In [None]:
def resize_image(image_path: str, target_size=(224, 224)):
    img = Image.open(image_path)
    img_resized = img.resize(target_size)
    img_resized_rgb = img.convert('RGB')

In [None]:
import numpy as np
from sklearn.ensemble import IsolationForest
from transformers import BertTokenizer, BertModel
import torch

# 1. Загрузка предобученного трансформера BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [None]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Функция для преобразования текста в эмбеддинги с помощью BERT
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        # Токенизация текста
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
        # Получение эмбеддингов из последнего слоя BERT
        with torch.no_grad():
            outputs = model(**inputs)
        # Используем среднее по всем токенам для получения фиксированного вектора
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return np.array(embeddings)

In [None]:
# 2. Подготовка данных
# Легальные описания книг (пример данных)
legit_descriptions = [
    "This is a textbook on algebra for 9th grade students.",
    "A comprehensive guide to geometry for high school students.",
    "Learn the basics of physics with this easy-to-understand book.",
    "An introduction to chemistry for beginners."
]

# Преобразование текстовых данных в эмбеддинги
legit_embeddings = get_bert_embeddings(legit_descriptions)

print(2)

# 3. Обучение модели обнаружения аномалий
# Используем Isolation Forest для обучения только на легальных данных
anomaly_detector = IsolationForest(contamination=0.1, random_state=42)
anomaly_detector.fit(legit_embeddings)

print(3)

# 4. Проверка новых данных
# Новые описания книг (некоторые из них могут быть аномалиями)
new_descriptions = [
    "This is a textbook on algebra for 9th grade students.",  # Легальный
    "Fake book description with random words xyz123.",        # Аномалия
    "A detailed guide to biology for college students.",      # Легальный
    "Cheap replica of a famous novel."                        # Аномалия
]

# Преобразование новых описаний в эмбеддинги
new_embeddings = get_bert_embeddings(new_descriptions)

# Предсказание: 1 — нормальный объект, -1 — аномалия
predictions = anomaly_detector.predict(new_embeddings)

# 5. Вывод результатов
for i, (description, prediction) in enumerate(zip(new_descriptions, predictions)):
    status = "Legitimate" if prediction == 1 else "Anomaly"
    print(f"Description {i+1}: {description} -> {status}")

2
3
Description 1: This is a textbook on algebra for 9th grade students. -> Legitimate
Description 2: Fake book description with random words xyz123. -> Legitimate
Description 3: A detailed guide to biology for college students. -> Anomaly
Description 4: Cheap replica of a famous novel. -> Legitimate


In [452]:
y_predicted = model.predict(X_test)

AttributeError: This 'LocalOutlierFactor' has no attribute 'predict'

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predicted)

ValueError: Found input variables with inconsistent numbers of samples: [5736, 438]

In [70]:
predicted = model.predict(X_unlabeled)

In [91]:
df_unknown_items.loc[df_unknown_items.index[2191]]

id                                                                   3488
title                                 материал класс базовый уровень фгос
url                     /product/algebra-i-nachala-matematicheskogo-an...
price                                                                 387
image                                11e29511-60a8-4762-9058-8c3afb504b62
description                                                              
year                                                               2025.0
paper_type                                                             10
preview_type                                                            2
book_type                                                               4
pages_count                                                         192.0
circulation                                                          None
isbn                                                      [9785091133622]
class_                                

In [77]:
for i, item in enumerate(predicted):
    if item == -1 :
        print("www.ozon.ru" + df_unknown_items.loc[df_unknown_items.index[i], "url"])

www.ozon.ru/product/russkiy-yazyk-5-klass-uchebnik-chast-2-fgos-ladyzhenskaya-taisa-alekseevna-baranov-863108838/?at=w0tglRDEMcVm3nGLsOJ7KmI89JO4GtNpBq8GI23nARx
www.ozon.ru/product/koty-voiteli-tsikl-voiteli-stan-dikim-hanter-erin-285879805/?at=K8tZ7rjygTo3JMwRfQoGBgjU6ZyP8MSZnqQnWtE4rQxO
www.ozon.ru/product/okruzhayushchiy-mir-1-klass-uchebnik-chast-1-fgos-shkola-rossii-pleshakov-andrey-anatolevich-862242789/?at=08tYNrRXOcrJoABLUknOVR5CM3j4OXfNAK7ArhkqD3Ml
www.ozon.ru/product/russkiy-yazyk-letnie-zadaniya-perehodim-vo-2-y-klass-shkola-rossii-nikishenkova-599158573/?at=QktJvN51GcrrGXOzhZZ1Y52SOm3XzXhrogB9ES9Rn7KJ
www.ozon.ru/product/matematika-1-klass-rabochaya-tetrad-uglublennyy-uroven-chast-3-peterson-lyudmila-georgievna-1762948075/?at=WPtNryAL7h1ZWYoqS5yPVAzhP2zxVltJgwyEKCXrRpXX
www.ozon.ru/product/informatika-3-klass-uchebnik-v-dvuh-chastyah-chast-2-1782413825/?at=79tnGWOX5t5BV899uK5KG8rc3VGN3wupz5R5GCX6on8L
www.ozon.ru/product/igralochka-stupenka-k-shkole-matematika-dlya-detey-5-6

In [1]:
from PIL import Image

In [None]:
def resize_image(image_path: str, target_size=(224, 224)):
    img = Image.open(image_path)
    img_resized = img.resize(target_size)
    img_resized_rgb = img.convert('RGB')

In [1]:
import numpy as np
from sklearn.ensemble import IsolationForest
from transformers import BertTokenizer, BertModel
import torch

# 1. Загрузка предобученного трансформера BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [2]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Функция для преобразования текста в эмбеддинги с помощью BERT
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        # Токенизация текста
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
        # Получение эмбеддингов из последнего слоя BERT
        with torch.no_grad():
            outputs = model(**inputs)
        # Используем среднее по всем токенам для получения фиксированного вектора
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return np.array(embeddings)

In [4]:
# 2. Подготовка данных
# Легальные описания книг (пример данных)
legit_descriptions = [
    "This is a textbook on algebra for 9th grade students.",
    "A comprehensive guide to geometry for high school students.",
    "Learn the basics of physics with this easy-to-understand book.",
    "An introduction to chemistry for beginners."
]

# Преобразование текстовых данных в эмбеддинги
legit_embeddings = get_bert_embeddings(legit_descriptions)

print(2)

# 3. Обучение модели обнаружения аномалий
# Используем Isolation Forest для обучения только на легальных данных
anomaly_detector = IsolationForest(contamination=0.1, random_state=42)
anomaly_detector.fit(legit_embeddings)

print(3)

# 4. Проверка новых данных
# Новые описания книг (некоторые из них могут быть аномалиями)
new_descriptions = [
    "This is a textbook on algebra for 9th grade students.",  # Легальный
    "Fake book description with random words xyz123.",        # Аномалия
    "A detailed guide to biology for college students.",      # Легальный
    "Cheap replica of a famous novel."                        # Аномалия
]

# Преобразование новых описаний в эмбеддинги
new_embeddings = get_bert_embeddings(new_descriptions)

# Предсказание: 1 — нормальный объект, -1 — аномалия
predictions = anomaly_detector.predict(new_embeddings)

# 5. Вывод результатов
for i, (description, prediction) in enumerate(zip(new_descriptions, predictions)):
    status = "Legitimate" if prediction == 1 else "Anomaly"
    print(f"Description {i+1}: {description} -> {status}")

2
3
Description 1: This is a textbook on algebra for 9th grade students. -> Legitimate
Description 2: Fake book description with random words xyz123. -> Legitimate
Description 3: A detailed guide to biology for college students. -> Anomaly
Description 4: Cheap replica of a famous novel. -> Legitimate
