In [388]:
from sqlalchemy import create_engine, String, ARRAY
from sqlalchemy.orm import declarative_base, sessionmaker, Mapped, mapped_column

import spacy
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from pulearn import ElkanotoPuClassifier

In [2]:
# installation
# python -m spacy download ru_core_news_md

# nlp = spacy.load("ru_core_news_lg") # 489 mb
nlp = spacy.load("ru_core_news_md") # 39 mb

In [3]:
Base = declarative_base()

In [4]:
class TrueItem(Base):
    __tablename__ = "education_seller"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    papper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "papper_type": self.papper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author
        }


class Item(Base):
    __tablename__ = "item"
    
    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    url: Mapped[str]
    price: Mapped[int]
    image: Mapped[str] = mapped_column(String(128))
    description: Mapped[str]
    year: Mapped[int]
    papper_type: Mapped[str]
    preview_type: Mapped[str]
    book_type: Mapped[str]
    pages_count: Mapped[int]
    circulation: Mapped[int]
    isbn: Mapped[list[str]] = mapped_column(ARRAY(String))
    class_: Mapped[int] = mapped_column(name="class")
    subject: Mapped[str]
    original_name: Mapped[str]
    author: Mapped[list[str]] = mapped_column(ARRAY(String))
    
    def dict(self):
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "price": self.price,
            "image": self.image,
            "description": self.description,
            "year": self.year,
            "papper_type": self.papper_type,
            "preview_type": self.preview_type,
            "book_type": self.book_type,
            "pages_count": self.pages_count,
            "circulation": self.circulation,
            "isbn": self.isbn,
            "class_": self.class_,
            "subject": self.subject,
            "original_name": self.original_name,
            "author": self.author
        }

In [5]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/OZON_parse')
Session = sessionmaker(engine, expire_on_commit=True)

In [38]:
def preprocessing_text(string: str) -> str:
    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
        )
    )
    
    return string

In [164]:
def preprocessing_description(string: str) -> str:
    string = re.sub(r'\d+', '', string)
    string = re.sub(r'\b[^а-яА-ЯёЁ\s]+\b', '', string)
    
    string_list = string.split("Автор на обложке")
    if len(string_list) > 1:
        string = string_list[0]
    else:
        string = ""

    string = " ".join(
        token.lemma_ for token in nlp(string) if (
            (not token.is_stop) and (not token.is_punct) and (str(token) != '|')
            and (len(str(token)) > 3)
        )
    )
    
    return string

In [250]:
with Session() as session:
    true_items = session.query(TrueItem).all()
    df_true_items = pd.DataFrame([item.dict() for item in true_items])    

    unknown_items = session.query(Item).all()
    df_unknown_items = pd.DataFrame([item.dict() for item in unknown_items])  

In [249]:
all_authors = []
all_authors_s = set()

for authors_list in df_true_items["author"]:
    if authors_list:
        for author in authors_list:
            if author not in all_authors_s:
                all_authors.append(author)
                all_authors_s.add(author)

del all_authors_s

In [364]:
df_true_items["description"] = df_true_items["description"].fillna("")
df_unknown_items["description"] = df_unknown_items["description"].fillna("")

df_true_items["class_"] = df_true_items["class_"].fillna(0)
df_unknown_items["class_"] = df_unknown_items["class_"].fillna(0)

median_year = df_true_items["year"].median()
df_true_items["year"] = df_true_items["year"].fillna(median_year)
median_year = df_unknown_items["year"].median()
df_unknown_items["year"] = df_unknown_items["year"].fillna(median_year)

df_true_items["papper_type"] = df_true_items["papper_type"].fillna("")
df_unknown_items["papper_type"] = df_unknown_items["papper_type"].fillna("")

df_true_items["preview_type"] = df_true_items["preview_type"].fillna("")
df_unknown_items["preview_type"] = df_unknown_items["preview_type"].fillna("")

df_true_items["book_type"] = df_true_items["book_type"].fillna("")
df_unknown_items["book_type"] = df_unknown_items["book_type"].fillna("")

avg_pages_count = df_true_items["pages_count"].mean()
df_true_items["pages_count"] = df_true_items["pages_count"].fillna(avg_pages_count)
avg_pages_count = df_unknown_items["pages_count"].mean()
df_unknown_items["pages_count"] = df_unknown_items["pages_count"].fillna(avg_pages_count)

In [252]:
df_true_items["title"] = df_true_items["title"].apply(preprocessing_text)
df_unknown_items["title"] = df_unknown_items["title"].apply(preprocessing_text)

In [254]:
df_true_items["description"] = df_true_items["description"].apply(preprocessing_description)
df_unknown_items["description"] = df_unknown_items["description"].apply(preprocessing_description)

In [255]:
df_true_items_1 = df_true_items.copy()
df_unknown_items_1 = df_unknown_items.copy()

In [363]:
df_true_items = df_true_items_1.copy()
df_unknown_items = df_unknown_items_1.copy()

In [365]:
# TF-IDF for titles

all_true_titles = ' '.join(df_true_items["title"])
all_unknown_titles = ' '.join(df_unknown_items["title"])

tfidf_titles = TfidfVectorizer(max_features=50)
transformed = tfidf_titles.fit_transform([all_true_titles, all_unknown_titles])

In [366]:
df = pd.DataFrame(
  transformed[0].T.todense(),
  index=tfidf_titles.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_titles = df

In [367]:
# TF-IDF for descriptions

all_true_descriptions = ' '.join(df_true_items["description"])
all_unknown_descriptions = ' '.join(df_unknown_items["description"])

tfidf_descriptions = TfidfVectorizer(max_features=200)
transformed_descriptions = tfidf_descriptions.fit_transform([all_true_descriptions, all_unknown_descriptions])

In [368]:
df = pd.DataFrame(
  transformed_descriptions[0].T.todense(),
  index=tfidf_descriptions.get_feature_names_out(), columns=["TF-IDF"]
)

df = df.sort_values('TF-IDF', ascending=False)

useful_words_in_descriptions =  df

In [369]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_true_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_true_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_true_items.loc[df_true_items.index[i], "title"] = " ".join(final_words)

In [370]:
useful_words_in_descriptions_s = set(useful_words_in_descriptions.index)

for i, string in enumerate(df_unknown_items["description"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "description"] = " ".join(final_words)
    
    
useful_words_in_titles_s = set(useful_words_in_titles.index)

for i, string in enumerate(df_unknown_items["title"]):
    words_list = string.split()
    
    final_words = []
    
    for word in words_list:
        if word in useful_words_in_descriptions_s:
            final_words.append(word)
    
    df_unknown_items.loc[df_unknown_items.index[i], "title"] = " ".join(final_words)

In [371]:
# True items
# Векторизация всех описаний сразу
tfidf_matrix = tfidf_descriptions.transform(df_true_items["description"])

# Преобразование разреженной матрицы в плотную (если необходимо)
tfidf_dense = tfidf_matrix.toarray()

# Добавление векторов в DataFrame
df_true_items["description_vector"] = list(tfidf_dense)


# Unknown items
tfidf_matrix = tfidf_descriptions.transform(df_unknown_items["description"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["description_vector"] = list(tfidf_dense)

In [372]:
# True items
tfidf_matrix = tfidf_titles.transform(df_true_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_true_items["title_vector"] = list(tfidf_dense)

# Unknown items
tfidf_matrix = tfidf_titles.transform(df_unknown_items["title"])
tfidf_dense = tfidf_matrix.toarray()

df_unknown_items["title_vector"] = list(tfidf_dense)

In [373]:
papper_types = list(set(np.concatenate([df_true_items["papper_type"].unique(), df_unknown_items["papper_type"].unique()])))
book_types = list(set(np.concatenate([df_true_items["book_type"].unique(), df_unknown_items["book_type"].unique()])))
preview_types = list(set(np.concatenate([df_true_items["preview_type"].unique(), df_unknown_items["preview_type"].unique()])))

In [374]:
papper_types_label_encoder = LabelEncoder()
encoded_papper_types = papper_types_label_encoder.fit_transform(papper_types)

book_types_label_encoder = LabelEncoder()
encoded_book_types = book_types_label_encoder.fit_transform(book_types)

preview_types_label_encoder = LabelEncoder()
encoded_preview_types = preview_types_label_encoder.fit_transform(preview_types)

In [375]:
df_true_items["papper_type"] = papper_types_label_encoder.transform(df_true_items["papper_type"])
df_true_items["book_type"] = book_types_label_encoder.transform(df_true_items["book_type"])
df_true_items["preview_type"] = preview_types_label_encoder.transform(df_true_items["preview_type"])

In [376]:
df_unknown_items["papper_type"] = papper_types_label_encoder.transform(df_unknown_items["papper_type"])
df_unknown_items["book_type"] = book_types_label_encoder.transform(df_unknown_items["book_type"])
df_unknown_items["preview_type"] = preview_types_label_encoder.transform(df_unknown_items["preview_type"])

In [377]:
df_true_items

Unnamed: 0,id,title,url,price,image,description,year,papper_type,preview_type,book_type,pages_count,circulation,isbn,class_,subject,original_name,author,description_vector,title_vector
0,455,русский язык тетрадь учебный достижение класс ...,/product/russkiy-yazyk-tetrad-uchebnyh-dostizh...,289,fef768ad-fb55-44aa-b748-3a755f47fc97,,2024.0,11,4,4,112.0,,[9785090895163],4.0,"Русский язык, Русский (неродной) и родной (нер...",,[Канакина Валентина Павловна],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,3,история россия контурный карта класс,/product/istoriya-rossii-konturnye-karty-6-kla...,119,5d552dd2-ac31-42cf-9edc-195d5061f067,,2024.0,11,4,4,16.0,,[9785091121476],6.0,"Исторические дисциплины, История",История России. Контурные карты. 6 класс,[Тороп Валерия Валерьевна],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,5,математика задание класс школа россия,/product/matematika-letnie-zadaniya-perehodim-...,164,829a6739-0ae2-4b5a-84b2-5f8896b0135d,,2024.0,11,4,4,48.0,,[9785091158885],1.0,Математика,Математика. Летние задания. Переходим во 2-й к...,[Светин Андрей Валентинович],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,8,русский язык задание класс школа россия,/product/russkiy-yazyk-letnie-zadaniya-perehod...,185,15416e32-b929-4e7a-98df-67061d0d2231,,2024.0,11,4,4,80.0,,[9785091129922],1.0,Русский язык,Русский язык. Летние задания. Переходим во 2-й...,[Никишенкова Александра Викторовна],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,10,математика рабочий тетрадь класс часть фгос,/product/matematika-rabochaya-tetrad-1-klass-c...,227,544f263e-0d55-439f-b24d-f7cd6660ff4e,,2025.0,11,4,4,48.0,,[9785091233582],1.0,Математика,Математика. Рабочая тетрадь. 1 класс. В 2-х ч....,"[Моро Мария Игнатьевна, Волкова Светлана Ивано...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2187,класс учебный пособие часть,/product/biologiya-9-klass-uchebnoe-posobie-v-...,3932,397ccc42-d28f-45a9-80c7-6ccfcd4b4ebe,учебный пособие выполнить крупный шрифт предна...,2023.0,11,0,4,208.0,,[9785090953634],9.0,Биология,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2186,1071,немецкий язык учебный пособие немецкий язык на...,/product/nemetskiy-yazyk-chitaem-i-pishem-a1-u...,755,501889ae-cf15-4ab2-ad88-62073f3aa464,,2025.0,0,2,4,120.0,,[9785091191943],0.0,Иностранный язык,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2187,2188,класс учебный пособие часть,/product/biologiya-9-klass-uchebnoe-posobie-v-...,3932,3105ae1d-3bfc-4aca-bdab-1e46bc7b573a,учебный пособие выполнить крупный шрифт предна...,2023.0,11,0,4,208.0,,[9785090953641],9.0,Биология,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2188,1048,класс базовый уровень проверочный контрольный ...,/product/himiya-10-klass-bazovyy-uroven-prover...,393,412dadad-253f-4a75-b48a-b9eb8adfd47f,,2023.0,11,4,4,144.0,,[9785090967839],10.0,Химия,,"[Габриелян Олег Сергеевич, Лысова Галина Георг...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4082482904638..."


In [408]:
X_positive = df_true_items.drop(columns=[
    "id", "title", "url", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn"
])

X_unlabeled = df_unknown_items.drop(columns=[
    "id", "title", "url", "image", "circulation", "subject", "original_name", 
    "author", "description", "isbn"
])

In [379]:
X_positive

Unnamed: 0,price,year,papper_type,preview_type,book_type,pages_count,class_,description_vector,title_vector
0,289,2024.0,11,4,4,112.0,4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,119,2024.0,11,4,4,16.0,6.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,164,2024.0,11,4,4,48.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,185,2024.0,11,4,4,80.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,227,2025.0,11,4,4,48.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...
2185,3932,2023.0,11,0,4,208.0,9.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2186,755,2025.0,0,2,4,120.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2187,3932,2023.0,11,0,4,208.0,9.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2188,393,2023.0,11,4,4,144.0,10.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4082482904638..."


In [414]:
for i in range(200):
    X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["description_vector"])

for i in range(50):
    X_positive[f"title_vector_{i}"] = X_positive["title_vector"].apply(lambda x: x[i])
    
X_positive = X_positive.drop(columns=["title_vector"])

for i in range(200):
    X_unlabeled[f"description_vector_{i}"] = X_unlabeled["description_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["description_vector"])

for i in range(50):
    X_unlabeled[f"title_vector_{i}"] = X_unlabeled["title_vector"].apply(lambda x: x[i])
    
X_unlabeled = X_unlabeled.drop(columns=["title_vector"])

  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vector_{i}"] = X_positive["description_vector"].apply(lambda x: x[i])
  X_positive[f"description_vec

In [409]:
X_positive

Unnamed: 0,price,year,papper_type,preview_type,book_type,pages_count,class_,description_vector,title_vector
0,289,2024.0,11,4,4,112.0,4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,119,2024.0,11,4,4,16.0,6.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,164,2024.0,11,4,4,48.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,185,2024.0,11,4,4,80.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,227,2025.0,11,4,4,48.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...
2185,3932,2023.0,11,0,4,208.0,9.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2186,755,2025.0,0,2,4,120.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2187,3932,2023.0,11,0,4,208.0,9.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2188,393,2023.0,11,4,4,144.0,10.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4082482904638..."


In [463]:
X_positive = pd.concat([X_positive, X_unlabeled], ignore_index=True)

In [464]:
X_train, X_test, y_train, y_test = train_test_split(
    X_positive, [1] * len(X_positive), test_size=0.2, random_state=324
)

In [425]:
from sklearn.ensemble import IsolationForest

In [466]:
from sklearn.neighbors import LocalOutlierFactor

model = LocalOutlierFactor(novelty=False)
y = model.fit_predict(X_train)

# y = model.predict(X_test)

In [448]:
model = IsolationForest(contamination=0.03)
model.fit(X_train, y_train)

In [452]:
y_predicted = model.predict(X_test)

AttributeError: This 'LocalOutlierFactor' has no attribute 'predict'

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predicted)

ValueError: Found input variables with inconsistent numbers of samples: [5736, 438]

In [461]:
predicted = model.predict(X_unlabeled)



In [446]:
df_unknown_items.loc[df_unknown_items.index[429]]

id                                                                  372
title                                  учебник русский язык класс часть
url                   /product/uchebnik-russkiy-yazyk-2-klass-chast-...
price                                                              1039
image                              57dddb01-8d08-40a5-a261-badbaf3af128
description           учебник русский язык класс часть просвещение ш...
year                                                             2024.0
papper_type                                                           0
preview_type                                                          4
book_type                                                             4
pages_count                                                       144.0
circulation                                                        None
isbn                                                    [9785091106268]
class_                                                          

In [469]:
for i, item in enumerate(predicted):
    if item == -1 :
        print("www.ozon.ru" + df_unknown_items.loc[df_unknown_items.index[i], "url"])

www.ozon.ru/product/okruzhayushchiy-mir-1-klass-pleshakov-chast-2-b-u-uchebnik-fgos-shkola-rossii-1832608177/?at=Z8tXKo3qZhyKG0GOty2O4jKF8Oq0JXcp7jl6PHXNzoy8
www.ozon.ru/product/koty-voiteli-tsikl-voiteli-stan-dikim-hanter-erin-285879805/?at=K8tZ7rjygTo3JMwRfQoGBgjU6ZyP8MSZnqQnWtE4rQxO
www.ozon.ru/product/koty-voiteli-tsikl-voiteli-ogon-i-led-hanter-erin-289049231/?at=46tR47p2NTPmOOZ7hpvg265UkE2DMINwmyXJS8OWYv0
www.ozon.ru/product/gotovimsya-k-shkole-rabochaya-tetrad-dlya-detey-6-7-let-komplekt-chast-1-2-fgos-do-shevelev-1178279234/?at=r2t4QD5RKFmYJQy7tKJMxKrf4wjzrMFQ5No7rfgAOZWA
www.ozon.ru/product/rabochie-tetradi-shkola-rossii-2-klass-novyy-fgos-komplekt-kanakina-valentina-pavlovna-volkova-1140680585/?at=ywtAOylPvs53LvOZTVV5K80H5pY54RhNKy3wyI16ED7Y
www.ozon.ru/product/koty-voiteli-tsikl-nachalo-plemen-pervaya-bitva-hanter-erin-474850004/?at=ywtAOylPvskKpnNyhKrgBAqHZq432ASVJVKrRf9YOrx1
www.ozon.ru/product/russkiy-yazyk-4-klass-rabochaya-tetrad-chast-2-klimanova-l-f-1849192961/?at=qQt

In [1]:
from PIL import Image

In [None]:
def resize_image(image_path: str, target_size=(224, 224)):
    img = Image.open(image_path)
    img_resized = img.resize(target_size)
    img_resized_rgb = img.convert('RGB')