<a href="https://colab.research.google.com/github/sweonurulu/colab_notebooks/blob/main/muzayede2_ysa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
import numpy as np
import re
import datetime
import string
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from snowballstemmer import TurkishStemmer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from PIL import Image
from io import BytesIO
import tensorflow as tf


#pd.set_option('display.max_colwidth', None)
#pd.reset_option('display.max_colwidth')

## GET DATASET

In [None]:
no_of_day = 7

In [None]:
# Verileri al
response = requests.get("http://95.70.184.79:3066/data/get_item_data/{0}".format(no_of_day))

In [None]:
# Yanıtın JSON verisini al
data = response.json()

In [None]:
# JSON verisinden item_data ve auction_data'yı DataFrame'lere dönüştür
item_data = pd.DataFrame(data["item_data"])
auction_data = pd.DataFrame(data["auction_data"])

In [None]:
item_df = item_data.copy()
auction_df = auction_data.copy()

In [None]:
dataset=pd.merge(left=auction_df,right=item_df,how="outer",on="auction_id")

In [None]:
df=dataset.copy()

In [None]:
# Değiştirme işlemi yapılmayacak sütunlar listesi
excluded_columns = ["item_description", "item_subdescription"]

In [None]:
# Belirtilen sütunlar hariç tüm sütunlarda boş string ve tek boşlukları np.nan ile değiştir
df.loc[:, ~df.columns.isin(excluded_columns)] = df.loc[:, ~df.columns.isin(excluded_columns)].replace({"": np.nan, " ": np.nan})

## CATEGORIZATION

 ### item_name' a göre kategorilendirme

In [None]:
# Müzayede açıklamaları için Kategorileri belirleyen kelimeler ve bu kelimelere karşılık gelen kategoriler
categories_item_name = {
    'obje': 'obje',

    'efemera' : 'efemera',
    'kartpostal' : 'efemera',
    'pul' : 'efemera',
    'mnh' : 'efemera',
    'fdc' : 'efemera',
    'filateli' : "efemera",
    'fi̇lateli̇' : "efemera",

    'tablo' : 'eser',
    'resim' : 'eser',
    'heykel' : 'eser',

    'para': 'nümismatik',
    'banknot': 'nümismatik',

    'kitap' : 'kitap',
    'dergi' : 'kitap',
    'çizgi roman' : 'kitap'
}

In [None]:
def categorize_item_name(description):
    description = description.lower()  # Küçük harfe dönüştürme
    for keyword, category in categories_item_name.items():
        if keyword in description:
            return category
        else:
            return np.nan

In [None]:
df['item_category'] = df['item_name'].apply(categorize_item_name)

 ### auction_description'a göre kategorilendirme

In [None]:
# Müzayede açıklamaları için Kategorileri belirleyen kelimeler ve bu kelimelere karşılık gelen kategoriler
categories = {
    'para': 'nümismatik',
    'banknot': 'nümismatik',
    'nümismatik': 'nümismatik',
    'nümi̇smati̇k' : "nümismatik",
    'numismatik' : 'nümismatik',
    'numi̇smati̇k' : 'nümismatik',
    'nümizmatik' : 'nümismatik',
    'kağit' : 'nümismatik',

    'fdc' : 'efemera',
    'pul' : 'efemera',
    'kartpostal' : 'efemera',
    'filateli' : "efemera",
    'fi̇lateli̇' : "efemera",
    'efemera' : 'efemera',
    'fila' : 'efemera',

    'obje': 'obje',

    'resim' : 'eser',
    'heykel' : 'eser',
    'tablo' : 'eser',
    'eser' : 'eser',
    'sanat' : 'eser',
    'resi̇m' : 'eser',
    'antika' : 'eser',
    'art' : 'eser',

    'kitap' : 'kitap',
    'kitabhane' : 'kitap',
    'çi̇zgi̇ roman' : 'kitap',
    'çizgi roman' : 'kitap',
    'çi̇zgi roman' : 'kitap',
    'çizgi roman' : 'kitap',
    'dergi' : 'kitap',
    'ki̇tap' : 'kitap'
}

In [None]:
def categorize_auctions(description):
    description = description.lower()  # Küçük harfe dönüştürme
    for keyword, category in categories.items():
        if keyword in description:
            return category
    return 'diğer'

In [None]:
df['item_category'].fillna(df['auction_description'].apply(categorize_auctions), inplace=True)

In [None]:
df["item_category"].value_counts()

item_category
nümismatik    4791
efemera       4246
eser          3358
diğer         3298
obje          1731
kitap          388
Name: count, dtype: int64

In [None]:
pd.crosstab(df["item_category"], df["item_sold"])

item_sold,0,1
item_category,Unnamed: 1_level_1,Unnamed: 2_level_1
diğer,1772,1526
efemera,2318,1928
eser,1765,1593
kitap,56,332
nümismatik,1695,3096
obje,916,815


### CATEGORY TO INT

In [None]:
dummy_cols = pd.get_dummies(df["item_category"], drop_first=True)

In [None]:
#df = pd.concat([df, dummy_cols], axis=1)

In [None]:
#df.drop("item_category",axis=1,inplace=True)

## PREPROCESSING

**takip sayısı sütunundaki boş değerleri 0 ile dolduralım.**

In [None]:
df["item_tracking_no"] = df["item_tracking_no"].replace(np.nan,0)

**veritiplerini düzenleyelim.**

In [None]:
df["auction_account"]=df["auction_account"].astype("string")
df["auction_datetime"] = pd.to_datetime(df["auction_datetime"])
df["auction_description"]=df["auction_description"].astype("string")
df["item_name"]=df["item_name"].astype("string")
df["item_img"]=df["item_img"].astype("string")
df["item_description"]=df["item_description"].astype("string")
df["item_reserve_price"]=df["item_reserve_price"].astype("string")
df["item_sold"]=df["item_sold"].astype("int16")
df["item_tracking_no"]=df["item_tracking_no"].astype("uint")
df["item_subdescription"]=df["item_subdescription"].astype("string")

**Tarih sütununu parçalara ayıralım.**

In [None]:
df[["auction_datetime_day", "auction_datetime_month", "auction_datetime_year",
    "auction_datetime_hour", "auction_datetime_minute"]] = df["auction_datetime"].apply(
        lambda dt: pd.Series([dt.day, dt.month, dt.year, dt.hour, dt.minute]))

**USD verileri TL birimine dönüştürelim.**

In [None]:
for i in range(df.shape[0]):
    if df["item_reserve_price"][i].endswith("TL")==False:
        df = df.drop(i)

**Açılış fiyatı kısmındaki TL ibaresini silelim ve Açılış fiyatı verilerini integer formate çevirelim.**

In [None]:
df["item_reserve_price"] = df["item_reserve_price"].str.replace(".", "").str.replace(" TL", "").astype(int)

**İhtiyaç duymayacağımız sütunları silelim.**

In [None]:
df.drop(["auction_url","auction_id","item_id","estimate_price","item_price","auction_datetime","auction_account","item_name"],axis=1,inplace=True)

**İki DataFrame'deki açıklama ve alt açıklama sütunlarını birleştirerek tek bir sütun oluşturalım.**

In [None]:
df["item_merged_description"] = df['item_description'].str.cat(df['item_subdescription'], sep=' ')

**açıklama ve alt açıklama sütunları ile işimiz kalmadı. onları silelim.**

In [None]:
df.drop(["item_description","item_subdescription"],axis=1,inplace=True)

## NLP

In [None]:
str_cols =["auction_description","item_merged_description"]

**Tüm metinsel verileri küçük harfe dönüştürelim.**

In [None]:
for col in str_cols:
  df[col] = df[col].str.lower()

**Sütunların içindeki noktalama işaretlerini silme işlemi**

In [None]:
punctutations = string.punctuation

In [None]:
for col in str_cols:
    df[col] = df[col].apply(lambda x: ''.join(char if char not in punctutations else ' ' for char in x))

**Türkçe stopwordsleri indirelim.**

In [None]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
sw = stopwords.words("turkish")

**Stopwordsleri silelim.**

In [None]:
for col in str_cols:
  df[col] = df[col].apply(lambda x: " ".join(x for x in str(x).split() if x not in sw))

**Tokenization**

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
for col in str_cols:
  df[col] = df[col].apply(word_tokenize)

**Stemming**

In [None]:
stemmer=TurkishStemmer()

In [None]:
for col in str_cols:
  df[col] = df[col].apply(lambda x: [stemmer.stemWord(word) for word in x])

**Vectorization - Word2Vec**

In [None]:
# Word2Vec modelini eğitme
word2vec_model_auction = Word2Vec(df['auction_description'], vector_size=100, window=5, min_count=1, workers=4)
word2vec_model_item = Word2Vec(df['item_merged_description'], vector_size=100, window=5, min_count=1, workers=4)

# Her kelimenin vektörünü almak için bir fonksiyon tanımlama
def get_word_vector(model, word):
    try:
        return model.wv[word]
    except KeyError:
        # Eğer kelime modelde bulunmuyorsa rastgele bir vektör döndür
        return [0] * model.vector_size

# Her kelimenin vektörünü içeren bir sütun oluşturma
df['auction_description_word2vec'] = df['auction_description'].apply(lambda x: [get_word_vector(word2vec_model_auction, word) for word in x])
df['item_description_word2vec'] = df['item_merged_description'].apply(lambda x: [get_word_vector(word2vec_model_item, word) for word in x])

**vektörize ettiğimiz sütunları veriden silelim.**

In [None]:
df.drop(["auction_description","item_merged_description"],axis=1,inplace=True)

## IMAGE PROCESSING

In [None]:
df["item_category"].unique()

array(['eser', 'obje', 'nümismatik', 'kitap', 'diğer', 'efemera'],
      dtype=object)

In [None]:
df_eser = df[df["item_category"]=="eser"]
df_obje = df[df["item_category"]=="obje"]
df_numismatik = df[df["item_category"]=="nümismatik"]
df_kitap = df[df["item_category"]=="kitap"]
df_efemera = df[df["item_category"]=="efemera"]
df_diger = df[df["item_category"]=="diğer"]

KeyError: 'item_category'

In [None]:
df_eser.drop(["item_img","item_category"],axis=1,inplace=True)
df_obje.drop(["item_img","item_category"],axis=1,inplace=True)
df_numismatik.drop(["item_img","item_category"],axis=1,inplace=True)
df_kitap.drop(["item_img","item_category"],axis=1,inplace=True)
df_efemera.drop(["item_img","item_category"],axis=1,inplace=True)
df_diger.drop(["item_img","item_category"],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eser.drop(["item_img","item_category"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_obje.drop(["item_img","item_category"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numismatik.drop(["item_img","item_category"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [None]:
#df_eser = df[df["eser"]==True]

In [None]:
def download_and_convert_image(url):
  # URL'den görüntüyü indirme
  response = requests.get(url)
  # İndirilen görüntüyü açma
  img = Image.open(BytesIO(response.content))
  # Görüntüyü NumPy dizisine dönüştürme
  img_array = np.array(img)
  return img_array

In [None]:
# Verileri işlemeden önce görüntülerin boyutunu optimize etme
#df_eser['item_img_data'] = df_eser['item_img'].apply(download_and_convert_image)

## YSA

In [None]:
df_eser.head()

Unnamed: 0,item_lot_no,item_reserve_price,item_sold,item_tracking_no,item_bid_no,auction_datetime_day,auction_datetime_month,auction_datetime_year,auction_datetime_hour,auction_datetime_minute,auction_description_word2vec,item_description_word2vec
0,1,6500,0,17,0,19,5,2024,20,2,"[[0.16169731, 0.04864459, -0.3438917, -0.35814...","[[0.018508967, 0.030341908, -0.009048991, -0.0..."
1,2,7000,0,13,0,19,5,2024,20,2,"[[0.16169731, 0.04864459, -0.3438917, -0.35814...","[[0.044633817, 0.05498405, -0.010951872, -0.05..."
2,3,5000,0,6,0,19,5,2024,20,2,"[[0.16169731, 0.04864459, -0.3438917, -0.35814...","[[0.044628568, 0.059887193, 0.004412044, -0.08..."
3,4,5000,0,6,0,19,5,2024,20,2,"[[0.16169731, 0.04864459, -0.3438917, -0.35814...","[[0.073178135, 0.09534733, -0.016787391, -0.09..."
4,5,5000,0,4,0,19,5,2024,20,2,"[[0.16169731, 0.04864459, -0.3438917, -0.35814...","[[0.23480542, 0.47016644, -0.062148158, -0.613..."


In [None]:
X = df_eser.drop("item_sold",axis=1)
y = df_eser["item_sold"]

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [None]:
y_train.shape

(2686,)

In [None]:
# Model oluşturma
model = tf.keras.Sequential([
    tf.keras.layers.Reshape((-1, 1, X_train.shape[1])),  # Add a timestep dimension of 1
    tf.keras.layers.LSTM(64, return_sequences=True, input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Model derleme
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # İki sınıflı çapraz entropi kayıp fonksiyonu
              metrics=['accuracy'])

In [None]:
import numpy as np

def convert_to_np_array(word2vec_column):
    return np.array([np.array(vec) for vec in word2vec_column])

X_train['auction_description_word2vec'] = convert_to_np_array(X_train['auction_description_word2vec'])
X_train['item_description_word2vec'] = convert_to_np_array(X_train['item_description_word2vec'])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2686,) + inhomogeneous part.

In [None]:
# Model eğitimi
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2686 entries, 17323 to 5924
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   item_lot_no                   2686 non-null   int64 
 1   item_reserve_price            2686 non-null   int64 
 2   item_tracking_no              2686 non-null   uint64
 3   item_bid_no                   2686 non-null   int64 
 4   auction_datetime_day          2686 non-null   int64 
 5   auction_datetime_month        2686 non-null   int64 
 6   auction_datetime_year         2686 non-null   int64 
 7   auction_datetime_hour         2686 non-null   int64 
 8   auction_datetime_minute       2686 non-null   int64 
 9   auction_description_word2vec  2686 non-null   object
 10  item_description_word2vec     2686 non-null   object
dtypes: int64(8), object(2), uint64(1)
memory usage: 251.8+ KB
