In [1]:
import pandas as pd
import numpy as np
import tensorflow
import random

from PIL import Image
from urllib.request import urlopen
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input
from tensorflow.keras.models import Model



In [2]:
# Load the raw interaction table and discard the leftover CSV index column.
df = pd.read_csv("/kaggle/input/fashion5k/database5K.csv").drop(columns=["Unnamed: 0"])

In [5]:
# Build the product catalogue: strip the per-user column, then keep only the
# first row seen for each ASIN.
items = (
    df.drop(columns=['user'])
      .drop_duplicates(subset=['asin'], keep='first')
)

In [7]:
def preprocess_data_for_vgg19(df, target_size=(224, 224), timeout=30):
    """Download every image listed in ``df['imUrl']`` and prepare it for VGG19.

    Each image is fetched over the network, resized to ``target_size``,
    converted to RGB, cast to ``float32`` and passed through the VGG19
    ``preprocess_input`` function. Rows whose image cannot be downloaded,
    decoded, or does not end up with the expected shape are dropped from the
    returned frame so that images and rows stay aligned one-to-one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``imUrl`` column holding the image URLs. It is not
        modified; a filtered copy is returned.
    target_size : tuple of int, optional
        ``(width, height)`` passed to ``Image.resize``. Defaults to
        ``(224, 224)``.
    timeout : float or None, optional
        Per-request timeout in seconds handed to ``urlopen`` so that a stalled
        server cannot hang the loop. ``None`` disables the timeout.

    Returns
    -------
    preprocessed_images : numpy.ndarray
        Stack of the preprocessed images, in the row order of the returned
        frame. When no image could be loaded this is an empty 1-D array.
    df : pandas.DataFrame
        ``df`` without the rows whose image was unusable.
    """
    preprocessed_images = []
    invalid_rows = []
    # PIL sizes are (width, height); NumPy image arrays are (height, width, channels).
    expected_shape = (target_size[1], target_size[0], 3)

    for index, row in df.iterrows():
        try:
            # `with` releases the connection even when decoding fails. Decoding
            # (resize/convert force the pixel data to be read) stays inside the
            # `try` so a corrupt or truncated image only invalidates its own row.
            with urlopen(row['imUrl'], timeout=timeout) as response:
                img = Image.open(response)
                img = img.resize(target_size)
                img = img.convert('RGB')
        except Exception:
            # Deliberately broad (bad URL, HTTP error, timeout, undecodable
            # data), but unlike a bare `except:` it lets KeyboardInterrupt
            # through so a long download loop can still be stopped.
            invalid_rows.append(index)
            continue

        img_arr = np.array(img).astype(np.float32)
        if img_arr.shape != expected_shape:
            # Defensive check: report and skip anything with an unexpected shape.
            print(index)
            invalid_rows.append(index)
            continue

        img_arr = preprocess_input(img_arr)
        preprocessed_images.append(img_arr)

    df = df.drop(invalid_rows)
    preprocessed_images = np.array(preprocessed_images)
    return preprocessed_images, df

In [None]:
# Fetch and preprocess every product image (one urlopen call per row).
# NOTE: this rebinds `items` to the rows whose image loaded successfully, so
# re-running the cell operates on the already-filtered frame.
images, items = preprocess_data_for_vgg19(items)

In [None]:
# Keep only the interaction rows whose image URL still appears in `items`,
# i.e. whose product survived image preprocessing.
mask = df['imUrl'].isin(items['imUrl'])
df = df.loc[mask]

In [None]:
# Load the saved Keras model that is used below to turn images into feature vectors.
# NOTE(review): the architecture is not visible here — presumably VGG19-based; confirm.
feat_extractor = load_model('/kaggle/input/modelvgg2/best_model.h5')

In [None]:
# Print the layer-by-layer summary of the loaded model.
feat_extractor.summary()

In [None]:
# Run the loaded model over all preprocessed images to obtain one feature
# row per image, then display the resulting array shape as the cell output.
imgs_features = feat_extractor.predict(images)
print("features successfully extracted!")
imgs_features.shape

In [None]:
# Distinct user ids (missing users excluded) and distinct product ASINs,
# both in order of first appearance.
user_list = pd.unique(df["user"].dropna())
item_list = pd.unique(items["asin"])

In [None]:
# Report the size of the user and item vocabularies.
n_users, n_items = len(user_list), len(item_list)
print('Number of users:', n_users)
print('Number of items:', n_items)

In [None]:
# Pad sparse users with synthetic ratings: every user with at most 30 rated
# items receives 30-40 random ratings (1-5) on items they have not rated.
# NOTE: results are random on every run; seed both `random` and `np.random`
# beforehand if reproducible output is required.
new_ratings = []
for user in user_list:
    user_items = df[df["user"] == user]["asin"].unique()
    if len(user_items) > 30:
        # Users with more than 30 rated items are left untouched.
        continue

    # Only computed for users that are actually padded.
    new_items = np.setdiff1d(item_list, user_items)

    # Never ask for more distinct items than are available.
    n = min(random.randint(30, 40), len(new_items))
    if n == 0:
        # Nothing left to recommend to this user; np.random.choice would raise.
        continue

    new_user_ratings = pd.DataFrame({
        "user": [user] * n,
        # replace=False: sampling with replacement produced repeated
        # (user, asin) pairs that are discarded by the later de-duplication,
        # leaving the user with fewer synthetic ratings than intended.
        "asin": np.random.choice(new_items, size=n, replace=False),
        "rating": np.random.randint(1, 6, size=n),
    })
    new_ratings.append(new_user_ratings)

if new_ratings:
    ratings = pd.concat(new_ratings)
else:
    # pd.concat raises on an empty list; fall back to an empty, correctly
    # labelled frame so downstream cells still work.
    ratings = pd.DataFrame(columns=["user", "asin", "rating"])

In [None]:
# Combine the observed ratings with the synthetic ones. Observed rows come
# first, so when a (user, asin) pair occurs more than once the observed
# rating is the one that is kept.
df1 = df.dropna(subset=['user'])
new_df = df1[['user', 'asin', 'rating']]
ratings = (
    pd.concat([new_df, ratings])
    .astype({'rating': int})
    .drop_duplicates(subset=['user', 'asin'])
    .reset_index(drop=True)
)

In [None]:
# Persist the NumPy arrays (relative to the current working directory).
for _npy_path, _array in (
    ("imgs_features.npy", imgs_features),
    ("images.npy", images),
):
    np.save(_npy_path, _array)

# Persist the tabular outputs as CSV without the index column.
for _csv_path, _frame in (
    ('/kaggle/working/fashion.csv', df),
    ('/kaggle/working/ratings.csv', ratings),
    ('/kaggle/working/items.csv', items),
):
    _frame.to_csv(_csv_path, index=False)