In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import LabelEncoder

# Load the existing dataset
data = pd.read_csv('Fashion_dataset.csv')

# Check for missing values
print(data.isnull().sum())

# Fill NaN values (if any) with appropriate values
data.fillna({
    'price': data['price'].mean(),
    'colour': 'unknown',
    'brand': 'unknown',
    'ratingCount': data['ratingCount'].mean(),
    'avg_rating': data['avg_rating'].mean(),
    'description': ''
}, inplace=True)

# Save the cleaned dataset (optional)
data.to_csv('Fashion_dataset_cleaned.csv', index=False)

# Encode categorical variables
label_encoders = {}
for column in ['colour', 'brand']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Save label encoders for future use
joblib.dump(label_encoders['colour'], 'label_encoder_colour.pkl')
joblib.dump(label_encoders['brand'], 'label_encoder_brand.pkl')

# Create a new column for combined features
data['combined_features'] = data['description'] + ' ' + data['brand'].astype(str) + ' ' + data['colour'].astype(str)

# Ensure there are no NaN values in the combined_features column
data['combined_features'] = data['combined_features'].fillna('')

# Compute TF-IDF features for the combined features
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_features'])

# Load user preferences (for demonstration)
preferred_brands = [0]  # Example brand IDs
preferred_colours = [0, 1]  # Example colour IDs
preferred_keywords = ['striped', 'high waist']  # Example keywords

# Convert preferred keywords to a single string
preferred_keywords_str = ' '.join(preferred_keywords)

# Compute similarity scores based on the user’s preferred features
preferred_features_str = preferred_keywords_str + ' ' + ' '.join(map(str, preferred_brands)) + ' ' + ' '.join(map(str, preferred_colours))
preferred_features_vector = tfidf_vectorizer.transform([preferred_features_str])

# Compute similarity scores
cosine_similarities = linear_kernel(preferred_features_vector, tfidf_matrix).flatten()

# Get top 10 recommendations
top_indices = cosine_similarities.argsort()[-10:][::-1]
recommendations = data.iloc[top_indices]

print(recommendations[['p_id', 'price', 'brand', 'colour', 'description']])


Unnamed: 0         0
p_id              18
name              18
price             18
colour            21
brand             18
img               18
ratingCount     7749
avg_rating      7749
description       18
p_attributes      18
dtype: int64
            p_id   price  brand  colour  \
7472  17060798.0  1499.0    404      34   
4652   6705745.0  1599.0    675      24   
4249  18143380.0  2399.0    751      22   
2078  16707888.0  2299.0    721       3   
2205  16707940.0  2299.0    721      31   
2144  16707884.0  2299.0    721       2   
2030  16707822.0  2299.0    721       3   
6928  18488956.0  3999.0    363       3   
4685  11023146.0  2099.0    675      47   
7095  19190906.0  3999.0    506       5   

                                            description  
7472  Pink striped tunic, striped with ethnic print ...  
4652  Maroon high-rise striped high-rise parallel tr...  
4249  Pick these fashionable trousers and opt for an...  
2078  Move from day to night with ease when you do