In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

file_path = "/kaggle/input/clothing-fit-dataset-for-size-recommendation/renttherunway_final_data.json"
data = pd.read_json(file_path, lines=True)

df = pd.DataFrame(data)

In [2]:
df['category'].unique()

array(['romper', 'gown', 'sheath', 'dress', 'leggings', 'top', 'jumpsuit',
       'sweater', 'jacket', 'shirtdress', 'maxi', 'shift', 'pants',
       'shirt', 'mini', 'skirt', 'pullover', 'blouse', 'suit', 'coat',
       'trench', 'bomber', 'cape', 'blazer', 'vest', 'duster', 'ballgown',
       'tank', 'poncho', 'frock', 'tunic', 'cardigan', 'culottes', 'down',
       'trouser', 'midi', 'pant', 'legging', 'print', 'knit', 'culotte',
       'sweatshirt', 'peacoat', 'kaftan', 'overalls', 'jogger', 'tee',
       'combo', 'henley', 'cami', 'blouson', 'turtleneck', 'trousers',
       'overcoat', 'hoodie', 't-shirt', 'caftan', 'tight', 'kimono',
       'for', 'crewneck', 'skirts', 'parka', 'buttondown', 'skort',
       'sweatershirt', 'sweatpants', 'jeans'], dtype=object)

In [3]:
df['category'].nunique()

68

In [4]:
def get_best_match(string, choices):
    return process.extractOne(string, choices)[0]

unique_categories = []

for category in df['category'].unique():
    if category not in unique_categories:
        similar_strings = [c for c in unique_categories if fuzz.ratio(category, c) >= 80]
        if similar_strings:
            for similar_string in similar_strings:
                df['category'] = df['category'].replace(similar_string, category)
        unique_categories.append(category)

In [5]:
df['category'].nunique()

59

In [6]:
df['category'].unique()

array(['romper', 'gown', 'sheath', 'dress', 'legging', 'top', 'jumpsuit',
       'sweater', 'jacket', 'shirtdress', 'maxi', 'skirts', 'pant',
       'mini', 'pullover', 'blouse', 'suit', 'coat', 'trench', 'bomber',
       'cape', 'blazer', 'vest', 'duster', 'ballgown', 'tank', 'poncho',
       'frock', 'tunic', 'cardigan', 'culotte', 'down', 'trousers',
       'midi', 'print', 'knit', 'sweatershirt', 'peacoat', 'caftan',
       'overalls', 'jogger', 'tee', 'combo', 'henley', 'cami', 'blouson',
       'turtleneck', 'overcoat', 'hoodie', 't-shirt', 'tight', 'kimono',
       'for', 'crewneck', 'parka', 'buttondown', 'skort', 'sweatpants',
       'jeans'], dtype=object)

In [7]:
df.dropna(inplace=True)

In [8]:
df

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
5,fit,734848,32b,364092,138lbs,8.0,date,Didn't actually wear it. It fit perfectly. The...,athletic,Traditional with a touch a sass,dress,"5' 8""",8,45.0,"April 30, 2016"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192539,fit,66386,34dd,2252812,140lbs,10.0,work,Fit like a glove!,hourglass,LOVE IT!!! First Item Im thinking of buying!,jumpsuit,"5' 9""",8,42.0,"May 18, 2016"
192540,fit,118398,32c,682043,100lbs,10.0,work,The pattern contrast on this dress is really s...,petite,LOVE it!,dress,"5' 1""",4,29.0,"September 30, 2016"
192541,fit,47002,36a,683251,135lbs,6.0,everyday,"Like the other DVF wraps, the fit on this is f...",straight & narrow,"Loud patterning, flattering fit",dress,"5' 8""",8,31.0,"March 4, 2016"
192542,fit,961120,36c,126335,165lbs,10.0,wedding,This dress was PERFECTION. it looked incredib...,pear,loved this dress it was comfortable and photog...,dress,"5' 6""",16,31.0,"November 25, 2015"


In [9]:
df['height_inches'] = df['height'].apply(lambda x: int(x.split("'")[0]) * 12 + int(x.split("'")[1].replace('"', '')))

# convert into inches 

# Convert weight to numeric values
df['weight_lbs'] = df['weight'].apply(lambda x: int(x.replace('lbs', '')))

# Group by item_id 
product_df = df.groupby('item_id').agg({
    'review_text': lambda x: ' '.join(x),
    'review_summary': lambda x: ' '.join(x),
    'height_inches': 'mean',
    'weight_lbs': 'mean',
    'age': 'mean',
    'rating': 'mean', 
    'rented for': lambda x: ', '.join(x.unique()) 
}).reset_index()


product_df.rename(columns={'height_inches': 'avg_height', 'weight_lbs': 'avg_weight', 'age': 'avg_age', 'rating': 'avg_rating'}, inplace=True)



In [10]:
product_df

# product_df.to_excel('product_df.xlsx', index=False)

Unnamed: 0,item_id,review_text,review_summary,avg_height,avg_weight,avg_age,avg_rating,rented for
0,123373,The dress was beautiful and very comfortable. ...,It was a very beautiful black tie wedding I lo...,65.297189,141.108434,35.178715,8.871486,"wedding, formal affair, party, other, vacation..."
1,123793,"Fit great, super flattering Limited range of a...",Stunning gown. Wore this for heart ball and re...,65.050222,132.706056,31.671344,9.528804,"formal affair, wedding, other, party, date, work"
2,124204,"This dress is a ""WOW."" It steals the show, sp...",When I opened the RTR box and saw these sequin...,65.090909,137.614876,33.800000,9.325620,"formal affair, party, other, wedding, vacation..."
3,124553,Loved the dress. A little short in the front ...,Lovely wedding guest dress Fabulous! Added a b...,65.450000,139.655556,33.211111,8.527778,"wedding, other, work, formal affair, date, par..."
4,125424,The dress would have been perfect in a size 10...,I didn't wear it but it would have been perfec...,65.680672,136.683473,32.742297,9.025210,"wedding, party, other, formal affair, date, va..."
...,...,...,...,...,...,...,...,...
5731,2963850,This skirt is super cute. The waistline hits a...,I wore this to work I was not able to wear the...,64.250000,134.875000,29.625000,9.750000,"everyday, other, work"
5732,2964470,I loved this sweater from Tory Burch. I didn't...,perfect sweater! Looks professional and dressy...,61.666667,153.333333,38.333333,10.000000,"everyday, work"
5733,2965009,"This fur coat was SO fun to wear, and actually...",omg so fun Nothing makes me feel more bad*** t...,66.000000,123.666667,34.000000,9.333333,"date, everyday, vacation"
5734,2965924,I ordered a size 10 and should have gone down ...,So many compliments on this piece. Really beau...,65.000000,141.333333,43.000000,9.333333,"other, work, date"


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np


product_df['combined_text'] = product_df['review_text'] + ' ' + product_df['review_summary']


scaler = StandardScaler()
product_df['normalized_rating'] = scaler.fit_transform(product_df['avg_rating'].values.reshape(-1, 1))


tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_text = tfidf_vectorizer.fit_transform(product_df['combined_text'])
tfidf_matrix_rating = product_df['normalized_rating'].values.reshape(-1, 1)


tfidf_matrix_combined = np.hstack([tfidf_matrix_text.toarray(), tfidf_matrix_rating])


item_item_similarity = cosine_similarity(tfidf_matrix_combined, tfidf_matrix_combined)

# Example: Get top 5 similar items for item_id 1
item_id = 1
similar_items_indices = item_item_similarity[item_id].argsort()[::-1][1:6]

# Get the product details for similar items
similar_items = product_df.iloc[similar_items_indices]

print(similar_items)


      item_id                                        review_text  \
1299   755371  I got a size 6r and 8r just in case the dress ...   
10     127865  The dress had a lot of stretch so it made it  ...   
2847  1515339  Loved it. RTR helped me pick this out after th...   
1210   709832  This dress was PERFECT for the event.  I've re...   
1456   834620  The 4L was perfect for my height wearing 3 inc...   

                                         review_summary  avg_height  \
1299  I wore this dress for my senior prom and it wa...   65.254125   
10    We had so much fun and the American Heart Asso...   65.130824   
2847  This. Dress. Is. Amazing.  **  This gown is ep...   65.323529   
1210  This dress was absolutely PERFECT.  Got compli...   65.187500   
1456  Looked fantastic! Great dress! Comfortable, so...   65.520833   

      avg_weight    avg_age  avg_rating  \
1299  134.438944  31.828383    9.425743   
10    136.253584  36.959677    9.446237   
2847  133.522059  29.970588    9.54

#### KNN

In [12]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import re

def convert_height(height):
    match = re.match(r"(\d+)' (\d+)\"", height)
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2))
        return feet * 12 + inches
    else:
        return None

df['height'] = df['height'].apply(convert_height)

def extract_numeric(text):
    if isinstance(text, str):
        match = re.match(r'(\d+)', text)
        if match:
            return float(match.group(1))
    return text

df['weight'] = df['weight'].apply(extract_numeric)

features = ['bust size', 'weight', 'height', 'size', 'age', 'body type', 'rating']
target = 'item_id'

numeric_features = ['weight', 'height', 'size', 'age']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['bust size', 'body type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

X = df[features]
y = df[target]
X_preprocessed = pipeline.fit_transform(X)

k = 5  
knn_model = NearestNeighbors(n_neighbors=k, metric='euclidean') 
knn_model.fit(X_preprocessed)

def recommend_items(new_user_features):
    new_user_features_preprocessed = pipeline.transform(pd.DataFrame([new_user_features]))
    distances, indices = knn_model.kneighbors(new_user_features_preprocessed)
    valid_indices = indices[0][indices[0] < len(df)]
    recommended_items = df.iloc[valid_indices][target]
    return recommended_items


new_user_features = {'bust size': '34C', 'weight': 130, 'height': 65, 'size': 8, 'age': 30, 'body type': 'hourglass', 'rating': 5}
recommended_items = recommend_items(new_user_features)
print('User_id   Item_id')
print(recommended_items)

User_id   Item_id
92907     2283350
66618     2653183
139948     158028
189120    2307629
119786    1126889
Name: item_id, dtype: int64


In [13]:
product_ratings1 = df.head(10000)
ratings_utility_matrix = product_ratings1.pivot_table(values='rating', index='user_id', columns='item_id', fill_value=0)
X = ratings_utility_matrix.T
X.head()

user_id,321,329,331,611,635,657,678,772,1066,1070,...,998704,999016,999063,999183,999274,999552,999621,999726,999910,999913
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123373,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
123793,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124204,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## SVD

In [14]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=50)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

(3136, 50)

In [15]:
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(3136, 3136)

In [18]:
def get_product_row(df, product_id):
    product_row = df[df['item_id'] == product_id] 
    return product_row

i=148089

product_names = list(X.index)
product_ID = product_names.index(i)
product_ID

print(get_product_row(df,i))

          fit  user_id bust size  item_id  weight  rating     rented for  \
223       fit   606812       32d   148089   132.0    10.0        wedding   
643     small   845469       34b   148089   154.0     6.0  formal affair   
841       fit   747133       32a   148089   110.0    10.0        wedding   
1003      fit   857032       32c   148089   118.0    10.0        wedding   
3184      fit   164355       34d   148089   140.0    10.0  formal affair   
...       ...      ...       ...      ...     ...     ...            ...   
187711    fit   851114       34b   148089   115.0    10.0  formal affair   
189703    fit   860698       36c   148089   150.0    10.0        wedding   
190731    fit   562629       36d   148089   160.0     8.0           date   
190776  small   524706       36c   148089   139.0     8.0          other   
191652    fit   393231       36d   148089   165.0    10.0        wedding   

                                              review_text          body type  \
223    

## K-means Clustering for users

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load data
# product_descriptions = pd.read_json('renttherunway_final_data.json', lines=True)
product_descriptions1 = df.head(500)

# Vectorize product descriptions
vectorizer = TfidfVectorizer(stop_words='english')
X_product = vectorizer.fit_transform(product_descriptions1["review_text"])

# K-means clustering on product descriptions
kmeans_product = KMeans(n_clusters=10, init='k-means++')
kmeans_product.fit(X_product)

# Extract product IDs and cluster indices
product_ids = product_descriptions1["item_id"].values
product_cluster_indices = kmeans_product.labels_


user_data = product_descriptions1[['age', 'size','weight','height']].fillna(0)  # Filling NaNs with 0 for simplicity

# K-means clustering on user data
kmeans_user = KMeans(n_clusters=5, init='k-means++')
kmeans_user.fit(user_data)

# Recommend 5 products to a new user
def recommend_products(new_user_data, num_recommendations=5):
    # Convert new_user_data dictionary to a list
    new_user_data_list = [new_user_data['age'], new_user_data['size'],new_user_data['weight'], new_user_data['height']]
    # Predict the cluster for the new user
    user_cluster = kmeans_user.predict([new_user_data_list])[0]
    # Find products belonging to the same cluster as the user
    cluster_products = product_ids[kmeans_user.labels_ == user_cluster]
    # Randomly select 5 products from the cluster for recommendation
    recommended_products = np.random.choice(cluster_products, size=num_recommendations, replace=False)
    return recommended_products

# Example usage:
new_user_data = {'age': 25, 'size': 8,'weight':130,'height':72}  # New user's age and size
recommended_products = recommend_products(new_user_data, num_recommendations=5)
print("Recommended Product IDs:", recommended_products)



Recommended Product IDs: [ 806803 2608150  181453 1212992  442171]


  "X does not have valid feature names, but"


In [21]:
correlation_product_ID = correlation_matrix[product_ID]
Recommend = list(X.index[correlation_product_ID > 0.60])

Recommend.remove(i)

Recommend = Recommend[:9]

for product_index in Recommend:
    ans = get_product_row(df, product_index)
    print(ans)

          fit  user_id bust size  item_id  weight  rating     rented for  \
2546    small   446593       34c   129831   125.0    10.0        wedding   
4477      fit   470049       32d   129831   115.0    10.0          other   
4556    small   509263       34d   129831   150.0    10.0  formal affair   
5939    small   478800       34a   129831   133.0     8.0        wedding   
6141    small   723099       32c   129831   120.0     8.0        wedding   
...       ...      ...       ...      ...     ...     ...            ...   
189097    fit   112150       34c   129831   118.0    10.0        wedding   
190107    fit   642220       32a   129831   102.0    10.0          party   
190250    fit   129263       34d   129831   130.0    10.0        wedding   
190950    fit   302599      32aa   129831    95.0    10.0          party   
192447    fit   450455       36d   129831   118.0    10.0        wedding   

                                              review_text  body type  \
2546    I rente

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


product_df['combined_text'] = product_df['review_text'] + ' ' + product_df['review_summary']

tokenized_text = product_df['combined_text'].apply(lambda x: x.split())

model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)


def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

word_vectors = np.array([average_word_vectors(review, model, model.wv.index_to_key, 100) for review in tokenized_text])

item_item_similarity = cosine_similarity(word_vectors, word_vectors)

In [None]:
item_id = 100
similar_items_indices = item_item_similarity[item_id].argsort()[::-1][1:6]


similar_items = product_df.iloc[similar_items_indices]

print(similar_items)

In [None]:
unique_user_ids = len(df['user_id'].unique())
print(unique_user_ids)