<img src="robotlable.png" alt="robot" width="500"/>

In [2]:
# Load the dataset
import pandas as pd
from scipy.spatial import distance
reviews = pd.read_csv("womens_clothing_e-commerce_reviews.csv")

# Display the first few entries
reviews.head()

# take some columns necessary for analysis
reviews = reviews[["Review ID", "Clothing ID", "Age", "Title", "Review Text", "Rating"]]
reviews.head()

# check for missing values
reviews.isnull().sum()


Review ID        0
Clothing ID      0
Age              0
Title          190
Review Text     42
Rating           0
dtype: int64

In [3]:
# CREATE THE OPENAI CLIENT
from openai import OpenAI
import os
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

<img src="embedding.png" alt="embedding" width="500"/>

In [4]:
# MAKE THE EMBEDDING FUNCTION  .
def make_embedding(input_text, model="text-embedding-3-small"):
    
    # Ensure input is either a string or a list
    if isinstance(input_text, str):
        input_list = [input_text]
    elif isinstance(input_text, list):
        input_list = input_text
    else:
        raise TypeError("Input must be a string or a list of strings.")

    # Create embeddings
    response = client.embeddings.create(
        model=model,
        input=input_list
    )

    # Extract embeddings
    embeddings = [item.embedding for item in response.data]

    # Return a single vector if input was a string
    return embeddings[0] if isinstance(input_text, str) else embeddings

<img src="workflow.png" alt="wf" width="1000"/>

In [5]:
# Embed the reviews and store as a list in the variable embeddings
review_embeddings = []
for index, review_text in enumerate(reviews['Review Text']):
    if (review_text is None) or (not isinstance(review_text, str)) or (review_text.strip() == ''):
        continue
    else:
        result = make_embedding(review_text)
        review_embeddings.append({'review_id':reviews['Review ID'][index], 'embedding':result})
        
print(review_embeddings[:3])

[{'review_id': np.int64(0), 'embedding': [0.025623168796300888, -0.049799442291259766, -0.08634008467197418, 0.010608360171318054, 0.015442299656569958, -0.048010557889938354, -0.02525486797094345, 0.04319634661078453, 0.043590955436229706, -0.02234792709350586, -0.009036507457494736, 0.006228218786418438, -0.010411056689918041, -0.027517283335328102, 0.044248633086681366, 0.04135484620928764, -0.010154562070965767, 0.011752720922231674, 0.029858620837330818, 0.0220322422683239, 0.02150609903037548, -0.01874384842813015, -0.01679711788892746, -0.011732990853488445, -0.025636322796344757, -0.017902018502354622, 0.007944760844111443, -0.053403522819280624, 0.011266038753092289, -0.017033882439136505, 0.04627428576350212, -0.03369946405291557, 0.030937213450670242, -0.07055579125881195, 0.016876040026545525, 0.01694180816411972, -0.012357786297798157, 0.03380469232797623, 0.01631043665111065, -0.01002960279583931, 0.015705371275544167, 0.0643472969532013, 0.04085501283407211, -0.004422890

In [6]:
# reduce dimension of embedding to 2d
from sklearn.manifold import TSNE
import numpy as np

tsne = TSNE(n_components=2, perplexity=5)
embeddings = [review_embedding['embedding'] for review_embedding in review_embeddings]
embeddings_2d = tsne.fit_transform(np.array(embeddings))
print(embeddings_2d[:4])

[[ 42.500824  -19.912306 ]
 [ 37.276386   16.078156 ]
 [  7.7976065  31.51369  ]
 [ 47.933697   48.543224 ]]


In [7]:
# identify some reviews that discuss topics such as 'quality', 'fit', 'style', 'comfort'...
review_topics = ['quality', 'fit', 'style', 'comfort', 'durability', 'material', 'design']
embedded_topics = make_embedding(review_topics)
embeddings_topics_2d = tsne.fit_transform(np.array(embedded_topics))
print(embeddings_topics_2d)

[[-16.871653  93.89182 ]
 [ -2.29085   20.996143]
 [ 30.693079 -30.195644]
 [ 53.072414  67.66831 ]
 [-92.77455  104.64902 ]
 [-75.866554  -6.559054]
 [-20.514772 -75.16848 ]]


In [8]:
# make a function that finds the smallest distance between embedded_topics and embedding_2d
def find_minimum_distance(aEmbedding2d, listEmbeddingTopic):
    dist = []
    for index, embeddingTopic in enumerate(listEmbeddingTopic):
        dis = distance.cosine(aEmbedding2d, embeddingTopic)
        dist.append({'distance':dis, 'index':index})
    return min(dist, key=lambda x: x["distance"])

In [9]:
# assign topic in embedding_2d
index_topic = []
for index, embedding_2d in enumerate(embeddings_2d):
    assignedTopic = find_minimum_distance(embedding_2d, embeddings_topics_2d)
    index_topic.append({'index': index, 'topic': review_topics[assignedTopic['index']]})

In [10]:
# assign topic in review_embeddings
review_id_topic = []
for index, review_embedding in enumerate(review_embeddings):
    review_id_topic.append({'Review ID':review_embedding['review_id'], 'topic': index_topic[index]['topic']})

print(len(review_id_topic))

958


<img src="img_left_join.png" alt="lj" width="200"/>

In [12]:
# merge review data frame and a list have included the 'review id' and the 'topic'

dataFrameList = pd.DataFrame(review_id_topic)
reviewFrameTopics = pd.merge(reviews, dataFrameList, on='Review ID', how='left')
print(reviewFrameTopics.head())

   Review ID  Clothing ID  Age                    Title  \
0          0          767   33                      NaN   
1          1         1080   34                      NaN   
2          2         1077   60  Some major design flaws   
3          3         1049   50         My favorite buy!   
4          4          847   47         Flattering shirt   

                                         Review Text  Rating    topic  
0  Absolutely wonderful - silky and sexy and comf...       4    style  
1  Love this dress!  it's sooo pretty.  i happene...       5  comfort  
2  I had such high hopes for this dress and reall...       3      fit  
3  I love, love, love this jumpsuit. it's fun, fl...       5  comfort  
4  This shirt is very flattering to all due to th...       5   design  
