# Data Wrangling
### Dataset: 2023 Fashion Trends

Initial Imports

In [48]:
import pandas as pd
from pathlib import Path
import os

from dotenv import load_dotenv
load_dotenv()

# Custom Functions
from fncs.utilities import (create_openai_client,
                            response_generator,
                            prompt_builder,
                            calculate_total_cost
                            )
from fncs.retrieval import create_embeddings_batch

# Credentials and endpoint are read from the environment (load_dotenv() has
# already run in this cell); each value is None when its variable is unset.
api_key_voc = os.environ.get("OPENAI_API_VOC")
base_url_voc = os.environ.get("OPENAI_BASE_VOC")

# Name of the embedding model passed to the embedding helper later on.
emb_name = 'text-embedding-3-small'

# Client object handed to every call that talks to the API.
openai_client = create_openai_client(
    api_key=api_key_voc,
    base_url=base_url_voc,
)

Getting the project path:

In [9]:
# The project root is taken to be the directory the notebook kernel runs in.
proj_dir = Path.cwd()

Reading csv file:

In [23]:
# Raw dataset location, relative to the project directory.
raw_csv_path = proj_dir / "data" / "2023_fashion_trends.csv"
df = pd.read_csv(raw_csv_path)

In [24]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,URL,Trends,Source
0,https://www.refinery29.com/en-us/fashion-trend...,2023 Fashion Trend: Red. Glossy red hues took ...,7 Fashion Trends That Will Take Over 2023 — Sh...
1,https://www.refinery29.com/en-us/fashion-trend...,2023 Fashion Trend: Cargo Pants. Utilitarian w...,7 Fashion Trends That Will Take Over 2023 — Sh...
2,https://www.refinery29.com/en-us/fashion-trend...,"2023 Fashion Trend: Sheer Clothing. ""Bare it a...",7 Fashion Trends That Will Take Over 2023 — Sh...
3,https://www.refinery29.com/en-us/fashion-trend...,2023 Fashion Trend: Denim Reimagined. From dou...,7 Fashion Trends That Will Take Over 2023 — Sh...
4,https://www.refinery29.com/en-us/fashion-trend...,2023 Fashion Trend: Shine For The Daytime. The...,7 Fashion Trends That Will Take Over 2023 — Sh...


In [29]:
# Splitting "scheme://host/path" on '/' gives ['scheme:', '', 'host', ...],
# so the element at position 2 is the host part of the URL.
url_parts = df['URL'].str.split('/')
df['URL_processed'] = url_parts.str.get(2)

# Spot-check the last few rows of the new column.
df[['URL_processed']].tail(3)

Unnamed: 0,URL_processed
79,www.whowhatwear.com
80,www.whowhatwear.com
81,www.whowhatwear.com


In [31]:
# Show the extracted host next to the article title ('Source') and the trend text.
df[['URL_processed','Source', 'Trends']].head()

Unnamed: 0,URL_processed,Source,Trends
0,www.refinery29.com,7 Fashion Trends That Will Take Over 2023 — Sh...,2023 Fashion Trend: Red. Glossy red hues took ...
1,www.refinery29.com,7 Fashion Trends That Will Take Over 2023 — Sh...,2023 Fashion Trend: Cargo Pants. Utilitarian w...
2,www.refinery29.com,7 Fashion Trends That Will Take Over 2023 — Sh...,"2023 Fashion Trend: Sheer Clothing. ""Bare it a..."
3,www.refinery29.com,7 Fashion Trends That Will Take Over 2023 — Sh...,2023 Fashion Trend: Denim Reimagined. From dou...
4,www.refinery29.com,7 Fashion Trends That Will Take Over 2023 — Sh...,2023 Fashion Trend: Shine For The Daytime. The...


Creating a 'text' feature. This DataFrame column combines the article title, the trend description, and the source domain into a single text chunk, which will be used later by a custom RAG-style chatbot.

In [44]:
# Template for one chunk, filled in with str.format: the article title first,
# then the trend description, then the host the article came from.
text_chunk = """Title: {source}

{trends}

Source URL: {url}
"""

# Build one formatted chunk per row, walking the three source columns in step.
df['text'] = [
    text_chunk.format(source=title, url=host, trends=trend)
    for title, host, trend in zip(df['Source'], df['URL_processed'], df['Trends'])
]

# Print the first chunk in full to check the layout.
print(df['text'].iloc[0])

Title: 7 Fashion Trends That Will Take Over 2023 — Shop Them Now

2023 Fashion Trend: Red. Glossy red hues took over the Fall 2023 runways ranging from Sandy Liang and PatBo to Tory Burch and Wiederhoeft. Think: Juicy reds with vibrant orange undertones that would look just as good in head-to-toe looks (see: a pantsuit) as accent accessory pieces (shoes, handbags, jewelry).

Source URL: www.refinery29.com



Generating Embeddings from the feature 'text'

In [49]:
# Embed the 'text' column in batches of 10; the helper returns the resulting
# dataframe together with the total cost of the calls.
final_df, cost = create_embeddings_batch(
    client=openai_client,
    deployment_name=emb_name,
    df=df,
    chunk_column='text',
    batch_size=10,
)
print(f'Total Cost(eur): {cost}')

Total Cost(eur): 0.0013160000000000001


In [53]:
# Inspect the first two text chunks alongside the embeddings stored for them.
final_df[['text','embeddings']].head(2)

Unnamed: 0,text,embeddings
0,Title: 7 Fashion Trends That Will Take Over 20...,"[-0.0008604738395661116, 0.02634955383837223, ..."
1,Title: 7 Fashion Trends That Will Take Over 20...,"[0.01805400848388672, 0.049275610595941544, 0...."


Saving the final dataframe with only two features: the text chunks and their embeddings

In [54]:
# Persist only the chunk text and its embedding; the row index is not written.
# NOTE(review): if 'embeddings' holds Python lists, CSV stores them as text, so
# whatever reads this file has to parse them back into numbers - confirm downstream.
embeddings_csv_path = proj_dir / "data" / "2023_fashion_trends_embeddings.csv"
final_df[['text', 'embeddings']].to_csv(embeddings_csv_path, index=False)