In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from openai import AzureOpenAI
import tiktoken
from requests_ratelimiter import LimiterSession
from token_limiter import TokenLimiter

In [2]:
load_dotenv()

True

In [3]:
def read_data(filepath):
    return pd.read_csv(filepath, on_bad_lines='skip')

In [4]:
def clean_genre(genre_str):
    genre_str = genre_str.replace("[", "")
    genre_str = genre_str.replace("]", "")
    genre_str = genre_str.replace("'", "")
    return genre_str

In [5]:
def clean_author(author_str):
    pos = author_str.find("(")
    return author_str[:pos]

In [6]:
filepath = r"/home/ubuntu/notebooks/mlops_book_recommendation_system/data/books_1.Best_Books_Ever.csv"
df = read_data(filepath)

In [7]:
feature_df = df[["title", "author", "description", "genres", "likedPercent", "numRatings"]]

In [8]:
feature_df = feature_df[feature_df['description'].notna()]
feature_df['genres'] = feature_df['genres'].apply(lambda x: clean_genre(x))
feature_df = feature_df[feature_df['numRatings'] > 500]
feature_df['author'] = feature_df['author'].apply(lambda x: clean_author(x))
feature_df = feature_df[feature_df.notna()]
feature_df = feature_df.reset_index()
feature_df = feature_df.drop(columns=['index'])


In [9]:
feature_df

Unnamed: 0,title,author,description,genres,likedPercent,numRatings
0,The Hunger Games,Suzanne Collin,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"Young Adult, Fiction, Dystopia, Fantasy, Scien...",96.0,6376780
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré",There is a door at the end of a silent corrido...,"Fantasy, Young Adult, Fiction, Magic, Children...",98.0,2507623
2,To Kill a Mockingbird,Harper Le,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical Fiction, School,...",95.0,4501075
3,Pride and Prejudice,"Jane Austen, Anna Quindlen",Alternate cover edition of ISBN 9780679783268S...,"Classics, Fiction, Romance, Historical Fiction...",94.0,2998241
4,Twilight,Stephenie Meye,About three things I was absolutely positive.\...,"Young Adult, Fantasy, Romance, Vampires, Ficti...",78.0,4964519
...,...,...,...,...,...,...
37055,Attracted to Fire,DiAnn Mills,Special Agent Meghan Connors' dream of one day...,"Christian Fiction, Christian, Suspense, Romanc...",95.0,2143
37056,Elemental,Kim Richardson,When seventeen-year-old Kara Nightingale is su...,"Fantasy, Young Adult, Angels, Romance, Paranor...",94.0,1947
37057,Unbelievable,Sherry Gammon,Lilah Lopez Dreser's in town to take care of u...,"Romance, Young Adult, Contemporary, Contempora...",94.0,1028
37058,Fractured,Cheri Schmidt,The Fateful Trilogy continues with Fractured. ...,"Vampires, Paranormal, Young Adult, Romance, Fa...",94.0,871


In [10]:
feature_df.to_csv("/home/ubuntu/notebooks/mlops_book_recommendation_system/data/clean_books.csv", index=False)

In [11]:
feature_df = pd.read_csv("/home/ubuntu/notebooks/mlops_book_recommendation_system/data/clean_books.csv")

In [12]:
feature_df

Unnamed: 0,title,author,description,genres,likedPercent,numRatings
0,The Hunger Games,Suzanne Collin,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"Young Adult, Fiction, Dystopia, Fantasy, Scien...",96.0,6376780
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré",There is a door at the end of a silent corrido...,"Fantasy, Young Adult, Fiction, Magic, Children...",98.0,2507623
2,To Kill a Mockingbird,Harper Le,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical Fiction, School,...",95.0,4501075
3,Pride and Prejudice,"Jane Austen, Anna Quindlen",Alternate cover edition of ISBN 9780679783268S...,"Classics, Fiction, Romance, Historical Fiction...",94.0,2998241
4,Twilight,Stephenie Meye,About three things I was absolutely positive.\...,"Young Adult, Fantasy, Romance, Vampires, Ficti...",78.0,4964519
...,...,...,...,...,...,...
37055,Attracted to Fire,DiAnn Mills,Special Agent Meghan Connors' dream of one day...,"Christian Fiction, Christian, Suspense, Romanc...",95.0,2143
37056,Elemental,Kim Richardson,When seventeen-year-old Kara Nightingale is su...,"Fantasy, Young Adult, Angels, Romance, Paranor...",94.0,1947
37057,Unbelievable,Sherry Gammon,Lilah Lopez Dreser's in town to take care of u...,"Romance, Young Adult, Contemporary, Contempora...",94.0,1028
37058,Fractured,Cheri Schmidt,The Fateful Trilogy continues with Fractured. ...,"Vampires, Paranormal, Young Adult, Romance, Fa...",94.0,871


In [13]:
def count_tokens(text: str | list):
    encoding = tiktoken.encoding_for_model('text-embedding-3-small')
    num_tokens = 0

    if type(text) == str:
        num_tokens = len(encoding.encode(text))
    elif type(text) == list:
        num_tokens = sum([count_tokens(txt) for txt in text])
    else:
        print(text)
        raise ValueError(f"count_tokens does not support input of type {type(text)}. Please ensure your input is a string value or a list of strings.")
        
    return num_tokens 

In [14]:
def check_tokens(text):
    return count_tokens(text) <= 8191

In [15]:
categorical = "Title: " + feature_df.title + " Author: " + feature_df.author + " Description: " + feature_df.description + " Genres: " + feature_df.genres 

In [16]:
categorical

0        Title: The Hunger Games Author: Suzanne Collin...
1        Title: Harry Potter and the Order of the Phoen...
2        Title: To Kill a Mockingbird Author: Harper Le...
3        Title: Pride and Prejudice Author: Jane Austen...
4        Title: Twilight Author: Stephenie Meye Descrip...
                               ...                        
37055    Title: Attracted to Fire Author: DiAnn Mills  ...
37056    Title: Elemental Author: Kim Richardson  Descr...
37057    Title: Unbelievable Author: Sherry Gammon  Des...
37058    Title: Fractured Author: Cheri Schmidt  Descri...
37059    Title: Marked Author: Kim Richardson  Descript...
Length: 37060, dtype: object

In [17]:
categorical = categorical.dropna()

In [18]:
categorical

0        Title: The Hunger Games Author: Suzanne Collin...
1        Title: Harry Potter and the Order of the Phoen...
2        Title: To Kill a Mockingbird Author: Harper Le...
3        Title: Pride and Prejudice Author: Jane Austen...
4        Title: Twilight Author: Stephenie Meye Descrip...
                               ...                        
37055    Title: Attracted to Fire Author: DiAnn Mills  ...
37056    Title: Elemental Author: Kim Richardson  Descr...
37057    Title: Unbelievable Author: Sherry Gammon  Des...
37058    Title: Fractured Author: Cheri Schmidt  Descri...
37059    Title: Marked Author: Kim Richardson  Descript...
Length: 37030, dtype: object

In [19]:
categorical = categorical[categorical.apply(check_tokens)]

In [20]:
numerical = pd.DataFrame(feature_df[['likedPercent', 'numRatings']]) 

In [21]:
# scale the numerical features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(numerical)

In [22]:
def _get_embeddings(texts: str | list, limiter: TokenLimiter, session: LimiterSession, model="text-embedding-3-small"):

    if type(texts) != str and type(texts) != list:
        raise ValueError("Must pass a string or a list to create embeddings")

    with session:
        client = AzureOpenAI(
            api_key = os.getenv("AZURE_OPENAI_API_KEY"),
            api_version = "2024-10-21",
            azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
        )

        token_count = count_tokens(texts)
        limiter.wait_for_slot(token_count)

        response = client.embeddings.create(
            model = model,
            input = texts
        )

        return response.data

In [23]:
def get_embeddings(data: str | list, step=10) -> list:
    embeddings = [] 

    token_limiter = TokenLimiter()
    limiter_session = LimiterSession(per_minute=700) # Change to variable

    if type(data) == list:
        for i in range(0, len(data), step):
            batch = _get_embeddings(data[i:i+step], token_limiter, limiter_session)
            embeddings += [item.embedding for item in batch]
    elif type(data) == str:
        return _get_embeddings(data, token_limiter, limiter_session).data[0].embedding
    else:
        raise ValueError(f"Data of type {type(data)} is not supported. Please use string or list.")

    return embeddings

In [24]:
categorical_list = categorical.values.tolist()

In [None]:
embeddings = get_embeddings(categorical_list[:100])
emb_len = len(embeddings[-1])
numerical = scaled_data[:100]
numerical = pd.DataFrame(numerical, columns=[f"{emb_len+1}", f"{emb_len+2}"])

In [35]:
embeddings_df = pd.DataFrame(embeddings)
embeddings_df = pd.concat([embeddings_df, numerical], axis=1)

In [36]:
embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1528,1529,1530,1531,1532,1533,1534,1535,1537,1538
0,-0.027731,0.042470,0.008717,0.006005,0.003524,-0.011976,0.025499,0.004350,0.002784,-0.020771,...,-0.001522,-0.055551,-0.029830,0.029234,0.005229,0.024660,0.046270,-0.004074,0.943662,0.904697
1,-0.033636,0.057242,-0.046693,0.006161,-0.012700,-0.030221,0.000629,0.043018,-0.024903,0.001636,...,-0.011662,-0.023584,0.000232,0.015337,0.005674,-0.049114,0.039603,-0.022287,0.971831,0.355723
2,-0.000989,0.046005,-0.038520,0.031684,-0.003147,-0.049453,-0.034260,-0.022698,-0.021684,0.002296,...,-0.003405,-0.020000,-0.029737,-0.005801,-0.006395,-0.006998,0.031461,-0.015730,0.929577,0.638563
3,0.005111,0.006698,0.006078,-0.008062,0.012402,0.011119,-0.058505,0.027798,-0.016893,-0.038939,...,0.002295,-0.055383,-0.012798,0.016380,-0.008986,-0.007564,0.030963,-0.003395,0.915493,0.425334
4,-0.038795,-0.015637,-0.055804,0.010385,-0.018284,0.015993,0.032960,-0.002102,0.018587,-0.021256,...,0.010023,-0.030690,-0.020500,0.042707,0.008348,0.015194,0.037736,0.006570,0.690141,0.704319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.006093,-0.003435,-0.021544,-0.009148,-0.017442,-0.024461,0.001155,-0.034928,-0.008505,-0.061650,...,0.029428,-0.029849,0.000001,0.023551,-0.020458,-0.017165,0.036236,-0.009763,0.873239,0.188799
96,-0.002944,-0.006426,0.005605,-0.001191,-0.021401,-0.049449,0.001699,0.054636,-0.028529,-0.026761,...,0.035080,-0.025705,-0.021344,0.048566,-0.013803,-0.006085,0.047951,0.023572,0.943662,0.038293
97,0.028797,-0.002690,-0.010234,0.005035,-0.025384,-0.046558,-0.020694,0.024383,-0.031659,-0.012365,...,-0.018926,-0.000538,0.003985,0.048070,0.034765,-0.034132,0.050441,-0.011711,0.957746,0.031165
98,-0.038244,0.047560,-0.038172,-0.027755,-0.044087,-0.012632,-0.026749,0.004577,-0.034508,0.003739,...,0.000622,-0.020499,-0.047392,0.038244,0.043944,-0.003119,0.029743,0.012297,0.901408,0.068676


In [41]:
embeddings_df.to_csv("/home/ubuntu/notebooks/mlops_book_recommendation_system/data/open_ai_embeddings.csv", index=False)

In [42]:
df = pd.read_csv("/home/ubuntu/notebooks/mlops_book_recommendation_system/data/open_ai_embeddings.csv")

In [43]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1528,1529,1530,1531,1532,1533,1534,1535,1537,1538
0,-0.027731,0.042470,0.008717,0.006005,0.003524,-0.011976,0.025499,0.004350,0.002784,-0.020771,...,-0.001522,-0.055551,-0.029830,0.029234,0.005229,0.024660,0.046270,-0.004074,0.943662,0.904697
1,-0.033636,0.057242,-0.046693,0.006161,-0.012700,-0.030221,0.000629,0.043018,-0.024903,0.001636,...,-0.011662,-0.023584,0.000232,0.015337,0.005674,-0.049114,0.039603,-0.022287,0.971831,0.355723
2,-0.000989,0.046005,-0.038520,0.031684,-0.003147,-0.049453,-0.034260,-0.022698,-0.021684,0.002296,...,-0.003405,-0.020000,-0.029737,-0.005801,-0.006395,-0.006998,0.031461,-0.015730,0.929577,0.638563
3,0.005111,0.006698,0.006078,-0.008062,0.012402,0.011119,-0.058505,0.027798,-0.016893,-0.038939,...,0.002295,-0.055383,-0.012798,0.016380,-0.008986,-0.007564,0.030963,-0.003395,0.915493,0.425334
4,-0.038795,-0.015637,-0.055804,0.010385,-0.018284,0.015993,0.032960,-0.002102,0.018587,-0.021256,...,0.010023,-0.030690,-0.020500,0.042707,0.008348,0.015194,0.037736,0.006570,0.690141,0.704319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.006093,-0.003435,-0.021544,-0.009148,-0.017442,-0.024461,0.001155,-0.034928,-0.008505,-0.061650,...,0.029428,-0.029849,0.000001,0.023551,-0.020458,-0.017165,0.036236,-0.009763,0.873239,0.188799
96,-0.002944,-0.006426,0.005605,-0.001191,-0.021401,-0.049449,0.001699,0.054636,-0.028529,-0.026761,...,0.035080,-0.025705,-0.021344,0.048566,-0.013803,-0.006085,0.047951,0.023572,0.943662,0.038293
97,0.028797,-0.002690,-0.010234,0.005035,-0.025384,-0.046558,-0.020694,0.024383,-0.031659,-0.012365,...,-0.018926,-0.000538,0.003985,0.048070,0.034765,-0.034132,0.050441,-0.011711,0.957746,0.031165
98,-0.038244,0.047560,-0.038172,-0.027755,-0.044087,-0.012632,-0.026749,0.004577,-0.034508,0.003739,...,0.000622,-0.020499,-0.047392,0.038244,0.043944,-0.003119,0.029743,0.012297,0.901408,0.068676
