In [1]:
import pandas as pd
import os
import logging

# Read the movies dataset from a CSV file (the path is relative to the current working directory)
file_path = './datasets/movies.csv'
df = pd.read_csv(file_path) # Append .head(100) to this call to limit the number of rows while experimenting

# Perform the data cleaning
def clean_dataset(text, sep_token=' \n '):
    """Normalize one cell value to a whitespace-trimmed string.

    Missing values (as reported by ``pd.isna``) become an empty string; any
    other value is converted with ``str`` and stripped of leading and trailing
    whitespace. ``sep_token`` is accepted but not used.
    """
    return '' if pd.isna(text) else str(text).strip()

# Trim the free-text columns; their missing values become empty strings
for text_column in ('title', 'summary'):
    df[text_column] = df[text_column].apply(clean_dataset)

# Replace the missing values of every remaining column with an empty string
df = df.fillna('')

print(f"{len(df)} rows")
df.head()

40179 rows


Unnamed: 0,imdb_id,title,summary,date,genres,runtime,rating,votes,budget,revenue,language,adult,production,poster_link
0,114709,Toy Story (1995),"Led by Woody, Andy's toys live happily in his ...",10/30/95,"Animation, Adventure, Comedy",81.0,8.3,5415.0,30000000,373554033.0,en,False,Pixar Animation Studios,https://images-na.ssl-images-amazon.com/images...
1,113497,Jumanji (1995),When siblings Judy and Peter discover an encha...,12/15/95,"Action, Adventure, Family",104.0,6.9,2413.0,65000000,262797249.0,en,False,"TriStar Pictures, Teitler Film, Interscope Com...",https://images-na.ssl-images-amazon.com/images...
2,113228,Grumpier Old Men (1995),A family wedding reignites the ancient feud be...,12/22/95,"Comedy, Romance",101.0,6.6,92.0,0,0.0,en,False,"Warner Bros., Lancaster Gate",https://images-na.ssl-images-amazon.com/images...
3,114885,Waiting to Exhale (1995),"Cheated on, mistreated and stepped on, the wom...",12/22/95,"Comedy, Drama, Romance",127.0,5.7,34.0,16000000,81452156.0,en,False,Twentieth Century Fox Film Corporation,https://images-na.ssl-images-amazon.com/images...
4,113041,Father of the Bride Part II (1995),Just when George Banks has recovered from his ...,2/10/95,"Comedy, Family, Romance",106.0,5.9,173.0,0,76578911.0,en,False,"Sandollar Productions, Touchstone Pictures",https://images-na.ssl-images-amazon.com/images...


In [5]:
# Create a new column named 'metadata' with all the data combined
def generate_metadata(row):
    """Build one descriptive English paragraph from a movie record.

    ``row`` is anything indexable by column name (for example a DataFrame row
    passed by ``df.apply(..., axis=1)``). The fields read are title, summary,
    date, genres, runtime, rating, budget, revenue, language and production.
    Returns the sentences concatenated into a single string.
    """
    sentences = [
        f"The title of this film is '{row['title']}'. ",
        f"The brief summary of this film is '{row['summary']}'. ",
        f"The release date of this film is '{row['date']}'. ",
        f"The genre(s) of this film is/are '{row['genres']}'. ",
        f"The runtime or length of this film is {row['runtime']} minute(s). ",
        f"This film has a rating of {row['rating']} out of 10. ",
        f"The budget of this film is {row['budget']} and it made a revenue of {row['revenue']}. ",
        f"The language of this film is '{row['language']}'. ",
        f"The production company of this film is '{row['production']}'.",
    ]
    return "".join(sentences)

# Build the combined description row by row
df['metadata'] = df.apply(generate_metadata, axis='columns')

# Lift the column-width display limit so the whole text is shown
pd.set_option('display.max_colwidth', None)
print("A sample value from column 'metadata':")
df['metadata'].head(1)

A sample value from column 'metadata':


0    The title of this film is 'Toy Story (1995)'. The brief summary of this film is 'Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.'. The release date of this film is '10/30/95'. The genre(s) of this film is/are 'Animation, Adventure, Comedy'. The runtime or length of this film is 81.0 minute(s). This film has a rating of 8.3 out of 10. The budget of this film is 30000000 and it made a revenue of 373554033.0. The language of this film is 'en'. The production company of this film is 'Pixar Animation Studios'.
Name: metadata, dtype: object

In [6]:
from openai import OpenAI
import openai
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# OpenAI client configured with the API key read from the environment
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# Generate the vector embeddings for selected columns
def get_embedding(text, model='text-embedding-3-small'):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Newlines are replaced with spaces before the request. If the preprocessing
    or the request fails, the failure is logged and the embedding of the
    placeholder text 'No data available' is returned instead, so one bad row
    does not abort a long batch run.

    Args:
        text: The text to embed.
        model: Name of the embedding model to use.

    Returns:
        The value of ``data[0].embedding`` from the API response.

    Raises:
        Exception: Whatever the client raises if the fallback request also fails.
    """
    try:
        cleaned = text.replace('\n', ' ')
        return client.embeddings.create(input=[cleaned], model=model).data[0].embedding
    except Exception as e:
        # Record the failure instead of silently substituting a placeholder vector.
        # Only a short preview of the input is logged because the text can be long.
        logging.error(
            'Error generating embeddings: %s. Found issue in the following text: %r.',
            e,
            str(text)[:200],
        )
        return client.embeddings.create(input=['No data available'], model=model).data[0].embedding

# Register tqdm's pandas integration and use progress_apply so the long-running
# embedding pass (one API request per row) actually shows a progress bar.
tqdm.pandas(desc='Embedding metadata')
df['metadata_vector'] = df['metadata'].progress_apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

print('Conversion to vector embeddings has been completed.')

df.head()

Conversion to vector embeddings has been completed.


Unnamed: 0,imdb_id,title,summary,date,genres,runtime,rating,votes,budget,revenue,language,adult,production,poster_link,metadata,metadata_vector
0,114709,Toy Story (1995),"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",10/30/95,"Animation, Adventure, Comedy",81.0,8.3,5415.0,30000000,373554033.0,en,False,Pixar Animation Studios,"https://images-na.ssl-images-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL_.jpg","The title of this film is 'Toy Story (1995)'. The brief summary of this film is 'Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.'. The release date of this film is '10/30/95'. The genre(s) of this film is/are 'Animation, Adventure, Comedy'. The runtime or length of this film is 81.0 minute(s). This film has a rating of 8.3 out of 10. The budget of this film is 30000000 and it made a revenue of 373554033.0. The language of this film is 'en'. 
The production company of this film is 'Pixar Animation Studios'.","[-0.010617855004966259, 0.027610644698143005, -0.05589677765965462, -0.05243489146232605, -0.031494710594415665, -0.029721548780798912, 0.016813362017273903, 0.056952230632305145, -0.016496727243065834, -0.01569458283483982, -0.015135192312300205, 0.040339402854442596, -0.01767883263528347, -0.01882927678525448, -0.04551112279295921, 0.0074145556427538395, -0.03341563418507576, -0.02739955298602581, -0.01981084793806076, -0.04850860685110092, 0.034555524587631226, 0.040107205510139465, -0.021721217781305313, -0.005303650163114071, 0.03744746372103691, -0.00918771605938673, -0.018153786659240723, 0.017087779939174652, -0.008385571651160717, 0.03411223366856575, -0.022185616195201874, -0.014406929723918438, 0.017003344371914864, -0.004704680759459734, 0.03476661443710327, -0.014132511802017689, -0.008865803480148315, -0.060287460684776306, 0.018544305115938187, -0.0486774817109108, -0.06075185909867287, 0.0016781698213890195, 0.006929047405719757, 0.00290513364598155, -0.004395960830152035, 0.02355770580470562, -0.031220292672514915, 0.016391180455684662, -0.0007289220811799169, 0.0005587302730418742, -0.0229033250361681, 0.029426023364067078, 0.01855485886335373, 0.019916392862796783, -0.010517586953938007, -0.017140552401542664, -0.009710165672004223, -0.021087946370244026, -0.006522697862237692, -0.022945541888475418, -0.007235128432512283, 0.05294150859117508, 0.013150941580533981, 0.03833404555916786, 0.01662338152527809, -0.02387434057891369, -0.050281770527362823, -0.027019590139389038, 0.0012038757558912039, -0.020634101703763008, 0.02615411952137947, 0.07320620119571686, -0.03799629956483841, 0.03721526265144348, 0.01445970218628645, 0.040888238698244095, 0.0003176253230776638, 0.024190977215766907, 0.055474597960710526, -0.012559887953102589, -0.03073478303849697, 0.01975807547569275, -0.013488685712218285, -0.021826762706041336, -0.054588016122579575, 0.01855485886335373, 
0.035885393619537354, -0.02725178934633732, -0.008200868032872677, -0.003997527062892914, 0.013488685712218285, -0.007878954522311687, -0.06459371000528336, 0.006068853195756674, 0.019240902736783028, -0.04264029115438461, 0.022227834910154343, 0.0068498882465064526, 0.0601608082652092, -0.005250877235084772, ...]"
1,113497,Jumanji (1995),"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.",12/15/95,"Action, Adventure, Family",104.0,6.9,2413.0,65000000,262797249.0,en,False,"TriStar Pictures, Teitler Film, Interscope Communications","https://images-na.ssl-images-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UY268_CR10,0,182,268_AL_.jpg","The title of this film is 'Jumanji (1995)'. The brief summary of this film is 'When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.'. The release date of this film is '12/15/95'. The genre(s) of this film is/are 'Action, Adventure, Family'. The runtime or length of this film is 104.0 minute(s). This film has a rating of 6.9 out of 10. The budget of this film is 65000000 and it made a revenue of 262797249.0. The language of this film is 'en'. 
The production company of this film is 'TriStar Pictures, Teitler Film, Interscope Communications'.","[-0.004933763761073351, 0.09137537330389023, -0.016937144100666046, -0.013195083476603031, -0.034933581948280334, 0.015048842877149582, 0.026620447635650635, 0.06240606680512428, -0.018894530832767487, 0.013563532382249832, 0.039147716015577316, -0.0011398893548175693, 0.0042889779433608055, 0.026182914152741432, -0.03329858556389809, -0.025515099987387657, -0.04891161620616913, -0.022843843325972557, -0.018399426713585854, -0.002826695330440998, 0.05397779121994972, 0.05393173545598984, -0.04013792425394058, -0.019735055044293404, 0.005912456661462784, -0.01162341795861721, -0.015129441395401955, 0.023718910291790962, 0.05489891394972801, 0.015152469277381897, 0.03198598697781563, -0.0016565816476941109, 0.003926285542547703, 0.02922261878848076, 0.025837492197752, -0.011870969086885452, -0.01169825904071331, -0.07336742430925369, -0.01615419052541256, -0.017501331865787506, -0.016430526971817017, -0.02301655523478985, -0.0160390492528677, 0.004956791643053293, 0.02758762612938881, 0.04670092463493347, -0.026136858388781548, 0.019493259489536285, -0.02378799580037594, 0.04831288754940033, -0.0397925041615963, -0.0037046405486762524, -0.00816345028579235, 0.053793568164110184, 0.0009858892299234867, -0.00016182615945581347, -0.0025129381101578474, 0.04402966797351837, 0.007835300639271736, -0.03127211704850197, -0.0131375128403306, 0.024778202176094055, -0.053194839507341385, 0.021220365539193153, 0.016695350408554077, -0.013448392041027546, -0.043039459735155106, 0.0072077857330441475, 0.006701168138533831, -0.017616473138332367, 0.006148494314402342, 0.025422988459467888, -0.028485720977187157, -0.020921001210808754, -0.02708100900053978, 0.08082851767539978, -0.024801230058073997, -0.030949724838137627, 0.015992993488907814, 0.01653415337204933, -0.007495636586099863, 0.019642943516373634, 0.0015918152639642358, -0.0032009014394134283, -0.025561155751347542, 
0.01517549715936184, 0.014001065865159035, 0.006280905567109585, -0.020955542102456093, 0.025307847186923027, 0.03834173455834389, -0.02678164467215538, -0.09312550723552704, 0.031847819685935974, 0.002774882363155484, -0.052273716777563095, -0.060978326946496964, -0.01055836956948042, 0.06802491843700409, 0.01758193038403988, ...]"
2,113228,Grumpier Old Men (1995),"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.",12/22/95,"Comedy, Romance",101.0,6.6,92.0,0,0.0,en,False,"Warner Bros., Lancaster Gate","https://images-na.ssl-images-amazon.com/images/M/MV5BMjQxM2YyNjMtZjUxYy00OGYyLTg0MmQtNGE2YzNjYmUyZTY1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg","The title of this film is 'Grumpier Old Men (1995)'. The brief summary of this film is 'A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.'. The release date of this film is '12/22/95'. The genre(s) of this film is/are 'Comedy, Romance'. The runtime or length of this film is 101.0 minute(s). This film has a rating of 6.6 out of 10. The budget of this film is 0 and it made a revenue of 0.0. The language of this film is 'en'. 
The production company of this film is 'Warner Bros., Lancaster Gate'.","[-0.03661668300628662, 0.10619544982910156, -0.026808220893144608, -0.0644623339176178, -0.058803606778383255, -0.022245872765779495, -0.0037341706920415163, -0.0013093175366520882, -0.020135637372732162, -0.04465679079294205, 0.07205446064472198, 0.00037688008160330355, -0.03784273937344551, 0.03885659575462341, -0.012484567239880562, 0.041237976402044296, 0.005977030843496323, 0.0034600759390741587, -0.022564174607396126, 0.04152091220021248, 0.06606563925743103, 0.017188385128974915, -0.013262642547488213, -0.0580962672829628, 0.010056030005216599, 0.020253527909517288, -0.05455956235527992, 0.054842498153448105, 0.06413224339485168, 0.030651438981294632, 0.007751277647912502, -0.008723871782422066, 0.06361352652311325, 0.010645480826497078, 0.033032819628715515, -0.011329243890941143, 0.03725328668951988, -0.0554555281996727, 0.0007567073917016387, -0.0121544748544693, -0.05003258213400841, 0.00033635535510256886, -0.0007187615265138447, -0.04531697556376457, 0.02134990692138672, -0.00798705779016018, 0.024026013910770416, 0.05399369075894356, 0.036852460354566574, -0.00798705779016018, -0.03208969905972481, 0.001080905320122838, -0.03140593692660332, 0.042416878044605255, 0.004361935891211033, -0.03678172826766968, 0.007710015866905451, 0.006531114690005779, 0.06050122529268265, -0.023872755467891693, 0.0145240668207407, 0.007391712628304958, -0.01934577338397503, -0.007550864480435848, -0.006442696787416935, -0.01707049459218979, -0.00012396885722409934, -0.023188993334770203, 0.00024185901565942913, -0.018048983067274094, -0.01638673059642315, -0.007427079603075981, 0.03475401550531387, -0.018379075452685356, -0.013038651086390018, 0.03819641098380089, -0.036310166120529175, -0.030769329518079758, 0.019463663920760155, 0.038502924144268036, -0.01234309934079647, 0.008099053055047989, -0.009661098010838032, 0.02053646557033062, 0.020453941076993942, 0.011411766521632671, 
0.021573897451162338, -0.0027144206687808037, -0.04458605498075485, 0.003837324446067214, 0.04390229284763336, -0.002586215268820524, -0.04081357270479202, 0.00505454046651721, 0.0580962672829628, -0.08700293302536011, -0.052343226969242096, -0.006619532126933336, -0.0011494290083646774, 0.006578270345926285, ...]"
3,114885,Waiting to Exhale (1995),"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.",12/22/95,"Comedy, Drama, Romance",127.0,5.7,34.0,16000000,81452156.0,en,False,Twentieth Century Fox Film Corporation,"https://images-na.ssl-images-amazon.com/images/M/MV5BMTczMTMyMTgyM15BMl5BanBnXkFtZTcwOTc4OTQyMQ@@._V1_UY268_CR4,0,182,268_AL_.jpg","The title of this film is 'Waiting to Exhale (1995)'. The brief summary of this film is 'Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.'. The release date of this film is '12/22/95'. The genre(s) of this film is/are 'Comedy, Drama, Romance'. The runtime or length of this film is 127.0 minute(s). This film has a rating of 5.7 out of 10. The budget of this film is 16000000 and it made a revenue of 81452156.0. The language of this film is 'en'. 
The production company of this film is 'Twentieth Century Fox Film Corporation'.","[-0.02201184816658497, 0.06293337047100067, -0.03050266206264496, -0.04172487556934357, -0.05220552906394005, 0.008639125153422356, 0.01572098396718502, 0.029291454702615738, -0.021777022629976273, 0.024619653820991516, 0.03759688138961792, 0.018205195665359497, -0.03408684954047203, -0.012383982539176941, -0.012612628750503063, 0.045284342020750046, 0.03257901966571808, 0.0225185789167881, 0.013310926966369152, 0.024310670793056488, 0.025534238666296005, 0.027734188362956047, -0.01093794871121645, 0.014954709447920322, 0.009028442203998566, 0.013187333941459656, -0.03819012641906738, 0.015535594895482063, 0.0270667877048254, 0.031862180680036545, -0.01741420291364193, -0.015337846241891384, 0.10431218892335892, 0.01147557608783245, 0.04605061560869217, -0.011234570294618607, 0.001507057691924274, -0.025200538337230682, -0.026448823511600494, -0.03996985778212547, -0.020083803683519363, -0.020738843828439713, 0.027313971891999245, 0.005972614046186209, 0.03556996211409569, 0.010443577542901039, 0.015387283638119698, 0.016141198575496674, 0.0052217887714505196, 0.029612796381115913, -0.034803684800863266, 0.013632267713546753, -0.040785569697618484, 0.05492456629872322, 0.017859136685729027, -0.025707269087433815, -0.02199948951601982, 0.024570215493440628, 0.040736131370067596, -0.008775076828897, 0.0010235015070065856, -0.011580630205571651, -0.022666890174150467, 0.0623895637691021, -0.003920976538211107, -0.01347159780561924, -0.007254887837916613, -0.035199183970689774, -0.016511976718902588, -0.0017951830523088574, 0.013039023615419865, 0.027215098962187767, 0.00603132089599967, 0.03374078869819641, -0.010975026525557041, 0.03344416618347168, 0.008132395334541798, 0.010208752006292343, -0.0030959956347942352, -0.04209565371274948, 0.0032659354619681835, -0.003129983553662896, 0.016165917739272118, 0.06293337047100067, -0.013249130919575691, 0.0047274185344576836, 
0.005839752033352852, 0.014398542232811451, -0.04726182296872139, -0.02723981812596321, -0.02040514349937439, 0.0011154235107824206, -0.03435875475406647, 0.014658086933195591, 0.024248875677585602, -0.016536694020032883, -0.00781723391264677, -0.021900614723563194, 0.017167016863822937, 0.011766019277274609, ...]"
4,113041,Father of the Bride Part II (1995),"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.",2/10/95,"Comedy, Family, Romance",106.0,5.9,173.0,0,76578911.0,en,False,"Sandollar Productions, Touchstone Pictures","https://images-na.ssl-images-amazon.com/images/M/MV5BOTEyNzg5NjYtNDU4OS00MWYxLWJhMTItYWU4NTkyNDBmM2Y0XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg","The title of this film is 'Father of the Bride Part II (1995)'. The brief summary of this film is 'Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.'. The release date of this film is '2/10/95'. The genre(s) of this film is/are 'Comedy, Family, Romance'. The runtime or length of this film is 106.0 minute(s). This film has a rating of 5.9 out of 10. The budget of this film is 0 and it made a revenue of 76578911.0. The language of this film is 'en'. 
The production company of this film is 'Sandollar Productions, Touchstone Pictures'.","[0.011255434714257717, 0.05743606761097908, -0.0025608798023313284, -0.0325169563293457, -0.07923463732004166, 0.012369106523692608, 0.030888846144080162, 0.03765002265572548, -0.04676290974020958, -0.023901544511318207, 0.027112537994980812, 0.023584969341754913, -0.020249607041478157, -0.003555835457518697, -0.029125062748789787, 0.029125062748789787, -0.00857018493115902, 0.0047967033460736275, 0.015817532315850258, 0.04052182659506798, 0.04165245592594147, 0.0016252824570983648, 0.012866583652794361, -0.07132022082805634, 0.015817532315850258, -0.0275874026119709, -0.02086014673113823, -0.013567575253546238, 0.04178813099861145, -0.0026965555734932423, 0.007236040197312832, 0.011204555630683899, 0.03986606001853943, 0.020950596779584885, 0.01172464620321989, -0.01642807200551033, -0.0014224754413589835, -0.055988859385252, 0.00794833805412054, 0.017536090686917305, 0.007807008922100067, -0.015387891791760921, 0.01876848004758358, -0.05553660914301872, 0.01690293848514557, 0.05101408436894417, 0.008575838059186935, 0.030594881623983383, -0.019412938505411148, 0.017547396942973137, 0.004434901289641857, 0.018124019727110863, -0.010724037885665894, 0.04597146809101105, -0.04043137654662132, -0.002303661312907934, -0.01882501132786274, 0.01893807388842106, 0.029577314853668213, -0.01316054817289114, 0.023539742454886436, -0.00988737028092146, -0.01055444311350584, -0.020464425906538963, -0.027474340051412582, -0.02435379847884178, -0.023121409118175507, 0.027745692059397697, -0.01316054817289114, -0.0013270784402266145, -0.00920899212360382, 0.04628804326057434, -0.012109060771763325, -0.010328317061066628, -0.0005363431992009282, 0.0413811057806015, -0.03134109824895859, -0.03898416832089424, 0.015331360511481762, 0.007886153645813465, 0.017411721870303154, 0.0015221124049276114, -0.040453989058732986, 0.04671768471598625, -0.011447641998529434, 0.025529654696583748, 
0.04992867633700371, 0.03832840174436569, -0.07195337116718292, 0.0013087056577205658, -0.0007504565292038023, -0.026366321370005608, -0.0023587795440107584, 0.005511827301234007, 0.0014980864943936467, -0.026999475434422493, -0.018010957166552544, -0.0015885370085015893, 0.0022457162849605083, 0.014630368910729885, ...]"


In [7]:
# Save the DataFrame, including the 'metadata' and 'metadata_vector' columns, as CSV (optional).
# index=False keeps the pandas index out of the file.
df.to_csv('./datasets/movies_embedding.csv', index=False)

In [8]:
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record
from sentence_transformers import SentenceTransformer

# Initialize the Qdrant client in local in-memory mode
qdrant_client = QdrantClient(':memory:')

# Set the collection name and size
collection_name = 'movies'
# Positional access, so the lookup does not depend on a row labelled 0 existing
vector_size = len(df['metadata_vector'].iloc[0])

# Create a collection with a single named vector, 'metadata', compared by cosine distance
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config={
        'metadata': rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
    }
)

# Add vectors to the collection: one point per row, keyed by the DataFrame index
qdrant_client.upsert(
    collection_name=collection_name,
    points=[
        rest.PointStruct(
            id=k,
            vector={
                'metadata': v['metadata_vector'],
            },
            # The embedding is already stored as the named vector; leaving it out of
            # the payload avoids holding every vector twice.
            payload=v.drop('metadata_vector').to_dict(),
        )
        for k, v in df.iterrows()
    ],
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [9]:
# Sanity check: list the collections known to the client, then count the points in the collection
print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

collections=[CollectionDescription(name='movies')]


CountResult(count=40179)

In [10]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name, top_k=5, model='text-embedding-3-small'):
    """Embed ``query`` and return the closest points from a Qdrant collection.

    Args:
        query: Text to search for.
        collection_name: Qdrant collection to search.
        vector_name: Name of the named vector to compare against.
        top_k: Maximum number of results to return.
        model: Embedding model for the query. It must be the same model that
            produced the vectors stored in the collection.

    Returns:
        The result of ``qdrant_client.search`` for the embedded query.
    """
    # Use the notebook's configured `client` (as get_embedding does) rather than the
    # module-level `openai` default client, so both code paths share one configuration.
    completion = client.embeddings.create(input=query, model=model)
    embedded_query = completion.data[0].embedding

    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=(vector_name, embedded_query),
        limit=top_k,
    )

In [16]:
# Run a sample semantic search, keep the hits, and print title, poster link and score by rank
query_results = query_qdrant(
    'scary movies with monsters released after the year 2000',
    collection_name=collection_name,
    vector_name='metadata',
)

for position, hit in enumerate(query_results, start=1):
    print(f"{position}. {hit.payload['title']} {hit.payload['poster_link']} (Score: {round(hit.score, 3)})")

1. Monster (2008) https://images-na.ssl-images-amazon.com/images/M/MV5BMTQ2NjIxNTg5N15BMl5BanBnXkFtZTcwNzE0NzAzOA@@._V1_UY268_CR1,0,182,268_AL_.jpg (Score: 0.596)
2. Monster House (2006) https://images-na.ssl-images-amazon.com/images/M/MV5BMTIzNjE1NDg1N15BMl5BanBnXkFtZTcwOTg2NTMzMQ@@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.571)
3. Scary Movie (2000) https://images-na.ssl-images-amazon.com/images/M/MV5BMGEzZjdjMGQtZmYzZC00N2I4LThiY2QtNWY5ZmQ3M2ExZmM4XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.562)
4. Boogeyman (2005) https://images-na.ssl-images-amazon.com/images/M/MV5BZWE1MDkyNzQtMTNjMC00OTU4LWE5NTctM2M0N2Y1OTE5NDAyXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.544)
5. How to Make a Monster (2001) https://images-na.ssl-images-amazon.com/images/M/MV5BMTIyODQyMDgzN15BMl5BanBnXkFtZTcwNDI5NTkyMQ@@._V1_UY268_CR2,0,182,268_AL_.jpg (Score: 0.542)


In [17]:
# Format the response as JSON
import json
from datetime import datetime
import locale

# Assume the year is in the 2000s if it is between 00 and 24; otherwise, assume the 1900s
def convert_date_format(date_str, pivot=24):
    """Convert an 'M/D/YY' date string to zero-padded 'MM/DD/YYYY'.

    Two-digit years from 0 through ``pivot`` are read as 2000-based and all
    other two-digit years as 1900-based. Years of 100 or more are taken as
    already complete and kept unchanged. A missing value (anything that is not
    a non-blank string) yields an empty string.

    Args:
        date_str: Date in month/day/year order separated by '/'.
        pivot: Largest two-digit year that is placed in the 2000s.

    Returns:
        The date formatted as 'MM/DD/YYYY', or '' when ``date_str`` is missing.

    Raises:
        ValueError: If the string is non-blank but not a valid M/D/Y date.
    """
    if not isinstance(date_str, str) or not date_str.strip():
        return ''

    month, day, year = map(int, date_str.split('/'))

    # Only two-digit years need a century added; a four-digit year is already complete.
    if 0 <= year <= 99:
        year += 2000 if year <= pivot else 1900

    return datetime(year, month, day).strftime('%m/%d/%Y')

def string_to_array(str):
    """Split a comma-separated string into a list of trimmed items.

    Blank items are dropped, so an empty string gives ``[]`` rather than
    ``['']``. The parameter name shadows the built-in ``str`` inside this
    function; it is kept so existing callers are unaffected.

    Args:
        str: Comma-separated text, e.g. 'Comedy, Drama'.

    Returns:
        List of the non-empty items with surrounding whitespace removed.
    """
    return [item.strip() for item in str.split(',') if item.strip()]

def format_time(minutes_float):
    """Render a duration given in minutes as 'Xh Ym', or 'Ym' when there are no whole hours.

    The input is first truncated to a whole number of minutes with ``int``.
    """
    hours, minutes = divmod(int(minutes_float), 60)
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"

def format_as_dollars(number):
    """Format a number as a US-dollar amount such as '$30,000,000.00'.

    The value is truncated to an integer with ``int`` and rendered with comma
    thousands separators and two decimal places. Plain string formatting is
    used instead of the ``locale`` module, so the result does not depend on
    the machine's locale and the process-wide locale is not modified.

    Args:
        number: Amount to format; anything accepted by ``int``.

    Returns:
        The formatted amount, with a leading '-' for negative values.
    """
    amount = int(number)
    sign = '-' if amount < 0 else ''
    return f"{sign}${abs(amount):,.2f}"

def get_country_name(abbreviation):
    """Look up the English name for a two-letter code.

    Despite the function's name, the table below maps codes to language names.
    Codes that are not in the table give an empty string.
    """
    names_by_code = dict([
        ('en', 'English'),
        ('fr', 'French'),
        ('zh', 'Chinese'),
        ('it', 'Italian'),
        ('fa', 'Persian'),
        ('nl', 'Dutch'),
        ('de', 'German'),
        ('cn', 'Chinese'),
        ('ar', 'Arabic'),
        ('es', 'Spanish'),
        ('ru', 'Russian'),
        ('sv', 'Swedish'),
        ('ja', 'Japanese'),
        ('ko', 'Korean'),
        ('sr', 'Serbian'),
        ('bn', 'Bengali'),
        ('he', 'Hebrew'),
        ('pt', 'Portuguese'),
        ('wo', 'Wolof'),
        ('ro', 'Romanian'),
        ('hu', 'Hungarian'),
        ('cy', 'Welsh'),
        ('vi', 'Vietnamese'),
    ])
    if abbreviation in names_by_code:
        return names_by_code[abbreviation]
    return ''

# Function to search for similar vectors
def search_movies_in_qdrant(query):
    """Search the movie collection for ``query`` and return display-ready records.

    Runs ``query_qdrant`` against the 'metadata' vector of ``collection_name``
    and converts each hit's payload into a dict. The date, runtime, budget,
    revenue and language go through the formatting helpers, genres and
    production companies become lists, and ``rank`` is the zero-based position
    of the hit in the result list.
    """
    def to_record(rank, hit):
        # Shape one search hit into the output record.
        return {
            "rank": rank,
            "title": hit.payload["title"],
            "summary": hit.payload["summary"],
            "date": convert_date_format(hit.payload["date"]),
            "genres": string_to_array(hit.payload["genres"]),
            "runtime": format_time(hit.payload["runtime"]),
            "rating": hit.payload["rating"],
            "votes": int(hit.payload["votes"]),
            "budget": format_as_dollars(hit.payload["budget"]),
            "revenue": format_as_dollars(hit.payload["revenue"]),
            "language": get_country_name(hit.payload["language"]),
            "adult": hit.payload["adult"],
            "production": string_to_array(hit.payload["production"]),
            "poster_link": hit.payload["poster_link"]
        }

    hits = query_qdrant(query, collection_name, 'metadata')
    return [to_record(rank, hit) for rank, hit in enumerate(hits)]

In [18]:
# Run an example search and pretty-print the structured results as JSON
query = 'scary movies about monsters after year 2003'
response = search_movies_in_qdrant(query)

json_string = json.dumps(response, indent=2)
print(json_string)

[
  {
    "rank": 0,
    "title": "Monster (2008)",
    "summary": "Two women, aspiring documentary filmmakers, find themselves trapped in a monster-plagued Toyko in 2003.",
    "date": "01/18/2008",
    "genres": [
      "Action",
      "Horror",
      "Thriller"
    ],
    "runtime": "1h 26m",
    "rating": 2.1,
    "votes": 7,
    "budget": "$0.00",
    "revenue": "$0.00",
    "language": "English",
    "adult": false,
    "production": [
      "Asylum",
      "The"
    ],
    "poster_link": "https://images-na.ssl-images-amazon.com/images/M/MV5BMTQ2NjIxNTg5N15BMl5BanBnXkFtZTcwNzE0NzAzOA@@._V1_UY268_CR1,0,182,268_AL_.jpg"
  },
  {
    "rank": 1,
    "title": "Monster House (2006)",
    "summary": "Monsters under the bed are scary enough, but what happens when an entire house is out to get you? Three teens aim to find out when they go up against a decrepit neighboring home and unlock its frightening secrets.",
    "date": "07/21/2006",
    "genres": [
      "Animation",
      "Adventu