# Import packages

In [43]:
import pandas as pd
import chromadb
from chromadb.config import Settings

# Read biographies

In [44]:
# Load the player biographies spreadsheet into a DataFrame.
bios_path = 'input/top_players.xlsx'
survivor_bios = pd.read_excel(bios_path)

In [45]:
# Notes on Chroma storage options:
# - duckdb
# - parquet: good compression ratio and high performance for querying and processing data
# - persist_directory: if it is not specified, the database is kept in memory only

# Configure and query ChromaDB database

In [46]:
# Open the on-disk Chroma database stored under input/chromadb.
chroma_client = chromadb.PersistentClient(path="input/chromadb")

collection_name = 'survivor_bios'
# The client is persistent, so the collection is still on disk when the notebook
# is re-run. get_or_create_collection reuses it, whereas create_collection
# raises when the name is already taken.
collection = chroma_client.get_or_create_collection(name=collection_name)

In [47]:
# Store one document per player: the filtered biography text, keyed by the
# player's name. upsert (rather than add) keeps this cell safe to re-run against
# the persisted collection: ids that already exist are updated in place.
bio_documents = survivor_bios['filtered_bio'].tolist()
bio_ids = survivor_bios['name'].tolist()
collection.upsert(documents=bio_documents, ids=bio_ids)

# Test query

In [48]:
# Smoke-test the collection: fetch the 10 nearest documents to a player's name.
query_text = "Parvati Shallow"
results = collection.query(
    query_texts=[query_text],
    n_results=10,
)
print(results)

{'ids': [['Natalie_White', 'Fabio_Birza', 'Tina_Wesson', 'Jenna_Morasca', 'Erika_Casupanan', 'Tony_Vlachos', 'Parvati_Shallow', 'Yam_Yam_Arocho', 'Mike_Holloway', 'Mike_Gabler']], 'distances': [[1.5224612801395585, 1.5799138713568992, 1.605866906931107, 1.6347916171236099, 1.647546450954634, 1.6597007626143494, 1.6639739529698983, 1.6703010205895144, 1.6743874749102954, 1.6757066385902757]], 'metadatas': [[None, None, None, None, None, None, None, None, None, None]], 'embeddings': None, 'documents': [['Natalie White Hometown Van Buren Ark. Occupation Pharmaceutical Sales typical Southern Belle afraid get hands dirty Friendly savvy attributes fellow castaways pick plans bring background pharmaceutical sales game knows make cut more likely going numbers go convinced ambitious personality help competitive scrappy care have dive ground bruised scraped do win love compete waste time changing plan needed observant flexible mention smart know adapt surroundings Currently relationship resides 

# Vector MAP

In [49]:
# Fetch one stored record together with its embedding so the vector can be inspected.
getado = collection.get(
    ids='Ethan_Zohn',
    include=['documents', 'embeddings'],
)
word_list = getado['documents']
word_vectors = getado['embeddings']
# Bare last expression: the notebook displays the embedding values.
word_vectors

[[-0.046629346907138824,
  0.038358721882104874,
  -0.08248274028301239,
  -0.010124368593096733,
  0.013137085363268852,
  0.05812143161892891,
  -0.023272529244422913,
  -0.016385884955525398,
  -0.06434899568557739,
  0.0746590793132782,
  0.012744955718517303,
  -0.07386695593595505,
  0.027741430327296257,
  0.030525261536240578,
  -0.03989875689148903,
  0.008942081592977047,
  0.02123975194990635,
  -0.029048452153801918,
  -0.09488584101200104,
  -0.07396188378334045,
  -0.06910666823387146,
  0.0056307450868189335,
  0.05043661594390869,
  0.00952435564249754,
  0.01418609544634819,
  0.02000022865831852,
  -0.04954618215560913,
  0.05921135097742081,
  -0.014001637697219849,
  -0.011909758672118187,
  -0.015582812018692493,
  0.054726384580135345,
  0.03735533356666565,
  0.06271814554929733,
  0.05539408326148987,
  0.0022176443599164486,
  -0.0037691143807023764,
  0.055820975452661514,
  -0.08392928540706635,
  -0.053851474076509476,
  0.016948534175753593,
  -0.1137126684

In [50]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Checkpoint used for answer generation.
# Alternative checkpoint: "databricks/dolly-v2-3b"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the causal language model and its matching tokenizer from the same checkpoint.
lm_model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create pipeline, which provides a simple interface for performing NLP tasks

In [51]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generation_options = {
    # Limit each response to at most 256 newly generated tokens.
    "max_new_tokens": 256,
    # Request automatic device placement (CPU or GPU).
    # NOTE(review): confirm this has an effect when a pre-loaded model object is passed.
    "device_map": "auto",
}
pipe = pipeline(
    "text-generation",
    model=lm_model,
    tokenizer=tokenizer,  # tokenizes text inputs for the model
    **generation_options,
)

# Create extended prompt

In [58]:
# Look up the stored biography for a single player by id; it becomes the prompt context.
player_id = 'Parvati_Shallow'
results = collection.get(ids=[player_id])
print(results)

{'ids': ['Parvati_Shallow'], 'embeddings': None, 'metadatas': [None], 'documents': ['Cook IslandsMicronesiaHeroes VillainsWinners War Parvati Shallow grew eldest siblings commune Vero Beach Florida family moved Atlanta years old studied martial arts youth works female boxer cocktail waitress worked model bartender public relations Striking own put college attending University Georgia received Bachelor Arts Degree journalism minor French active member Alpha Omicron Pi sorority enjoys boxing running hiking volleyball yoga skiing softball dancing swimming high school swim team loves outdoors gets cranky play sun daily basis proud Perfect Model Boxing resides West Hollywood California birth date September degree minors Italian huge Bulldog fan nature lover heart spent extensive amounts time camping North America state parks including Tetons Yellowstone organized month backpacking trip Europe friends visited countries sleeping hostels train stations beaches park benches favorites include go

In [60]:
question = "Who is Parvati Shallow?"

# `collection.get` returns the matching documents as a flat list of strings.
# Join every returned document so the context still works when more than one id
# is fetched; with a single document the result is that document unchanged.
context_documents = results["documents"]
if not context_documents:
    # Fail loudly rather than prompting the model with an empty context.
    raise ValueError("No documents were retrieved to build the prompt context.")
context = " ".join(context_documents)

# Prompt layout: retrieved context first, then the instruction, the question,
# and an open "Answer:" for the model to complete.
prompt_template = f"""
Relevant context: {context}
Considering the relevant context, answer the question.
Question: {question}
Answer: """
prompt_template

'\nRelevant context: Cook IslandsMicronesiaHeroes VillainsWinners War Parvati Shallow grew eldest siblings commune Vero Beach Florida family moved Atlanta years old studied martial arts youth works female boxer cocktail waitress worked model bartender public relations Striking own put college attending University Georgia received Bachelor Arts Degree journalism minor French active member Alpha Omicron Pi sorority enjoys boxing running hiking volleyball yoga skiing softball dancing swimming high school swim team loves outdoors gets cranky play sun daily basis proud Perfect Model Boxing resides West Hollywood California birth date September degree minors Italian huge Bulldog fan nature lover heart spent extensive amounts time camping North America state parks including Tetons Yellowstone organized month backpacking trip Europe friends visited countries sleeping hostels train stations beaches park benches favorites include good sushi wearing oversized jerseys karaoke cheering Dawgs traini

# Send prompt to model and wait for its response

In [62]:
# Run the prompt through the pipeline and show the first returned generation.
# As the cell output shows, the generated text begins with the prompt itself.
lm_response = pipe(prompt_template)
generated_text = lm_response[0]["generated_text"]
print(generated_text)


Relevant context: Cook IslandsMicronesiaHeroes VillainsWinners War Parvati Shallow grew eldest siblings commune Vero Beach Florida family moved Atlanta years old studied martial arts youth works female boxer cocktail waitress worked model bartender public relations Striking own put college attending University Georgia received Bachelor Arts Degree journalism minor French active member Alpha Omicron Pi sorority enjoys boxing running hiking volleyball yoga skiing softball dancing swimming high school swim team loves outdoors gets cranky play sun daily basis proud Perfect Model Boxing resides West Hollywood California birth date September degree minors Italian huge Bulldog fan nature lover heart spent extensive amounts time camping North America state parks including Tetons Yellowstone organized month backpacking trip Europe friends visited countries sleeping hostels train stations beaches park benches favorites include good sushi wearing oversized jerseys karaoke cheering Dawgs training