### Import modules

In [2]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers
import boto3
import json
import pandas as pd

### Initialize and configure Boto Client for Bedrock

In [3]:
# Credentials come from a named local AWS profile; the same session is reused
# later to sign the OpenSearch requests.
session = boto3.Session(profile_name="rapid-innovation-dev")

# Bedrock runtime client used to invoke the embedding model.
bedrock = session.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
    # Explicit endpoint override left disabled:
    # endpoint_url='https://bedrock.us-east-1.amazonaws.com'
)

### Initialize and configure OpenSearch client

In [4]:
# The collection endpoint is read from a local file so it is not hard-coded here.
with open('host.txt', 'r') as fp:
    # Strip surrounding whitespace: a trailing newline in host.txt would
    # otherwise become part of the hostname passed to the OpenSearch client.
    host = fp.read().strip()
region = "us-east-1"
service = "aoss"  # service name handed to the SigV4 signer
# Reuse the credentials of the boto3 session to sign OpenSearch requests.
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

In [5]:
# OpenSearch client talking HTTPS (port 443) to the host read above, with
# every request signed through `auth`.
client = OpenSearch(
    hosts=[{"host": host, "port": 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

### Process the dataset

In [6]:
# Load the Oscars dataset from the local data directory.
df = pd.read_csv('./data/oscars.csv')

In [7]:
# Keep only rows of the 2023 ceremony that have a film, and lower-case the
# category names. Written as a single chain so the column is set on a new
# frame via `assign` instead of on a filtered slice, which can trigger
# pandas' SettingWithCopyWarning.
df = (
    df.loc[df['year_ceremony'] == 2023]
    .dropna(subset=['film'])
    .assign(category=lambda frame: frame['category'].str.lower())
)

In [8]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False


### Concatenate columns to create a new text column

In [9]:
# Sentence stem shared by every row, whether or not the nominee won
nominee_stem = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film']

# Use the winning wording by default; rows where 'winner' is False get the
# losing wording instead
df['text'] = (nominee_stem + ' to win the award').mask(
    df['winner'] == False,
    nominee_stem + ' but did not win',
)

In [10]:
# Preview the frame again to check the newly added 'text' column.
df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ..."
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a..."


### Generate embeddings for the text column from Titan

In [11]:
def text_embedding(text, model_id='amazon.titan-embed-text-v1'):
    """Return the embedding Bedrock produces for ``text``.

    Sends ``text`` as the ``inputText`` field of a JSON request through the
    notebook-level ``bedrock`` client and returns the ``embedding`` field of
    the JSON response.

    Args:
        text: The string to embed.
        model_id: Identifier of the Bedrock model to invoke. Defaults to the
            model this notebook originally hard-coded. Whatever model is
            chosen must accept an ``inputText`` request and answer with an
            ``embedding`` field, because that is all this function sends
            and reads.

    Returns:
        The value stored under ``embedding`` in the response body, or
        ``None`` when the response has no such key.
    """
    request_body = json.dumps({"inputText": text})
    response = bedrock.invoke_model(
        body=request_body,
        modelId=model_id,
        accept='*/*',
        contentType='application/json',
    )
    # The body must be read() before it can be JSON-decoded.
    response_body = json.loads(response.get('body').read())
    return response_body.get('embedding')

In [12]:
# Embed every sentence in the 'text' column; text_embedding is called once per row.
df = df.assign(embedding=df["text"].apply(text_embedding))

In [14]:
# Set pandas' row display limit to 50, then show the frame with its new
# 'embedding' column.
pd.set_option('display.max_rows', 50)
df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text,embedding
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...,"[-0.109375, -0.16601562, 0.25390625, -0.621093..."
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...,"[-0.46484375, -0.055664062, 0.23828125, -0.761..."
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...,"[0.26757812, -0.56640625, 0.22265625, -0.40429..."
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ...","[-0.43359375, 0.032714844, 0.28125, -0.5390625..."
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a...","[-0.43554688, -0.21289062, 0.18359375, -0.4335..."
...,...,...,...,...,...,...,...,...,...
10755,2022,2023,95,writing (original screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,True,Written by Daniel Kwan & Daniel Scheinert got ...,"[0.048583984, 0.038085938, 0.46484375, -0.5429..."
10756,2022,2023,95,writing (original screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,False,Written by Steven Spielberg & Tony Kushner got...,"[-0.16503906, 0.029663086, 0.24414062, -0.7695..."
10757,2022,2023,95,writing (original screenplay),Written by Todd Field,Tár,False,Written by Todd Field got nominated under the ...,"[-0.087890625, -0.114746094, 0.31835938, -0.66..."
10758,2022,2023,95,writing (original screenplay),Written by Ruben Östlund,Triangle of Sadness,False,Written by Ruben Östlund got nominated under t...,"[0.3046875, 0.07861328, 0.42773438, -0.4140625..."


### Ingest the text and embeddings into AOSS

In [16]:
def add_document(vector, text, index_name='oscars-index'):
    """Index one nominee document and print the service response.

    Uses the notebook-level ``client`` to index a document made of the
    embedding and its source sentence.

    Args:
        vector: Embedding stored under the ``nominee_vector`` field.
        text: Sentence stored under the ``nominee_text`` field.
        index_name: Index that receives the document. Defaults to the index
            this notebook originally hard-coded.

    Returns:
        None. The response of the index call is printed, not returned.
    """
    document = {
        "nominee_vector": vector,
        "nominee_text": text,
    }
    response = client.index(index=index_name, body=document)
    print('\nAdding document:')
    print(response)

In [17]:
# Index every row, pairing each embedding with its sentence.
# A plain loop is used because add_document is called only for its side
# effect; DataFrame.apply(axis=1) would collect the None return values into
# a Series and display it as the cell result.
for embedding, text in zip(df['embedding'], df['text']):
    add_document(embedding, text)


Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AnsOmJZABw5U0TIJo96gt', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3Aj-GmJZABlACFB3AV-QGh', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3An8OmJZABw5U0TIJo_Kha', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AkOGmJZABlACFB3AV_gFM', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'failed': 0}, '_seq_no': 0, '_primary_term': 0}

Adding document:
{'_index': 'oscars-index', '_id': '1%3A0%3AoMOnJZABw5U0TIJoAKhR', '_version': 1, 'result': 'created', '_shards': {'total': 0, 'successful': 0, 'fa

10639    None
10640    None
10641    None
10642    None
10643    None
         ... 
10755    None
10756    None
10757    None
10758    None
10764    None
Length: 121, dtype: object

### Perform semantic search

In [18]:
def search_index(vector, k=15, index_name="oscars-index"):
    """Run a k-NN query against the ``nominee_vector`` field.

    Uses the notebook-level ``client``. The stored vectors are excluded from
    the returned ``_source`` so each hit carries only its remaining fields.

    Args:
        vector: Query embedding to compare against ``nominee_vector``.
        k: Number of neighbours requested; also used as the result ``size``.
            Defaults to 15, the value this notebook originally hard-coded.
        index_name: Index to search. Defaults to the index this notebook
            originally hard-coded.

    Returns:
        The response of ``client.search`` for the query.
    """
    search_body = {
        "size": k,
        "_source": {"excludes": ["nominee_vector"]},
        "query": {
            "knn": {
                "nominee_vector": {
                    "vector": vector,
                    "k": k,
                }
            }
        },
    }
    return client.search(body=search_body, index=index_name)

In [19]:
# Natural-language question to look up among the indexed sentences.
query = 'who won the award for best music?'
# Embed the question with the same function that embedded the 'text' column.
vector = text_embedding(query)

In [20]:
response = search_index(vector)
# Keep only the list of matching hits from the search response.
data = response['hits']['hits']

In [21]:
# Show the hits returned for the query.
data

[{'_index': 'oscars-index',
  '_id': '1%3A0%3Ar-GnJZABlACFB3AVfQEh',
  '_score': 0.7270102,
  '_source': {'nominee_text': 'Volker Bertelmann got nominated under the category, music (original score), for the film All Quiet on the Western Front to win the award'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3Av8OnJZABw5U0TIJof6gP',
  '_score': 0.7232194,
  '_source': {'nominee_text': 'Justin Hurwitz got nominated under the category, music (original score), for the film Babylon but did not win'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3As-GnJZABlACFB3AVkQGl',
  '_score': 0.7190479,
  '_source': {'nominee_text': 'Music by M.M. Keeravaani; Lyric by Chandrabose got nominated under the category, music (original song), for the film RRR to win the award'}},
 {'_index': 'oscars-index',
  '_id': '1%3A0%3AwsOnJZABw5U0TIJoj6jV',
  '_score': 0.7148727,
  '_source': {'nominee_text': 'Music by Tems, Rihanna, Ryan Coogler and Ludwig Goransson; Lyric by Tems and Ryan Coogler got nominated under t