In [None]:
!pip install azure-identity  
!pip install kaggle  
!pip install python-dotenv  
!pip install rich  
!pip install azure-search-documents --pre


: 

In [None]:
# Generate embeddings  
from tqdm import tqdm  
from tenacity import retry, stop_after_attempt, wait_exponential  

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def get_embeddings(openai_client, texts):
    """Return one embedding vector per input text.

    The call is retried up to 5 times with exponential back-off (4-60 s)
    by the ``@retry`` decorator.

    Args:
        openai_client: Client exposing ``embeddings.create(input=..., model=...)``.
        texts: The texts to embed in a single request.

    Returns:
        A list of embeddings (each a list of floats), in the order the
        service returned them.
    """
    # Nothing to embed: skip the service call (and the retry/back-off loop
    # that a rejected empty request would trigger).
    if not texts:
        return []

    response = openai_client.embeddings.create(
        input=texts,
        # Deployment name is read from the environment on every call.
        model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME"),
    )
    # Read the vectors straight off the typed response instead of dumping the
    # whole payload to indented JSON and parsing it back.
    return [item.embedding for item in response.data]

def add_embeddings_to_df(df, text_column, vector_column, batch_size=1000, client=None):
    """Embed ``df[text_column]`` in batches and store the vectors in ``df[vector_column]``.

    Note: ``df`` is modified in place (the new column is assigned on it) and
    the same object is returned.

    Args:
        df: DataFrame holding the texts to embed.
        text_column: Name of the column containing the texts.
        vector_column: Name of the column that receives the embeddings.
        batch_size: Number of texts sent per embedding request.
        client: Embedding client passed to ``get_embeddings``. When omitted,
            the notebook-level ``openai_client`` is used, as before.

    Returns:
        ``df`` with ``vector_column`` added.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if client is None:
        # Fall back to the client created elsewhere in the notebook.
        client = openai_client

    texts = df[text_column]
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        # .iloc makes the positional slicing explicit, independent of the index labels.
        batch_texts = texts.iloc[start:start + batch_size].tolist()
        embeddings.extend(get_embeddings(client, batch_texts))
    df[vector_column] = embeddings
    return df

# Embed a copy so the source frame `df` keeps its `text_to_vectorize` column;
# this keeps the cell re-runnable (an in-place drop on the shared frame would
# make a second run fail with a KeyError).
df_vectors = add_embeddings_to_df(df.copy(), "text_to_vectorize", "vector")
# Drop the text_to_vectorize column since we have the vector field
df_vectors = df_vectors.drop(columns=["text_to_vectorize"])


In [None]:
# Build the client used to manage index definitions on the search service.
# The endpoint comes from the environment; authentication uses DefaultAzureCredential.
search_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
search_credential = DefaultAzureCredential()
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

# Define the fields of the index schema.
fields = [
    # Document key.
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="link", type=SearchFieldDataType.String),
    SearchableField(name="headline", type=SearchFieldDataType.String),
    # Searchable text that can also be filtered and faceted; the scoring
    # profiles defined for this index reference this field.
    SearchableField(
        name="category",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="short_description", type=SearchFieldDataType.String),
    SearchableField(name="authors", type=SearchFieldDataType.String),
    SearchField(
        name="date",
        type=SearchFieldDataType.DateTimeOffset,
        filterable=True,
        sortable=True,
    ),
    SimpleField(
        name="view_count",
        type=SearchFieldDataType.Int32,
        filterable=True,
        sortable=True,
    ),
    # Embedding field. `searchable=True` is set explicitly so the field is
    # usable in vector queries rather than relying on a service-side default.
    # NOTE(review): 3072 dimensions must match the output size of the deployed
    # embedding model -- confirm against the deployment.
    SearchField(
        name="vector",
        type="Collection(Edm.Single)",
        searchable=True,
        vector_search_dimensions=3072,
        vector_search_profile_name="my-vector-config",
    ),
]

# Vector search configuration: one HNSW algorithm (cosine metric), one Azure
# OpenAI vectorizer, and a profile that ties the two together by name.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="my-hnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(metric=VectorSearchAlgorithmMetric.COSINE),
)

# Endpoint and deployment name are read from the environment.
openai_vectorizer_parameters = AzureOpenAIParameters(
    resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),
    deployment_id=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME"),
    model_name=AzureOpenAIModelName.TEXT_EMBEDDING3_LARGE,
)
openai_vectorizer = AzureOpenAIVectorizer(
    name="my-vectorizer",
    azure_open_ai_parameters=openai_vectorizer_parameters,
)

# The profile name is what the index's vector field refers to.
vector_profile = VectorSearchProfile(
    name="my-vector-config",
    algorithm_configuration_name="my-hnsw",
    vectorizer="my-vectorizer",
)

vector_search = VectorSearch(
    profiles=[vector_profile],
    algorithms=[hnsw_algorithm],
    vectorizers=[openai_vectorizer],
)

# Scoring profiles, each built separately and then collected in a list.

# Weights text matches in the `category` field by 10.
boost_category_profile = ScoringProfile(
    name="boostCategory",
    text_weights=TextWeights(weights={"category": 10.0}),
)

# Freshness function on `date` with a boosting duration of P1095D (1095 days).
boost_recency_profile = ScoringProfile(
    name="boostRecency",
    functions=[
        FreshnessScoringFunction(
            field_name="date",
            boost=10.0,
            parameters=FreshnessScoringParameters(boosting_duration="P1095D"),
            interpolation=ScoringFunctionInterpolation.LINEAR,
        )
    ],
)

# Tag function on `category`; the tags are supplied through the "tags" parameter.
boost_by_tag_profile = ScoringProfile(
    name="boostByTag",
    functions=[
        TagScoringFunction(
            field_name="category",
            boost=10.0,
            parameters=TagScoringParameters(tags_parameter="tags"),
        )
    ],
)

# Magnitude function on `view_count` over the range 0-10000.
boost_view_count_profile = ScoringProfile(
    name="boostViewCount",
    functions=[
        MagnitudeScoringFunction(
            field_name="view_count",
            boost=10.0,
            parameters=MagnitudeScoringParameters(
                boosting_range_start=0,
                boosting_range_end=10000,
            ),
            interpolation=ScoringFunctionInterpolation.LINEAR,
        )
    ],
)

scoring_profiles = [
    boost_category_profile,
    boost_recency_profile,
    boost_by_tag_profile,
    boost_view_count_profile,
]

# Assemble the index definition from the fields, vector search configuration
# and scoring profiles.
index = SearchIndex(
    name="news-category",
    fields=fields,
    vector_search=vector_search,
    scoring_profiles=scoring_profiles,
)

# Upsert the definition on the service (creates the index or updates an
# existing one with the same name) and report the resulting index name.
result = index_client.create_or_update_index(index=index)
print(f"{result.name} created")
