In [None]:
import pandas as pd
import os
import openai
from openai import OpenAI
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine
import nltk
# Fetch the NLTK 'punkt' and 'stopwords' resources.
# NOTE(review): no visible cell uses NLTK after these downloads — confirm they are still needed.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Working directory for the relative input/output files used by later cells,
# taken from the FILE_PATH environment variable.
path = os.getenv('FILE_PATH')
if path:
    print(path)
    os.chdir(path)  # Change to your own path
else:
    # os.getenv returns None when FILE_PATH is unset, and os.chdir(None) fails
    # with an opaque TypeError. Stay in the current directory and say so.
    path = os.getcwd()
    print(f"FILE_PATH is not set; using current directory: {path}")

In [None]:
# NOTE(review): `none` is not referenced in any visible cell — confirm this import is intentional.
from sympy.codegen.ast import none

# Read the spreadsheet into a DataFrame and attach a NaN-filled
# 'Semantic Similarity Score' column.
df = pd.read_excel('response_test.xlsx').assign(
    **{'Semantic Similarity Score': float('nan')}
)

In [None]:

# Read the OpenAI API key from the environment. Never paste a key into the
# notebook (even inside a comment) and never print it: both end up in the
# saved file and its rendered outputs.
# NOTE(review): a literal key used to be present in this cell — treat it as
# compromised and revoke it.
openai.api_key = os.getenv('OPENAI_API_KEY')
print("OPENAI_API_KEY is set" if openai.api_key else "OPENAI_API_KEY is NOT set")

# Initialize the OpenAI client
client = OpenAI(api_key=openai.api_key)

In [None]:
# Load the model and tokenizer
# Both are loaded by name through `from_pretrained`; later cells pass `tokenizer`
# and `model` into compute_cosine_similarity.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
def analyze_alignment(capability, criteria, text, llm_client=None):
    """Ask the chat model how strongly `text` aligns with `criteria`.

    Args:
        capability: Label of the row being analysed. Accepted for call-site
            compatibility; it is not included in the prompt.
        criteria: Criteria statement inserted into the prompt.
        text: Response text inserted into the prompt.
        llm_client: Optional object exposing ``chat.completions.create``.
            When omitted, the notebook-level ``client`` is used.

    Returns:
        The model's reply, stripped of surrounding whitespace and lower-cased.
        An empty string is returned when the reply carries no text content.
    """
    api_client = client if llm_client is None else llm_client

    # Create a prompt for the model
    prompt1 = f"""You are trained to analyze and determine the alignment strength between the given criteria and text. If you are unsure of an answer, you can say "not sure" and recommend the user review manually. Analyze the following criteria and text pair and determine the alignment strength: Criteria: {criteria} Text: {text}"""

    # Call the OpenAI API to generate a response
    response = api_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a good assistant"},
            {"role": "user", "content": prompt1},
        ],
        max_tokens=100,
        temperature=0
    )

    # The message content is optional in the API response; calling .strip()
    # on None would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    return (content or "").strip().lower()


In [None]:
def get_embedding(text, tokenizer, model):
    """Return an attention-mask-weighted mean of the model's token states.

    Args:
        text: Input passed to `tokenizer` (with padding, truncation and
            max_length=512, returning PyTorch tensors).
        tokenizer: Callable producing a mapping that contains
            'attention_mask' and can be splatted into `model`.
        model: Callable whose result exposes `last_hidden_state`.

    Returns:
        Tensor holding the masked mean over dimension 1 of
        `last_hidden_state` (assumed to be (batch, tokens, hidden)).
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no gradients are needed
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state

    # Broadcast the mask over the hidden dimension so padded positions contribute zero
    weights = encoded['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
    weighted_total = (token_states * weights).sum(dim=1)

    # Clamp the token count so an all-zero mask cannot divide by zero
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_count

In [None]:
def compute_cosine_similarity(criteria, text, tokenizer, model):
    """Return the cosine similarity between the embeddings of two texts.

    Each input is embedded with get_embedding, converted to a flat NumPy
    vector, and compared as 1 minus SciPy's cosine distance.
    """
    criteria_vec, text_vec = (
        get_embedding(passage, tokenizer, model).numpy().flatten()
        for passage in (criteria, text)
    )
    return 1 - cosine(criteria_vec, text_vec)

In [None]:
def summarize_paragraph(single_summary, llm_client=None):
    """Summarise the concatenated assessments into one short paragraph.

    Args:
        single_summary: Text of all assessments joined together.
        llm_client: Optional object exposing ``completions.create``. When
            omitted, the module-level ``openai`` API is used.

    Returns:
        The stripped completion text, or an empty string when there is
        nothing to summarise (no request is sent in that case).
    """
    # Nothing to summarise: skip the request instead of asking the model to
    # summarise an empty set of assessments.
    if not single_summary or not single_summary.strip():
        return ""

    api = openai if llm_client is None else llm_client

    # Use the completion endpoint to summarize the paragraph.
    # The former `user="role" "Expert summarizer"` argument was dropped: the
    # adjacent literals concatenate to "roleExpert summarizer", and `user` is
    # an end-user identifier, not a role for the model.
    response = api.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=f"You are an expert summarizer. Your task is to read the following assessments;{single_summary}, which contains a varied set of analyses, and summarize the main points in a short narrative paragraph. Focus on capturing the essence of the analyses, highlighting key findings, and presenting them in a clear, concise manner. Please ensure the summary is informative, easy to understand and writen in a positive tone.",
        max_tokens=1000,
        temperature=.2,
    )

    # Extract and return the summarized text from the response
    return response.choices[0].text.strip()

In [None]:
def _cell_to_text(value):
    """Return a spreadsheet cell as text; missing cells (NaN/None) become ''."""
    return "" if pd.isna(value) else str(value)


# Iterate over each Criteria in the DataFrame and perform analysis
results = []
similarity_scores = []
for _, row in df.iterrows():
    capability = row["Capability"]
    # Empty cells arrive as NaN and numeric cells as numbers; normalise both to
    # text so the prompt does not contain "nan" and the tokenizer is given a string.
    criteria = _cell_to_text(row['Criteria'])
    text = _cell_to_text(row['Text'])
    strength = analyze_alignment(capability, criteria, text)

    # semantic similarity - correlation analysis in textual data.
    similarity = round(compute_cosine_similarity(criteria, text, tokenizer, model), 2)
    # Append the similarity score to the similarity_scores list
    similarity_scores.append(similarity)

    results.append([capability, criteria, text, strength, similarity])
    

In [None]:
# Write Domain Summary
# Coerce each row's alignment text (index 3) to str in place and collect it
result_join = []
for result in results:
    result[3] = str(result[3])
    result_join.append(result[3])

# One space-separated string of all alignment texts, summarised in a single request
single_summary = ' '.join(result_join)
assessment = summarize_paragraph(single_summary)

# Mean of the per-row similarity scores, rounded to 2 places; 0 when there are none
if similarity_scores:
    average_similarity = round(sum(similarity_scores) / len(similarity_scores), 2)
else:
    average_similarity = round(0, 2)

In [None]:
# Print output - Use for testing
#print(results)
# Show the narrative assessment followed by the averaged similarity score.
print (f'{assessment}, \n"Overall Similarity Score" {average_similarity}')

In [None]:
# Per-row results -> alignment_results.csv (no index column)
output_df = pd.DataFrame(
    results,
    columns=[
        'Capability',
        'Criteria',
        'Response',
        'Alignment Strength',
        'Semantic Similarity Score',
    ],
)
output_df.to_csv('alignment_results.csv', index=False)

# One-row domain summary -> domain_summary.csv (no index column)
domain_summary = pd.DataFrame(
    {
        "Assessment": [assessment],
        "Score": [average_similarity],
    }
)
domain_summary.to_csv('domain_summary.csv', index=False)
