# Load Data

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Read the merged raw Google reviews export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
raw_data_path = r'C:\Users\Wenxia\Desktop\Python\Topic_Modeling_Google_Review\merged_raw_data.csv'
reviews = pd.read_csv(raw_data_path)
print(f'Data shape: {reviews.shape}')
reviews.head()

Data shape: (1865, 5)


Unnamed: 0.1,Unnamed: 0,Review Rate,Review Time,Review Text,Location
0,0,5 stars,2 months ago,Best burgers in town focused on meat quality. ...,Eaux Vives
1,1,5 stars,3 weeks ago,Best burgers in Geneva. The vegetarian option ...,Eaux Vives
2,2,5 stars,2 years ago,The burgers are very good(much better than tho...,Eaux Vives
3,3,5 stars,2 years ago,Inglewood is considered by many to have the be...,Eaux Vives
4,4,5 stars,a year ago,"very yummy burger!! we had cain cain, pasadena...",Eaux Vives


In [7]:
# Keep only the review text column, as a plain Python list (one entry per row).
review_text_column = reviews['Review Text']
all_reviews = review_text_column.tolist()

In [8]:
# Tool (function-calling) schema passed to the chat model.
# The model is expected to answer by "calling" `extracts_intents` with a JSON
# argument string shaped like {"intents": [ {...}, ... ]}.
tools = [
    {
        "type": "function",
        "function": {
            "name": "extracts_intents",
            # A description helps the model decide when/how to call the tool.
            "description": (
                "Record every distinct intent found in a customer review, "
                "with its summary, sentiment and any named entities."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "intents": {
                        "type": "array",
                        "description": "List of intents identified from the customer review",
                        "items": {
                            "type": "object",
                            "properties": {
                                "intent": {
                                    "type": "string",
                                    "description": "Description of the identified intent"
                                },
                                "text_summary": {
                                    "type": "string",
                                    "description": "Summary of the intent"
                                },
                                "sentiment": {
                                    "type": "string",
                                    "enum": ["Positive", "Negative", "Neutral"],
                                    "description": "Sentiment of the intent"
                                },
                                "named_entities": {
                                    "type": "array",
                                    "items": {
                                        "type": "string",
                                        "description": "Named entities in the text, if any, like 'Cain Cain' or 'Pasadena'"
                                    }
                                }
                            },
                            "required": ["intent", "text_summary", "sentiment"]
                        }
                    }
                },
                # Without this, an empty argument object `{}` satisfies the
                # schema and the downstream parser finds no "intents" key.
                "required": ["intents"]
            }
        }
    }
]

In [9]:
# Prompt skeleton for one review. It is a plain (non-f) string: the three
# `{...}` fields below are placeholders that are filled in later by the
# prompt-formatting step, not interpolated here.
prompt_template = """
Follow the instructions below and extract structured insights from a customer review of a burger restaurant. **DO NOT include any additional notes, explanations, or unnecessary text in the output. ONLY return a valid JSON string.**

### **Instructions:**
- **Identify each distinct intent** in the review as `"intent"`. A single review may contain multiple different aspects of the customer's experience (e.g., burger quality, service, pricing, cleanliness).
- **Summarize the key part of the review** that relates to each intent as `"text_summary"`.
- **Classify the sentiment** (`"Positive"`, `"Negative"`, or `"Neutral"`) for each intent as `"sentiment"`.
- **Extract the `"named_entities"`**, specifically **burger names from the restaurant menu**. Only include burger names if they are explicitly mentioned in the review. Otherwise, return an empty list (`[]`).
- **Use the following predefined list of burger names** when extracting `"named_entities"`:

  **Burger Names:**  
  - CLASSIQUE "B"  
  - CAIN CAIN  
  - GROS CAIN CAIN  
  - PASADENA  
  - POINCOMNOU  
  - DZODZET  
  - MEXICANO "B"  
  - OURANOS  
  - VALAISAN  
  - ARNOLD  
  - HONEYGOAT  
  - GORGODZILLA  
  - MEXICANO "P"  
  - AIE AIE AIE  
  - CALIVOCAT  
  - BUFFALO  
  - VIOLETTE  
  - LIL WOOD  

- **Return a list of extracted intents in a JSON format.** The JSON string must be **COMPLETE** and correctly structured.

---

### **Output Format**
{format_instructions}

---

### **Example Review (Google Review):**
"I tried the MEXICANO 'B' and it was amazing! The flavors were perfect, and the purée of avocados added a great touch. However, the fries were cold and soggy. The service was friendly but a bit slow."

---

### **Example Output (JSON Format):**
{example_output}

---

### **Review to Analyze:**
{review}

"""

# Skeleton of the expected answer: an "intents" list holding two blank entries
# that show the keys every intent object carries.
format_instructions = {
    "intents": [
        {
            "intent": "",
            "text_summary": "",
            "sentiment": "",
            "named_entities": [""],
        }
        for _ in range(2)
    ]
}

# Worked answer for the example review embedded in the prompt: one entry per
# intent, given as (intent, text_summary, sentiment, named_entities).
example_output = {
    "intents": [
        dict(zip(("intent", "text_summary", "sentiment", "named_entities"), row))
        for row in (
            (
                "Burger Quality",
                "The MEXICANO 'B' was amazing! The flavors were perfect, and the purée of avocados added a great touch.",
                "Positive",
                ["MEXICANO 'B'"],
            ),
            (
                "Fries Quality",
                "The fries were cold and soggy.",
                "Negative",
                [],
            ),
            (
                "Customer Service",
                "The service was friendly but a bit slow.",
                "Neutral",
                [],
            ),
        )
    ]
}


In [4]:
%%capture
#!pip install langchain

In [10]:
from langchain.prompts import PromptTemplate

import json

# Build the reusable prompt. `review` is the only field supplied per call;
# the output skeleton and the worked example are bound once as partials.
prompt = PromptTemplate(
    # Was misspelled `input_variabels`, so the declaration never reached the
    # real `input_variables` field.
    input_variables=['review'],
    template=prompt_template,
    partial_variables={
        # Serialize to real JSON: the prompt asks the model for a valid JSON
        # string, and interpolating the dicts directly would show it Python
        # repr (single-quoted keys) instead.
        'format_instructions': json.dumps(format_instructions, indent=2, ensure_ascii=False),
        'example_output': json.dumps(example_output, indent=2, ensure_ascii=False)
    }
)


# Batch Inference with Tool Calling 

In [None]:
%%capture
# !pip install tenacity

In [11]:
from openai import OpenAI

In [None]:
import json
import concurrent.futures
from openai import OpenAI, RateLimitError
from tenacity import (retry, stop_after_attempt, wait_random_exponential, retry_if_exception)
from tqdm.notebook import tqdm
from typing import List

import os

# OpenAI client used by every request in this notebook.
# The key is read from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in (or committed with) the notebook. An empty-string
# key would be sent as-is and rejected by the API.
client = OpenAI(
    base_url="https://api.openai.com/v1",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

@retry(
    wait=wait_random_exponential(min=1, max=30),
    stop=stop_after_attempt(3),
    # `retry_if_exception` takes a predicate that receives the raised
    # exception. Passing the RateLimitError class itself would *construct* an
    # exception instead of testing for one, so check the type explicitly.
    retry=retry_if_exception(lambda exc: isinstance(exc, RateLimitError)),
)
def call_chat_model(
    prompt: str, review: str, temperature: float=0.1, max_tokens: int=500, **kwargs
):
    """Send one review to the chat model and return its answer.

    Args:
        prompt: Prompt object exposing ``.format(review=...)``; it is rendered
            with ``review`` to produce the user message.
        review: Text of the customer review to analyse.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        **kwargs: Extra arguments merged into the request (e.g. ``tools``);
            they override the defaults built here on key collision.

    Returns:
        The tool-call argument string when the model made exactly one tool
        call, a list of argument strings when it made several, otherwise the
        plain message content.

    Rate-limit errors are retried (up to 3 attempts, random exponential
    back-off between 1 and 30 seconds).
    """
    chat_args = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "system",
                "content": 'You are a helpful analyst to do analysis on Google Map reviews for Inglewood, a burger restaurant in Geneva, Switzerland. You help anayze customer reviews and extrarct insights.'
            },
            {
                "role": "user",
                "content": prompt.format(review=review)
            }
        ],
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    # Caller-supplied options win over the defaults above.
    chat_args.update(kwargs)

    chat_completion = client.chat.completions.create(**chat_args)

    response = chat_completion.choices[0].message
    if response.tool_calls:
        call_args = [c.function.arguments for c in response.tool_calls]
        # Unwrap the common single-call case so callers get a bare string.
        if len(call_args) == 1:
            return call_args[0]
        return call_args
    return response.content

def call_in_parallel(func, prompts: List[str], max_workers: int = 5) -> List:
    """Apply ``func`` to every item of ``prompts`` using a thread pool.

    Args:
        func: Callable taking a single item of ``prompts``.
        prompts: Inputs to process.
        max_workers: Size of the thread pool (defaults to the previous
            hard-coded value of 5); tune it to the API rate limit.

    Returns:
        The results as a list, in the same order as ``prompts``. A progress
        bar is shown while the results are collected.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(tqdm(executor.map(func, prompts), total=len(prompts)))



In [37]:
def extract_batch(inp: str):
    """Run the intent-extraction prompt on one review text with the tool schema attached."""
    request = {"prompt": prompt, "review": inp, "tools": tools}
    return call_chat_model(**request)

def results_to_dataframe(reviews: List[str], responses: List[str]):
    """Pair each review with its model response in a two-column DataFrame.

    The columns are named "Review" and "Model response", in that order.
    """
    column_names = ("Review", "Model response")
    columns = dict(zip(column_names, (reviews, responses)))
    return pd.DataFrame(columns)
  
# Send every review to the model (one request per review, run in parallel),
# then show reviews and raw responses side by side.
results = call_in_parallel(func=extract_batch, prompts=all_reviews)
results_to_dataframe(reviews=all_reviews, responses=results)

  0%|          | 0/1865 [00:00<?, ?it/s]

Unnamed: 0,Review,Model response
0,Best burgers in town focused on meat quality. ...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
1,Best burgers in Geneva. The vegetarian option ...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
2,The burgers are very good(much better than tho...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
3,Inglewood is considered by many to have the be...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
4,"very yummy burger!! we had cain cain, pasadena...","{""intents"":[{""intent"":""Burger Quality"",""text_s..."
...,...,...
1860,"Bon hamburger, les bières par contre 3dl suff...","{""intents"":[{""intent"":""Burger Quality"",""text_s..."
1861,"A small, lively and young place. The center of...","{""intents"":[{""intent"":""Ambiance"",""text_summary..."
1862,Concernant les points positifs:\n-D.E.L.I.C.I....,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
1863,"serveurs super sympas et au taquet, déco très ...","{""intents"":[{""intent"":""Customer Service"",""text..."


In [None]:
# Persist the review/response pairs so the parsing section below can start
# from the file instead of re-querying the model.
df = results_to_dataframe(all_reviews, results)
df.to_csv(path_or_buf="reivew_model_response.csv", index=False, encoding="utf-8")

# Parse LLM output

In [None]:
%%capture
#!pip install openai tenacity tqdm

In [None]:
import pandas as pd
import json

In [None]:
# Read back the saved review / model-response pairs and preview the first rows.
responses_path = "reivew_model_response.csv"
reviews = pd.read_csv(responses_path)
reviews.head()

Unnamed: 0,Review,Model response
0,Best burgers in town focused on meat quality. ...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
1,Best burgers in Geneva. The vegetarian option ...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
2,The burgers are very good(much better than tho...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
3,Inglewood is considered by many to have the be...,"{""intents"":[{""intent"":""Burger Quality"",""text_s..."
4,"very yummy burger!! we had cain cain, pasadena...","{""intents"":[{""intent"":""Burger Quality"",""text_s..."


In [None]:
# Select the column holding the raw model responses (one JSON string per review).
results = reviews.loc[:, "Model response"]

In [None]:
# Parse the llm output: one record per review, keyed by its row position.
parsed_data = []

for index, item in enumerate(results):
    try:
        payload = json.loads(item)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the cell is not valid JSON.
        # TypeError: the cell is not text at all; an empty CSV cell is read
        # back by pandas as a float NaN, which json.loads refuses.
        payload = None

    # Valid JSON that is not an object (e.g. a bare list or string) cannot be
    # unpacked into the record, so treat it like an unusable response.
    if not isinstance(payload, dict):
        payload = {}

    # Default to an empty intents list; a parsed "intents" key overrides it.
    parsed_data.append({'id': index, 'intents': [], **payload})

In [None]:
# Sanity check: show the parsed record of the first review.
print(parsed_data[0])

{'id': 0, 'intents': [{'intent': 'Burger Quality', 'text_summary': 'Best burgers in town focused on meat quality.', 'sentiment': 'Positive', 'named_entities': []}, {'intent': 'Comparison with Other Burger Places', 'text_summary': 'A lot of other burger places are trying to cover poor meat quality by adding spices and non natural flavor enhancers.', 'sentiment': 'Negative', 'named_entities': []}, {'intent': 'Recommendation', 'text_summary': 'Simply put, if you like good meat in your burger go try this place out.', 'sentiment': 'Positive', 'named_entities': []}]}


In [None]:
# Turn the parsed records into a DataFrame, then give every intent of a review
# its own row (the review's `id` is repeated on each of its rows).
exploded_df = pd.DataFrame(data=parsed_data).explode(column="intents")
exploded_df.head()

Unnamed: 0,id,intents
0,0,"{'intent': 'Burger Quality', 'text_summary': '..."
0,0,{'intent': 'Comparison with Other Burger Place...
0,0,"{'intent': 'Recommendation', 'text_summary': '..."
1,1,"{'intent': 'Burger Quality', 'text_summary': '..."
1,1,"{'intent': 'Vegetarian Option', 'text_summary'..."


In [None]:
# Keep only the rows whose 'intents' cell is an intent dict.
is_intent_dict = [isinstance(cell, dict) for cell in exploded_df['intents']]
exploded_df = exploded_df[is_intent_dict]

# Spread each intent dict into its own columns, then attach (by position) the
# originating review id and the untouched intent dict.
results_df = pd.json_normalize(exploded_df['intents']).assign(
    id=exploded_df['id'].values,
    llm_response=exploded_df['intents'].values,
)

In [None]:
# Write the flattened intents to disk first, then end the cell with `head()`:
# only a cell's last expression is rendered, so the preview was previously
# computed and thrown away.
results_df.to_csv("reviews.csv")
results_df.head()