# Hail DB Data Filtering

# `.env`

- Before running the notebook, create a `.env` file containing your Azure OpenAI credentials:
```
AZURE_ENDPOINT=...
AZURE_API_KEY=...
AZURE_MODEL=...
```

# Define LLM Inference Code

In [1]:
!pip install openai python-dotenv

Collecting openai
  Downloading openai-1.17.0-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.3/268.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected 

In [2]:
import os
import re
import json
from openai import AzureOpenAI
from typing import Optional

class GPTModel:
  """Wrapper around an Azure OpenAI chat deployment that returns parsed JSON.

  `call` sends a chat request, unwraps an optional Markdown code fence from
  the reply, and parses the remainder with `json.loads`.
  """

  # A Markdown code fence, optionally tagged `json`, anywhere in the reply.
  # Greedy on purpose: capture from the first opening fence to the last
  # closing fence so the whole payload is kept.
  _JSON_FENCE = re.compile(r'```(?:json)?(.*)```', re.DOTALL | re.IGNORECASE)

  def __init__(self, api_key: str, endpoint: str, model: str):
    """Create the Azure OpenAI client.

    Args:
      api_key: Azure OpenAI API key.
      endpoint: Azure OpenAI endpoint URL.
      model: Model/deployment name passed to every chat completion request.
    """
    self.client = AzureOpenAI(
      azure_endpoint=endpoint,
      api_key=api_key,
      api_version="2024-02-01",
    )
    self.model = model

  @staticmethod
  def extract_json_content(s: str) -> str:
    """Return the text inside a ```json ... ``` fence, or `s` if there is none.

    The fence does not have to span the whole string: leading/trailing
    whitespace or explanatory prose around it is ignored, and the `json`
    tag is optional and case-insensitive.
    """
    match = GPTModel._JSON_FENCE.search(s)
    if match:
      return match.group(1).strip()
    # No fence found: assume the reply is already bare JSON.
    return s

  @staticmethod
  def json_to_dict(s: str) -> dict:
    """Parse a JSON document; raises `json.JSONDecodeError` on invalid input."""
    return json.loads(s)

  def call(self, user_msg: str, sys_msg: Optional[str] = None) -> dict:
    """Send one chat request and return the reply parsed as JSON.

    Args:
      user_msg: Content of the user message.
      sys_msg: Optional system message; omitted from the request when falsy.

    Returns:
      The parsed JSON reply.

    Raises:
      ValueError: If the reply has no text content.
      json.JSONDecodeError: If the reply is not valid JSON.
    """
    messages = []
    if sys_msg:
      messages.append({"role": "system", "content": sys_msg})
    messages.append({"role": "user", "content": user_msg})

    response = self.client.chat.completions.create(
      model=self.model,
      messages=messages,
    )

    content = response.choices[0].message.content
    if content is None:
      # Fail with a clear message instead of a TypeError from the regex.
      raise ValueError("Model returned no message content")

    return GPTModel.json_to_dict(GPTModel.extract_json_content(content))

In [3]:
from dotenv import load_dotenv

# Load the variables declared in the .env file into the process environment
# before the Azure settings are read.
load_dotenv()

AZURE_ENDPOINT = os.environ.get('AZURE_ENDPOINT')
AZURE_API_KEY = os.environ.get('AZURE_API_KEY')
AZURE_MODEL = os.environ.get('AZURE_MODEL')

# Single client instance built from the settings read above.
gpt_model = GPTModel(endpoint=AZURE_ENDPOINT, api_key=AZURE_API_KEY, model=AZURE_MODEL)

# Load Dataset

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV is read from there.
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import pandas as pd

# Load the hail database; its free-text 'Notes' column is what gets scanned
# for confidential information.
df_hail_db = pd.read_csv('/content/drive/My Drive/Hackathon_20231007/Hail_db/hail_db_with_size_v20230204.csv')

In [9]:
df_hail_db['Notes']

0       Office inititated call. Reported twoonie size ...
1       Office initiated call. Reported pea sized hail...
2       Canwarn in Sk reported pea hail (but not a lot...
3       Rob MacDonald spotted a tornado on the ground ...
4       Multiple reports of a funnel near Calgary. Tou...
                              ...                        
6995    Call-out to Mantario, SK reported pea size hail. 
6996    Pea sized hail reported between Dalmeny, SK an...
6997    \n123mm hail reported in Markerville, AB. Phot...
6998    \n30mm\nhttps://twitter.com/lightningmanAB/sta...
6999    https://www.facebook.com/AlbertaStorm/photos/a...
Name: Notes, Length: 7000, dtype: object

In [10]:
from pydantic import BaseModel
from typing import List

class Response(BaseModel):
  """JSON shape the LLM is asked to follow when reporting confidential info."""
  # People's names found in a note.
  names: List[str]
  # Email addresses found in a note.
  emails: List[str]
  # Phone numbers found in a note.
  phone_numbers: List[str]

# Sample payload serialized into the system prompt to show the LLM the
# expected JSON format.
response_example = Response(
    names=["John Doe", "Jane Smith"],
    emails=["john.doe@example.com", "jane.smith@example.com"],
    phone_numbers=["123-456-7890", "987-654-3210"]
)

def filter_confidential_info_from_note(gpt_model, df_hail_db, row_index):
  """Ask the LLM to extract names, emails and phone numbers from one note.

  Args:
    gpt_model: Object exposing `call(user_msg, sys_msg)` that returns parsed JSON.
    df_hail_db: DataFrame with a 'Notes' column.
    row_index: Index label of the row whose note is inspected.

  Returns:
    The parsed LLM reply, expected to hold 'names', 'emails' and
    'phone_numbers' lists. For a missing or blank note, a dict with those
    three keys mapped to empty lists is returned without calling the LLM.
  """
  note = df_hail_db.loc[row_index, 'Notes']

  # A missing note is not a string (e.g. NaN for an empty CSV cell) and a blank
  # one has nothing to extract; skip the LLM round-trip instead of sending
  # "nan" or whitespace as the note text.
  if not isinstance(note, str) or not note.strip():
    return {'names': [], 'emails': [], 'phone_numbers': []}

  sys_msg = f"Your response to the user's instruction should follow the following JSON format: {response_example.json()}"

  user_msg = f"""
Instruction:
Given the following note about hail observation below, extract the following confidential information:
- names: list of people's names (e.g. John Doe, Jane Smith)
- emails: list of people's emails (e.g. john.doe@example.com, jane.smith@example.com)
- phone_numbers: list of people's phone numbers (e.g. 123-456-7890, 987-654-3210)
For any of the fields above, if the information cannot be found, return an empty list.

Note:
{note}
"""

  return gpt_model.call(user_msg, sys_msg)

In [21]:
# Rows in range(start_index, end_index) are submitted for processing
# (end_index is exclusive).
start_index = 0
end_index = 7000
# Number of worker threads used for the concurrent LLM calls.
thread_max_workers = 5

In [22]:
def process_note(i, gpt_model, df_hail_db):
    """Extract confidential info for row `i` and never raise.

    Args:
        i: Index label of the row to process.
        gpt_model: Model wrapper forwarded to the extraction call.
        df_hail_db: DataFrame holding the notes.

    Returns:
        On success: {'row_index', 'success': True, 'data': {...}} where
        'data' holds 'row_index', 'names', 'emails' and 'phone_numbers'
        (each a list; missing or null fields become []).
        On failure: {'row_index', 'success': False, 'error': <message>}.
    """
    try:
        res = filter_confidential_info_from_note(
            gpt_model=gpt_model,
            df_hail_db=df_hail_db,
            row_index=i,
        )
        # A reply that is valid JSON but not an object cannot carry the
        # expected fields; report it as a failure so the row can be retried
        # rather than being recorded as "nothing found".
        if not isinstance(res, dict):
            raise TypeError(f"expected a JSON object, got {type(res).__name__}")
    except Exception as e:
        print(f"Error processing row {i}: {e}")
        # Structured failure; 'error' lets the caller report what went wrong.
        return {
            'row_index': i,
            'success': False,
            'error': str(e),
        }

    return {
        'row_index': i,
        'success': True,
        'data': {
            'row_index': i,
            # `or []` also covers a field present but set to JSON null.
            'names': res.get('names') or [],
            'emails': res.get('emails') or [],
            'phone_numbers': res.get('phone_numbers') or [],
        },
    }

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

confidential_infos = []  # 'data' payloads of successful rows, in completion order
missed_index_list = []  # row indices whose processing failed

# Fan the per-row calls out over a thread pool and collect results as they finish.
with ThreadPoolExecutor(max_workers=thread_max_workers) as executor:
    futures = [
        executor.submit(process_note, i, gpt_model, df_hail_db)
        for i in range(start_index, end_index)
    ]

    for future in tqdm(as_completed(futures), total=len(futures), desc="Filtering Confidential Info"):
        result = future.result()
        if result['success']:
            confidential_infos.append(result['data'])
        else:
            missed_index_list.append(result['row_index'])
            # A failure result is not guaranteed to include an 'error' entry;
            # use .get so one failed row cannot abort the run with a KeyError.
            print(f"Error in row {result['row_index']}: {result.get('error', 'unknown error')}")

In [None]:
confidential_infos

[{'row_index': 0, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 4, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 2, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 1, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 3, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 7, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 5, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 9, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 8, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 6, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 10, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 13, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 12,
  'names': ['Amanda Quinney', 'Srgt Clarke'],
  'emails': [],
  'phone_numbers': []},
 {'row_index': 15, 'names': [], 'emails': [], 'phone_numbers': []},
 {'row_index': 17, 'n

In [None]:
missed_index_list

[34, 36, 40, 45]

In [None]:
df_hail_db

Unnamed: 0.1,Unnamed: 0,Key,Province Code,Province,Reference Location,Start Time,End Time,Latitude,Longitude,Event Type,...,Hail Diameter (mm) FROM NOTES,Hail Diameter (mm) CHECKED,Hail Diameter (mm) FROM TYPE,Hail Diameter (mm) FROM CAT,Hail Diameter (mm) MIN FROM CAT,Hail Diameter (mm) MAX FROM CAT,Hail Diameter (mm) DIRECT,Reference Object,Hail Diameter (mm) MERGED,Hail Diameter (mm) FINAL
0,0,1367,AB,,,2006-07-30 23:30:00,2006-07-30 23:35:00,52.93000,-110.600000,,...,28.0,,,,,,,toonie,,28.0
1,1,1373,SK,,,2006-07-31 00:45:00,2006-07-31 01:30:00,52.15000,-106.670000,,...,12.0,,,,,,,pea,,12.0
2,2,1676,SK,,,2006-08-24 03:38:00,2006-08-24 04:08:00,52.33000,-104.500000,,...,12.0,,,,,,,pea,,12.0
3,3,6122,MB,,SW of Ninette,2015-07-12 23:39:00,2015-07-12 23:42:00,49.33600,-99.469000,,...,12.0,,,,,,,pea,,12.0
4,4,6210,AB,,10km NNW of Priddis,2015-07-22 18:53:00,2015-07-22 19:00:00,50.95500,-114.333000,,...,43.0,,,,,,,golf ball,,43.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,781,10922/swe-20220828001700-30181,SK,,"Mantario, SK, Canada/Highway 44, Mantario, Che...",2022-08-28 00:17:00,,51.26175,-109.691750,Hail,...,12.0,,,,,,7.0,pea,7.0,12.0
6996,782,10923/swe-20220828004500-69004,SK,,"Dalmeny, SK, Canada/Dalmeny -- Corman Park No....",2022-08-28 00:45:00,,52.32194,-106.726320,Hail,...,12.0,,,,,,7.0,pea,7.0,12.0
6997,AB,10675/swe-20220801233500-24953,AB,,"Markerville, AB, Canada/Markerville -- 2320, T...",2022-08-01 23:35:00,2022-08-01 23:50:00,52.16977,-114.196725,Hail,...,5.0,123.0,,,,,115.0,measured 123mm,115.0,123.0
6998,AB,10487/swe-20220717011000-25548,AB,,"Didsbury, AB, Canada/GB Fuels, 2001, 15 Avenue...",2022-07-17 01:05:00,2022-07-17 01:15:00,51.62280,-113.950000,Hail,...,,30.0,,,,,28.0,measured 30mm,28.0,30.0


In [None]:
# Start every row at the 'N/A' placeholder; rows with extracted values are
# overwritten afterwards.
df_hail_db['names'] = 'N/A'
df_hail_db['emails'] = 'N/A'
df_hail_db['phone_numbers'] = 'N/A'

In [None]:
# Copy each extraction result onto its source row of df_hail_db, storing every
# list as a comma-separated string ('N/A' when the list is empty).
for confidential_info in confidential_infos:
    row_label = confidential_info['row_index']
    for column in ('names', 'emails', 'phone_numbers'):
        values = confidential_info[column]
        df_hail_db.loc[row_label, column] = ','.join(values) if len(values) > 0 else 'N/A'

In [None]:
def filter_note(row):
  """Return the row's note with extracted confidential values masked.

  Each value listed in the row's 'names', 'emails' and 'phone_numbers' columns
  (comma-separated strings) is replaced in the note by a numbered placeholder:
  `<nameN>`, `<emailN>` or `<phone-numberN>`.

  Args:
    row: Mapping-like row with 'Notes', 'names', 'emails' and 'phone_numbers'.

  Returns:
    The masked note, or '' when the note is not a string.
  """
  note = row['Notes']
  if not isinstance(note, str):
    return ''

  def _values(column):
    # Split a comma-separated column into real values. The 'N/A' sentinel and
    # empty entries are dropped: replacing 'N/A' would mask literal "N/A" text
    # in the note, and replacing '' would insert a placeholder between every
    # character.
    raw = row[column]
    if not isinstance(raw, str):
      return []
    parts = (part.strip() for part in raw.split(','))
    return [part for part in parts if part and part != 'N/A']

  # Same masking order as the stored columns: names, then emails, then phones.
  for column, tag in (('names', 'name'), ('emails', 'email'), ('phone_numbers', 'phone-number')):
    for i, value in enumerate(_values(column), start=1):
      note = note.replace(value, f'<{tag}{i}>')

  return note

In [None]:
# Build the masked copy of every note row by row (axis=1 passes each row to filter_note).
df_hail_db['Filtered Notes'] = df_hail_db.apply(filter_note, axis=1)

In [None]:
# Example
df_hail_db.iloc[11]['Filtered Notes']

'Student on the desk talked to two residents at Fawcett Lake - <name1> (<phone-number1>) and <name2>. <name1> is the manager at the Fawcett Lake residence, he can find numbers of other people on the residence that experienced the storm. They are located at Fawcett Lake East resort. 5 or 6 residents supposedly saw a funnel cloud, no pictures were received of said funnel cloud. Damage reports included multiple fallen trees - mostly uprooted though some were snapped. The snapped trees were healthy poplars and some spruce trees. <name1> noted that the damage appeared to follow a definite path. Other damage reports included sheds destroyed, trailer roofs ripped off, tin roofs carried about a kilometer, trailers moved or tipped, and a shed lofted and carried about 150 feet. One family was inside one of the trailers that was tipped and a child was injured - very minor though, no hospital visit needed. Only pea sized hail was reported with the storm. Trees fell in different directions apparent

In [None]:
from google.colab import files

# Write the augmented table to a CSV file (without the DataFrame index), then
# hand that file to Colab's download helper.
df_hail_db.to_csv('hail_db_with_filtered_notes.csv', index=False)
files.download('hail_db_with_filtered_notes.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>