In [69]:
!pip install openai_async -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [152]:
import pandas as pd
import numpy as np
import ast
import os
import re
import openai
from typing import List, Set, Tuple

import aiohttp 
import asyncio
import openai_async
from openai.error import RateLimitError, InvalidRequestError

In [18]:
# openai.api_key = ... 

### I'll use the fine-tuned openai `ada` model to extract topics from the GCNs. But firstly I will measure the model's metrics on the labelled Atel data (a fraction of the training data).

In [19]:
atel_df = pd.read_csv("atel_with_topics.csv", index_col=0)
atel_df['topics'] = atel_df['topics'].apply(lambda x: x.split(','))

In [63]:
def use_topics_model(text):
    
    PROMPT = f"""
    {text}
    \n\n###\n\n
    """
    
    resp = openai.Completion.create(
        model="ada:ft-pai-2023-05-16-20-18-46",
        prompt=PROMPT)
    
    return [x.strip() for x in resp['choices'][0]['text'].replace("###", ',').split(',') if x.strip()]


def calculate_precision_recall_f1(true: List[Set[str]], pred: List[Set[str]]):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for y_true, y_pred in zip(true, pred):
        true_positives += len(y_true.intersection(y_pred))
        false_positives += len(y_pred - y_true)
        false_negatives += len(y_true - y_pred)

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1

In [269]:
sample_atel_df = atel_df.sample(frac=0.3)
len(sample_atel_df)

4750

In [23]:
sample_atel_df['predicted_topics'] = sample_atel_df.body_clean.apply(use_topics_model)

In [51]:
# clean both topics

In [39]:
sample_atel_df['predicted_topics'] = sample_atel_df['predicted_topics'].apply(lambda x: list(set(x)))\
    .apply(lambda x: [y.replace('\n', '').strip() for y in x if y!='#' and len(y.split(' '))<=3])

In [42]:
sample_atel_df['topics'] = sample_atel_df['topics'].apply(lambda x: [y.replace('\n', '').strip() for y in x])

In [52]:
sample_atel_df.head()

Unnamed: 0_level_0,topics,body_clean,predicted_topics
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1781_atel,"[x-ray, gamma ray, request for observations, t...",the ibis isgri instrument aboard integral disc...,"[binary, black hole, x-ray, cataclysmic, gamma..."
3493_atel,"[x-ray, gamma ray, neutron star, soft gamma-ra...",using 1046 xrt photon counting mode data 1 uvo...,"[methods, transient, x-ray, optical]"
14725_atel,"[x-ray, binary, black hole, transient]",the nicer telescope followed recently reported...,"[transient, gev, tev, black hole, vhe]"
4176_atel,"[x-ray, gamma ray, binary, transient]",swift observed new outburst sfxt ax j1841.0-05...,"[transient, x-ray, binary, gamma ray]"
2256_atel,"[optical, gamma ray, gev, agn, quasar]",the large area telescope lat one two instrumen...,"[quasar, gev, blazar, request for observations..."


In [298]:
topics = sample_atel_df['topics'].apply(set).values.tolist()
pred_topics = sample_atel_df['predicted_topics'].apply(set).values.tolist()

KeyError: 'predicted_topics'

In [57]:
 # it is bearable I'd say. Consider that the training costed only 9$ and this is the cheapest openai's model
calculate_precision_recall_f1(topics, pred_topics) 

(0.6310580935059378, 0.6783783783783783, 0.6538632080700587)

### Get them topics for all GCNs 
Make async calls so speed things up

In [256]:
clean_bodies = pd.read_csv("../word2vec/clean_bodies.csv", index_col=0)

In [257]:
gcn_df = clean_bodies[clean_bodies.index.str.endswith('gcn')]

In [258]:
gcn_bodies = gcn_df['body_clean'].values

In [243]:
SEMAPHORE_LIMIT = 50
TIMEOUT = 120
LLM_MODEL = 'ada:ft-pai-2023-05-16-20-18-46'


async def gather_with_concurrency(n: int, *tasks):
    semaphore = asyncio.Semaphore(n)

    async def sem_task(task):
        async with semaphore:
            return await task

    return await asyncio.gather(*(sem_task(task) for task in tasks))


async def call_one(text: str, i: int) -> str:
    print(f'Calling for doc #{i}')

    await asyncio.sleep(0.5)
    prompt = f"""
    {text}
    \n\n###\n\n
    """
    payload = {"model": LLM_MODEL, "prompt": prompt}

    response = await openai_async.complete(openai.api_key, payload=payload, timeout=TIMEOUT)

    try:
        return [x.strip() for x in response.json()['choices'][0]['text'].replace("###", ',').split(',') if x.strip()]      
        
    except:
        if response.json()['error']['type'] == "invalid_request_error":
            print(f'Token limit for doc #{i} exceeded, reducing the prompt size.')
            text = text[:int(len(text)*0.9)]
            return await call_one(text, i)
        
        elif response.json()['error']['type'] == "rate_limit_error":
            print(f'Request limit exceeded, sleeping and waiting to re-call for doc #{i}')
            await asyncio.sleep(5)
            return await call_one(text, i)
        
        else:
            return []
    
async def call_all(texts: List[str]): 
    tasks = []
    for i, text in enumerate(texts):
          tasks.append(call_one(text, i))
    return await gather_with_concurrency(SEMAPHORE_LIMIT, *tasks)

In [140]:
all_topics = []

texts = gcn_bodies

topics = await call_all(texts)
all_topics.extend(topics)

Calling for doc #0
Calling for doc #1
Calling for doc #2
Calling for doc #3
Calling for doc #4
Calling for doc #5
Calling for doc #6
Calling for doc #7
Calling for doc #8
Calling for doc #9
Calling for doc #10
Calling for doc #11
Calling for doc #12
Calling for doc #13
Calling for doc #14
Calling for doc #15
Calling for doc #16
Calling for doc #17
Calling for doc #18
Calling for doc #19
Calling for doc #20
Calling for doc #21
Calling for doc #22
Calling for doc #23
Calling for doc #24
Calling for doc #25
Calling for doc #26
Calling for doc #27
Calling for doc #28
Calling for doc #29
Calling for doc #30
Calling for doc #31
Calling for doc #32
Calling for doc #33
Calling for doc #34
Calling for doc #35
Calling for doc #36
Calling for doc #37
Calling for doc #38
Calling for doc #39
Calling for doc #40
Calling for doc #41
Calling for doc #42
Calling for doc #43
Calling for doc #44
Calling for doc #45
Calling for doc #46
Calling for doc #47
Calling for doc #48
Calling for doc #49
Calling fo

In [191]:
all_topics[-10:]

[['grb', 'optical', 'transient', 'a comment'],
 ['a comment'],
 ['ARU', 'infra-red', 'optical', 'gamma ray'],
 ['neutrinos', 'agn', 'transient', 'puls'],
 ['optical', 'gamma-ray burst'],
 ['gev', 'gl', 'gamma-ray burst', 'optical', 'transient'],
 ['gr deeds', 'celestial mechanics', 'transient', 'agn', 'black hole'],
 ['gev', 'tev', 'vhe', 'neutrinos', 'transient'],
 ['optical', 'gamma ray', 'gamma-ray burst', 'transient'],
 ['iWitness', 'optical', 'gamma ray', 'gamma-ray burst']]

In [310]:
e = []
for i in all_topics:
    e+=i
    
len(set(e))

10046

In [315]:
len(np.where(np.unique(e, return_counts=True)[1]>2)[0])

766

In [259]:
gcn_df['topics'] = all_topics

In [260]:
gcn_df

Unnamed: 0_level_0,body_clean,topics
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1
31_gcn,s. v. zharikov v. v. sokolov sao ras yu. v. ba...,"[optical, gamma ray, gamma-ray burst, optical]"
32_gcn,comparison optical r-band observations error b...,"[optical, gamma-ray burst, transient, optical]"
33_gcn,grb 980326 optical transient confirmed a. c. e...,[## ro sg i g h ## nova]
34_gcn,bruce grossan robert knop saul perlmutter lawr...,"[optical, gamma-ray burst, black hole, neutron]"
35_gcn,addendum gcn 34 bruce grossan robert knop saul...,"[gev, cv, a comment, nova]"
...,...,...
33636_gcn,we observed field grb 230414b d'ai et al. gcn ...,"[gev, gl, gamma-ray burst, optical, transient]"
33637_gcn,we observed field grb230414b d'ai et al. gcn 3...,"[gr deeds, celestial mechanics, transient, agn..."
33633_gcn,on 2023-04-16 05 22 26.15 ut icecube detected ...,"[gev, tev, vhe, neutrinos, transient]"
33634_gcn,we observed field grb 230414b swift trigger 11...,"[optical, gamma ray, gamma-ray burst, transient]"


### Some topics are trash and some contain weird random values, clean them, and run the predicitons again if no topics left after the cleaning

In [246]:
import warnings
warnings.filterwarnings('ignore')

In [261]:
gcn_df['topics'] = gcn_df['topics'].apply(str).apply(lambda x: re.sub(r'[^\w\s\-,]', ' ', x.lower()))
gcn_df['topics'] = gcn_df['topics'].apply(lambda x: x.split(','))
gcn_df['topics'] = gcn_df['topics'].apply(lambda x: [y.strip() for y in x if y.strip() 
                                                     and len(y.strip().split(' '))<3
                                                     and len(y.strip())>2])

In [262]:
benchmark_topics = np.unique(atel_df.explode('topics').apply(lambda x: x.topics.strip(), axis=1))

In [263]:
gcn_df['topics'] = gcn_df['topics'].apply(lambda x: [y for y in x if y in benchmark_topics])

In [264]:
gcn_df['len_topics'] = gcn_df['topics'].apply(len)

In [266]:
to_rerun = gcn_df[gcn_df.len_topics<2].copy()  # there should be at least 2 topics
to_accept = gcn_df[gcn_df.len_topics>=2].copy() 
len(to_rerun), len(to_accept)

(3465, 28982)

re-run topics extraction for these bodies

In [234]:
benchmark_topics

array(['a comment', 'agn', 'asteroid', 'asteroid  binary', 'binary',
       'black hole', 'blazar', 'cataclysmic variable', 'comet',
       'cosmic rays', 'direct collapse event', 'exoplanet',
       'far-infra-red', 'fast radio burst', 'gamma ray',
       'gamma-ray burst', 'gev', 'globular cluster',
       'gravitational lensing', 'gravitational waves', 'infra-red',
       'magnetar', 'meteor', 'microlensing event', 'millimeter',
       'near-earth object', 'neutrinos', 'neutron star', 'nova',
       'optical', 'planet', 'planet  minor',
       'potentially hazardous asteroid', 'pre-main-sequence star',
       'pulsar', 'quasar', 'radio', 'request for observations',
       'soft gamma-ray repeater', 'solar system object', 'star',
       'sub-millimeter', 'supernova remnant', 'supernovae', 'tev',
       'the sun', 'tidal disruption event', 'transient', 'uhe',
       'ultra-violet', 'variables', 'vhe', 'x-ray',
       'young stellar object'], dtype=object)

In [236]:
good_enough = []

for i, body in enumerate(to_rerun.body_clean):
    
    gen_topics = await call_one(body, i)
    gen_topics = validate_topics(gen_topics)
    
    while not gen_topics:
        gen_topics = await call_one(body, i)
        gen_topics = validate_topics(gen_topics)

    good_enough.append(gen_topics)

Calling for doc #0
Calling for doc #1
Calling for doc #2
Calling for doc #3
Calling for doc #4
Calling for doc #5
Calling for doc #6
Calling for doc #7
Calling for doc #8
Calling for doc #9
Calling for doc #10
Calling for doc #11
Calling for doc #12
Calling for doc #13
Calling for doc #14
Calling for doc #15
Calling for doc #16
Calling for doc #17
Calling for doc #18
Calling for doc #18
Calling for doc #19
Calling for doc #20
Calling for doc #21
Calling for doc #22
Calling for doc #23
Calling for doc #24
Calling for doc #24
Calling for doc #25
Calling for doc #26
Calling for doc #27
Calling for doc #28
Calling for doc #29
Calling for doc #29
Calling for doc #30
Calling for doc #31
Calling for doc #32
Calling for doc #33
Calling for doc #34
Calling for doc #35
Calling for doc #36
Calling for doc #37
Calling for doc #38
Calling for doc #39
Calling for doc #39
Calling for doc #40
Calling for doc #41
Calling for doc #41
Calling for doc #42
Calling for doc #42
Calling for doc #43
Calling fo

In [267]:
to_rerun['topics'] = good_enough

In [268]:
pd.concat([to_accept, to_rerun])[['topics', 'body_clean']].to_csv("gcn_with_topics.csv", index=True)