### topic segregation


In [9]:
%load_ext autoreload
%autoreload 2

import os
# Move the working directory up to the path prefix that precedes the first
# "core" substring, so the repo-relative paths used by later cells
# (e.g. "core/features/topic_segregation/test_data/...") resolve.
# NOTE(review): the split matches any "core" substring in the path, not only a
# directory named "core" -- confirm the checkout path has no earlier match.
curpath = os.getcwd()
os.chdir(curpath.split("core")[0])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from openai import OpenAI
import json
import time
from typing import List, Dict, Any, Tuple, Union

from prompts import SIMPLE_TS_PROMPTS, SIMPLE_TS_WITH_REF_PROMPTS
from core.features.topic_segregation.utils import add_dicts, calculate_cost_gpt4_8k, calculate_cost_gpt4_omni
from core.features.provider import creator, text_creator_defaults

from dotenv import load_dotenv
# Load environment variables from a local .env file before the client is built.
load_dotenv()

# NOTE(review): `client` is not referenced by any cell visible in this notebook;
# completions below go through `creator` -- confirm whether it is still needed.
client = OpenAI()

In [11]:
def get_unique_from_raw_text(raw_text, sep = "\n"):
    """Return the distinct entries of ``raw_text`` in first-seen order.

    Args:
        raw_text: Either a single string, which is split on ``sep`` after
            typographic apostrophes (’) are replaced with ASCII ones, or a
            list of already-split entries, which is used as given.
        sep: Separator used to split a string input; ignored for list input.

    Returns:
        A list containing each distinct entry once, ordered by first occurrence.

    Raises:
        TypeError: If ``raw_text`` is neither a ``str`` nor a ``list``.
    """
    if isinstance(raw_text, str):
        raw_text_list = raw_text.replace("’", "'").split(sep)
    elif isinstance(raw_text, list):
        raw_text_list = raw_text
    else:
        # Any other type used to fall through to an UnboundLocalError.
        raise TypeError(
            f"raw_text must be a str or a list, got {type(raw_text).__name__}"
        )

    # dict.fromkeys de-duplicates while keeping insertion order. set() does not:
    # its ordering of strings changes between interpreter runs, and callers turn
    # list positions into the indices sent to the model, so the order has to be
    # reproducible.
    return list(dict.fromkeys(raw_text_list))

def treat_output_for_json(input):
    """Extract the first JSON object embedded in a model response.

    The response may wrap the object in prose or code fences. Decoding starts
    at the first "{" and stops where that JSON value ends, so nested objects
    and braces inside string values are kept intact.

    Args:
        input: Raw response text. (The name shadows the builtin but is kept so
            existing keyword callers continue to work.)

    Returns:
        The substring of ``input`` holding the first JSON object. If nothing
        decodable starts at the first "{", the legacy slice between the first
        "{" and the following "}" is returned so the caller's ``json.loads``
        reports the problem.

    Raises:
        ValueError: If ``input`` contains no "{" at all.
    """
    start = input.find("{")
    if start == -1:
        raise ValueError("No JSON object found in model output")

    candidate = input[start:]
    try:
        # raw_decode parses one JSON value and reports where it ended,
        # ignoring whatever trailing text follows it.
        _, end = json.JSONDecoder().raw_decode(candidate)
    except ValueError:
        # Best-effort legacy behaviour for output that is not strict JSON.
        return "{" + input.split("{")[1].split("}")[0] + "}"
    return candidate[:end]

def chunk_raw_text_list(raw_text_list, max_len=3000):
    """Group consecutive texts into chunks of at most ``max_len`` characters.

    Texts are never split. A new chunk is started when adding the next text
    would push the running character count of the current chunk above
    ``max_len``. A single text longer than ``max_len`` therefore ends up alone
    in its own chunk. Only ``len(text)`` is counted; any separator added later
    when a chunk is joined is not.

    Args:
        raw_text_list: Iterable of strings, in the order they should appear.
        max_len: Character budget per chunk.

    Returns:
        A list of non-empty lists of strings. Empty input yields ``[]``, and an
        oversized first text no longer produces an empty leading chunk, so
        callers never issue a request for an empty chunk.
    """
    chunks = []
    current = []
    current_len = 0

    for text in raw_text_list:
        text_len = len(text)
        # Flush only a non-empty chunk; this is what prevents the empty leading
        # chunk when the very first text already exceeds the budget.
        if current and current_len + text_len > max_len:
            chunks.append(current)
            current = []
            current_len = 0
        current.append(text)
        current_len += text_len

    if current:
        chunks.append(current)
    return chunks

# provides indexed dict_text and text_dict 
def get_indexed_text_and_dict(text_list):
    """Label each non-empty text with its position in ``text_list``.

    Falsy entries (empty strings, ``None``) are skipped, but they still consume
    a position, so the surviving indices can have gaps.

    Returns:
        A pair ``(indexed_text, text_dict)``: ``indexed_text`` holds
        ``"<index>: <text>"`` strings and ``text_dict`` maps the integer index
        back to the original text.
    """
    text_dict = {pos: entry for pos, entry in enumerate(text_list) if entry}
    indexed_text = [f"{pos}: {entry}" for pos, entry in text_dict.items()]
    return indexed_text, text_dict

In [12]:

def generate_topic_segregation(raw_text = None, n = 1, topics = None, complete_seg = True):
    """Ask the model to sort ``raw_text`` into the given reference ``topics``.

    Builds a system + user conversation from ``SIMPLE_TS_WITH_REF_PROMPTS`` and
    sends it through ``creator`` using ``text_creator_defaults``.

    Args:
        raw_text: Text to segregate, embedded verbatim in the user prompt.
        n: Kept for interface compatibility; the value passed in is ignored
            because the class count is always taken from ``len(topics)``.
        topics: Reference topics to segregate into. Required.
        complete_seg: Passed to the model in the reference block under the key
            "segregate all provided text".

    Returns:
        A dict with "output" (the message content of the first choice) and
        "total_usage" (the response usage converted to a dict).

    Raises:
        ValueError: If ``raw_text`` or ``topics`` is None.
    """
    if raw_text is None:
        raise ValueError("raw_text must be provided")

    # The reference-free mode (SIMPLE_TS_PROMPTS) is disabled in this notebook.
    # Without this guard a missing `topics` left `conversation` and
    # `user_prompt` unbound and the call died with an UnboundLocalError.
    if topics is None:
        raise ValueError("topics must be provided")

    conversation = [{"role": "system", "content": SIMPLE_TS_WITH_REF_PROMPTS}]

    n = len(topics)
    reference = {"number of classes to segregate": n, "topics": topics, "segregate all provided text": complete_seg}

    # The 8-space indentation inside the literal is part of the prompt text
    # sent to the model and is deliberately left unchanged.
    user_prompt = f"""
        RAW_TEXT:

        //raw//

        {raw_text}

        //raw//

        REFERENCE:
        {str(reference)}

        OUTPUT:
        """

    conversation.append({'role': 'user', "content": user_prompt})

    response = creator(**text_creator_defaults, messages = conversation)

    total_usage = dict(response.usage)
    output = response.choices[0].message.content

    return {"output": output, "total_usage": total_usage}


### Topic segregation with reference topics and indexed input

#### test case 1

In [21]:
# Test case 1: load the headlines, one per line. The file is read as bytes and
# decoded explicitly, so every entry keeps its original line terminator.
with open("core/features/topic_segregation/test_data/news1.txt", 'rb') as f:
    _text = f.readlines()

_text = [x.decode('utf-8') for x in _text]
# r_text is only used to report the total character count of the input.
r_text = "".join(_text)

print(_text)
print(len(r_text))

# De-duplicate the headlines. `_text` is already a list, so `sep` is not used
# by get_unique_from_raw_text here.
unique_text_list = get_unique_from_raw_text(_text, sep="\n")
print(len(unique_text_list))

# indexed_text: "<index>: <headline>" strings sent to the model;
# text_dict: index -> headline, used later to map returned indices back to text.
indexed_text, text_dict = get_indexed_text_and_dict(unique_text_list)

# Split the indexed headlines into sub-lists with a 3000-character budget per chunk.
chunked_text_list = chunk_raw_text_list(indexed_text, max_len=3000)

# Report the number of chunks and the number of headlines in each.
print(len(chunked_text_list))
for i in chunked_text_list:
    print(len(i))

4424
50
2
33
17


Pricing assumption used for the estimate below:

- input: 1 rupee per 1,000 characters
- output: 1 rupee per request

In [24]:
# Rough cost estimate for test case 1, in rupees: 4424 input characters at
# 1 rupee per 1000 characters (~4.42) plus 1 rupee for the request's output.
4.42 + 1

5.42

In [23]:

# final batch processing function
def ts_batch_process(topics: List, input_text: List[List], complete_seg = False) -> Tuple[Dict, Dict]:
    """Segregate every chunk in ``input_text`` and merge the per-chunk results.

    Args:
        topics: Topic names; each one becomes a key of the merged result,
            initialised to an empty list.
        input_text: Chunks, each a list of text lines that are joined with
            newlines into a single prompt.
        complete_seg: Forwarded to ``generate_topic_segregation``.

    Returns:
        ``(final_dict, total_tokens)``: the per-topic results and the token
        usage, both accumulated across chunks with ``add_dicts``.

    Raises:
        ValueError: If the model output for a chunk is not valid JSON. The
            underlying decoding error is attached as ``__cause__``.
    """
    final_dict = {topic: [] for topic in topics}
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    for raw_text_list in input_text:
        raw_text = "\n".join(raw_text_list)

        result = generate_topic_segregation(raw_text=raw_text, topics=topics, complete_seg=complete_seg)
        output = result['output']
        tokens = result['total_usage']

        try:
            json_out = json.loads(output)
        except (TypeError, ValueError) as err:
            # Catch decoding failures only (JSONDecodeError is a ValueError;
            # TypeError covers a None output). A bare `except:` would also
            # swallow KeyboardInterrupt and hide the real cause.
            raise ValueError("Output is not a valid JSON") from err

        final_dict = add_dicts(final_dict, json_out)
        total_tokens = add_dicts(total_tokens, tokens)

    return final_dict, total_tokens


# Run test case 1 against three reference topics.
topics = ["Sports", "Defence", "Regional:India"]
res, tokens = ts_batch_process(topics, chunked_text_list, complete_seg=False)

# Aggregated token usage and its estimated cost.
print(tokens)
print(calculate_cost_gpt4_omni(tokens))
# Indices the model assigned to each topic (index strings, per the output below).
for topic in topics:
    print(f"Topic: {topic}")
    print(res[topic])
    print()

# Spot-check the first topic by mapping its indices back to the headlines.
for i in res[topics[0]]:
    print(text_dict[int(i)])
    print()

{'total_tokens': 1928, 'completion_tokens': 136, 'prompt_tokens': 1792}
0.022
Topic: Sports
['7', '15', '33', '34', '39', '43']

Topic: Defence
['1', '2', '3', '8', '11', '18', '19', '21', '24', '25', '26', '28', '29', '30', '32', '38', '40', '41', '44', '48']

Topic: Regional:India
['10', '20', '35', '36', '37', '45', '47', '49']

West Indies T20 World Cup winner announces international retirement - ICC Cricket


ICC World Cup 2023 Points Table: India Dominates, South Africa Chases, and Semi-final Scenarios Unfold | Mint - Mint


Choking New Delhi smog shutters schools and shrouds Cricket World Cup - CNN


‘SHAMEFUL’: Netizens scream at Shakib as Sri Lanka's Angelo Mathews becomes first player to be ‘timed out’ | Mint - Mint


November 6 Morning Brief: Today's Top news and headlines from cricket world - CricTracker


Who can qualify for the last two Cricket World Cup semifinal spots and how? - Al Jazeera English




#### test case 2

In [61]:
# Test case 2: load the headlines, one per line. The file is read as bytes and
# decoded explicitly, so every entry keeps its "\r\n" terminator (see output).
with open("core/features/topic_segregation/test_data/news2.txt", 'rb') as f:
    _text = f.readlines()

_text = [x.decode('utf-8') for x in _text]

print(_text)

# De-duplicate the headlines. `_text` is already a list, so `sep` is not used
# by get_unique_from_raw_text here.
unique_text_list = get_unique_from_raw_text(_text, sep="\n")
print(len(unique_text_list))

# indexed_text: "<index>: <headline>" strings sent to the model;
# text_dict: index -> headline, used later to map returned indices back to text.
indexed_text, text_dict = get_indexed_text_and_dict(unique_text_list)

# Split the indexed headlines into sub-lists with a 3000-character budget per chunk.
chunked_text_list = chunk_raw_text_list(indexed_text, max_len=3000)

# Report the number of chunks and the number of headlines in each.
print(len(chunked_text_list))
for i in chunked_text_list:
    print(len(i))

['Earthquake tremors felt in Delhi-NCR again - IndiaTimes\r\n', "'Yeh planning se hota hai...': Wasim Akram on India's unbeaten run in World Cup - IndiaTimes\r\n", '‘More people are willing to pay more money for Indian artworks’ | Mint - Mint\r\n', 'UK springs EV surprise in FTA talks with India | Mint - Mint\r\n', "India's online gaming industry gains with increased user spending | Mint - Mint\r\n", 'Aakash Chopra reacts as Gurpatwant Pannun warns Air India passengers - Hindustan Times\r\n', "India-Canada row: Diplomatic standoff continues; experts say ‘relationship in deep crisis' | 10 points | Mint - Mint\r\n", "Canada carrying out 'tainted' Nijjar probe: Indian envoy Sanjay Verma - IndiaTimes\r\n", "High-Level Canadian Official Damaged Probe In Hardeep Nijjar's Killing: Indian Envoy - NDTV\r\n", 'Australia orders ex-Indian envoy to pay hefty compensation to former domestic help for alleged exploitation - WION\r\n', 'Australia court asks ex-India envoy to pay ₹74L to former help - I

In [62]:

# final batch processing function
def ts_batch_process(topics: List, input_text: List[List], complete_seg = False) -> Tuple[Dict, Dict]:
    """Segregate every chunk in ``input_text`` and merge the per-chunk results.

    If the model output for a chunk is not valid JSON, the JSON object is
    extracted from the surrounding text with ``treat_output_for_json`` and
    parsed again.

    Args:
        topics: Topic names; each one becomes a key of the merged result,
            initialised to an empty list.
        input_text: Chunks, each a list of text lines that are joined with
            newlines into a single prompt.
        complete_seg: Forwarded to ``generate_topic_segregation``.

    Returns:
        ``(final_dict, total_tokens)``: the per-topic results and the token
        usage, both accumulated across chunks with ``add_dicts``.

    Raises:
        json.JSONDecodeError: If the output is still not valid JSON after the
            repair attempt.
    """
    final_dict = {topic: [] for topic in topics}
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    for raw_text_list in input_text:
        raw_text = "\n".join(raw_text_list)

        result = generate_topic_segregation(raw_text=raw_text, topics=topics, complete_seg=complete_seg)
        output = result['output']
        tokens = result['total_usage']

        try:
            json_out = json.loads(output)
        except (TypeError, ValueError):
            # Catch decoding failures only; a bare `except:` would also swallow
            # KeyboardInterrupt and make a long batch impossible to stop.
            print("GOT ERROR IN JSON LOADING")
            out = treat_output_for_json(output)
            json_out = json.loads(out)

        final_dict = add_dicts(final_dict, json_out)
        total_tokens = add_dicts(total_tokens, tokens)

    return final_dict, total_tokens


# Run test case 2 against four reference topics.
topics = ["Sports", "Defence", "Technology", "Elections"]
res, tokens = ts_batch_process(topics, chunked_text_list, complete_seg=False)

# Aggregated token usage and its estimated cost.
# NOTE(review): this cell prices with calculate_cost_gpt4_8k while the other
# test cases use calculate_cost_gpt4_omni -- confirm which model
# text_creator_defaults selects so the costs are comparable.
print(tokens)
print(calculate_cost_gpt4_8k(tokens))
# Indices the model assigned to each topic (index strings, per the output below).
for topic in topics:
    print(f"Topic: {topic}")
    print(res[topic])
    print()

# Spot-check the first topic by mapping its indices back to the headlines.
for i in res[topics[0]]:
    print(text_dict[int(i)])
    print()

{'completion_tokens': 119, 'total_tokens': 1803, 'prompt_tokens': 1684}
0.057659999999999996
Topic: Sports
['7', '12', '22', '26', '35', '36', '37']

Topic: Defence
['2', '3', '13', '14', '32', '33']

Topic: Technology
['4', '6', '11', '16', '23', '29', '42', '43', '46']

Topic: Elections
['17', '18', '19', '41']

India TV Sports Wrap on November 6: Today's top 10 trending news stories - India TV News


India highest ranked after Germany in women’s Olympic qualifiers - IndiaTimes


Is India vs Pakistan World Cup final match possible? Here’s the roadmap to the dream encounter | Mint - Mint


Aakash Chopra reacts as Gurpatwant Pannun warns Air India passengers - Hindustan Times


Indian sports news wrap, November 6 - Sportstar


'Yeh planning se hota hai...': Wasim Akram on India's unbeaten run in World Cup - IndiaTimes


South Africa ‘nicely cleaned up’ by India: Jonty Rhodes’ hilarious remark after massive defeat | Mint - Mint




#### test case 3

In [63]:
# Test case 3: load the headlines, one per line. The file is read as bytes and
# decoded explicitly, so every entry keeps its original line terminator.
with open("core/features/topic_segregation/test_data/news3.txt", 'rb') as f:
    _text = f.readlines()

_text = [x.decode('utf-8') for x in _text]

# print(_text)

# De-duplicate the headlines. `_text` is already a list, so `sep` is not used
# by get_unique_from_raw_text here.
unique_text_list = get_unique_from_raw_text(_text, sep="\n")
print(len(unique_text_list))

# indexed_text: "<index>: <headline>" strings sent to the model;
# text_dict: index -> headline, used later to map returned indices back to text.
indexed_text, text_dict = get_indexed_text_and_dict(unique_text_list)

# Split the indexed headlines into sub-lists with a 3000-character budget per chunk.
chunked_text_list = chunk_raw_text_list(indexed_text, max_len=3000)

# Report the number of chunks and the number of headlines in each.
print(len(chunked_text_list))
for i in chunked_text_list:
    print(len(i))

100
3
35
34
31


In [64]:

# final batch processing function
def ts_batch_process(topics: List, input_text: List[List], complete_seg = False) -> Tuple[Dict, Dict]:
    """Segregate every chunk in ``input_text`` and merge the per-chunk results.

    If the model output for a chunk is not valid JSON, the JSON object is
    extracted from the surrounding text with ``treat_output_for_json`` and
    parsed again.

    Args:
        topics: Topic names; each one becomes a key of the merged result,
            initialised to an empty list.
        input_text: Chunks, each a list of text lines that are joined with
            newlines into a single prompt.
        complete_seg: Forwarded to ``generate_topic_segregation``.

    Returns:
        ``(final_dict, total_tokens)``: the per-topic results and the token
        usage, both accumulated across chunks with ``add_dicts``.

    Raises:
        json.JSONDecodeError: If the output is still not valid JSON after the
            repair attempt.
    """
    final_dict = {topic: [] for topic in topics}
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    for raw_text_list in input_text:
        raw_text = "\n".join(raw_text_list)

        result = generate_topic_segregation(raw_text=raw_text, topics=topics, complete_seg=complete_seg)
        output = result['output']
        tokens = result['total_usage']

        try:
            json_out = json.loads(output)
        except (TypeError, ValueError):
            # Catch decoding failures only; a bare `except:` would also swallow
            # KeyboardInterrupt and make a long batch impossible to stop.
            print("GOT ERROR IN JSON LOADING")
            out = treat_output_for_json(output)
            json_out = json.loads(out)

        final_dict = add_dicts(final_dict, json_out)
        total_tokens = add_dicts(total_tokens, tokens)

    return final_dict, total_tokens


# Run test case 3 against four reference topics.
topics = ["Sports", "Defence", "Technology", "Elections"]
res, tokens = ts_batch_process(topics, chunked_text_list, complete_seg=False)

# Aggregated token usage and its estimated cost.
print(tokens)
print(calculate_cost_gpt4_omni(tokens))
# Indices the model assigned to each topic.
for topic in topics:
    print(f"Topic: {topic}")
    print(res[topic])
    print()

# Header for the spot-check below. This used to be `topic[0]`, which printed
# the first character of the leftover loop variable ("E" from "Elections")
# rather than the name of the first topic.
print(topics[0])
for i in res[topics[0]]:
    print(text_dict[int(i)])

{'completion_tokens': 168, 'total_tokens': 3317, 'prompt_tokens': 3149}
0.03653
Topic: Sports
['13', '25', '35', '50', '57', '79', '91']

Topic: Defence
['8', '22', '32', '44', '54', '61', '69', '73', '74', '78', '89']

Topic: Technology
['0', '38', '46', '52', '64', '71', '85', '92']

Topic: Elections
['9', '12', '40', '41', '42', '43', '53', '59', '67', '87']

E
Entire Sri Lanka Cricket Board sacked after humiliating World Cup loss to India - Hindustan Times

IND vs SA | A record equalled and an Indian win — a perfect birthday gift for Kohli - The Hindu

Afternoon briefing: Sanjay Raut's India vs Pakistan match dig - Hindustan Times

"Ben Stokes should skip last two World Cup matches and return ... - Crictoday.com (Cricket News) 

Lanka sacks entire Cricket Board following humiliating defeat against India - Business Standard

India 8, Pak 0: Dominance on the big stage continues - Hindustan Times

Cricket World Cup Latest Points Table, Highest Run-Scorer, Wicket-Taker List After India

#### test case 4

In [65]:
# Test case 4: load the headlines, one per line. The file is read as bytes and
# decoded explicitly, so every entry keeps its original line terminator.
with open("core/features/topic_segregation/test_data/news_data_raj.txt", 'rb') as f:
    _text = f.readlines()

_text = [x.decode('utf-8') for x in _text]

# print(_text)

# De-duplicate the headlines. `_text` is already a list, so `sep` is not used
# by get_unique_from_raw_text here.
unique_text_list = get_unique_from_raw_text(_text, sep="\n")
print(len(unique_text_list))

# indexed_text: "<index>: <headline>" strings sent to the model;
# text_dict: index -> headline, used later to map returned indices back to text.
indexed_text, text_dict = get_indexed_text_and_dict(unique_text_list)

# Split the indexed headlines into sub-lists with a 3000-character budget per chunk.
chunked_text_list = chunk_raw_text_list(indexed_text, max_len=3000)

# Report the number of chunks and the number of headlines in each.
print(len(chunked_text_list))
for i in chunked_text_list:
    print(len(i))

100
4
28
28
27
17


In [66]:

# final batch processing function
def ts_batch_process(topics: List, input_text: List[List], complete_seg = False) -> Tuple[Dict, Dict]:
    """Segregate every chunk in ``input_text`` and merge the per-chunk results.

    If the model output for a chunk is not valid JSON, the JSON object is
    extracted from the surrounding text with ``treat_output_for_json`` and
    parsed again.

    Args:
        topics: Topic names; each one becomes a key of the merged result,
            initialised to an empty list.
        input_text: Chunks, each a list of text lines that are joined with
            newlines into a single prompt.
        complete_seg: Forwarded to ``generate_topic_segregation``.

    Returns:
        ``(final_dict, total_tokens)``: the per-topic results and the token
        usage, both accumulated across chunks with ``add_dicts``.

    Raises:
        json.JSONDecodeError: If the output is still not valid JSON after the
            repair attempt.
    """
    final_dict = {topic: [] for topic in topics}
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    for raw_text_list in input_text:
        raw_text = "\n".join(raw_text_list)

        result = generate_topic_segregation(raw_text=raw_text, topics=topics, complete_seg=complete_seg)
        output = result['output']
        tokens = result['total_usage']

        try:
            json_out = json.loads(output)
        except (TypeError, ValueError):
            # Catch decoding failures only; a bare `except:` would also swallow
            # KeyboardInterrupt and make a long batch impossible to stop.
            print("GOT ERROR IN JSON LOADING")
            out = treat_output_for_json(output)
            json_out = json.loads(out)

        final_dict = add_dicts(final_dict, json_out)
        total_tokens = add_dicts(total_tokens, tokens)

    return final_dict, total_tokens


# Run test case 4 against four reference topics.
topics = ["Sports", "Defence", "Technology", "Elections"]
res, tokens = ts_batch_process(topics, chunked_text_list, complete_seg=False)

# Aggregated token usage and its estimated cost.
print(tokens)
print(calculate_cost_gpt4_omni(tokens))
# Indices the model assigned to each topic.
for topic in topics:
    print(f"Topic: {topic}")
    print(res[topic])
    print()

# Header for the spot-check below. This used to be `topic[0]`, which printed
# the first character of the leftover loop variable ("E" from "Elections")
# rather than the name of the first topic.
print(topics[0])
for i in res[topics[0]]:
    print(text_dict[int(i)])

{'completion_tokens': 389, 'total_tokens': 6478, 'prompt_tokens': 6089}
0.07256
Topic: Sports
[]

Topic: Defence
[]

Topic: Technology
[]

Topic: Elections
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']

E
