In [1]:
import pandas as pd

from owl.messagepack import load_compressed_msgpack, create_compressed_msgpack
from owl.log import get_logger

# Import prompts
import prompts.classification_percentage_analysis as CPA
import prompts.summary as SMR
import prompts.highlight as HLT
import prompts.country_tagging as CT
import prompts.issues as ISS

# Import system prompts
import system_prompts.classification_percentage_analysis as SCPA
import system_prompts.summary as SSMR
import system_prompts.highlight as SHLT
import system_prompts.country_tagging as SCT
import system_prompts.issues as SISS

# Logger for this notebook, obtained from owl.log and named after the current module (__name__).
log = get_logger(__name__)

In [2]:
# For every task, collect all non-dunder attributes of the matching prompt module.
# The values are whatever objects the module exposes under those names, in dir() order.
USER_PROMPTS = {
    task: [getattr(module, attr) for attr in dir(module) if not attr.startswith("__")]
    for task, module in (
        ("classification", CPA),
        ("summary", SMR),
        ("highlight", HLT),
        ("country_tagging", CT),
        ("issues", ISS),
    )
}

# Same collection, applied to the system-prompt modules (same task keys, same order).
SYSTEM_PROMPTS = {
    task: [getattr(module, attr) for attr in dir(module) if not attr.startswith("__")]
    for task, module in (
        ("classification", SCPA),
        ("summary", SSMR),
        ("highlight", SHLT),
        ("country_tagging", SCT),
        ("issues", SISS),
    )
}

In [3]:
# Load the unprocessed article dump into a DataFrame and preview its last five rows.
raw_data = pd.DataFrame(data=load_compressed_msgpack('resource/raw_dataset.msg.gz'))
raw_data.tail(n=5)


2024-12-20 14:03:17 | INFO    | messagepack:load_compressed_msgpack:L49 - resource/raw_dataset.msg.gz is loaded.


Unnamed: 0,id,timestamp,title,body
55795,55796,2024-01-01 22:21:30,Teenager fatally stabbed in London named as Ha...,A 16-year-old boy stabbed to death as he waite...
55796,55797,2024-01-01 22:21:30,Teenager fatally stabbed in London named as Ha...,A 16-year-old boy stabbed to death as he waite...
55797,55798,2024-01-01 23:00:43,Tell us: what’s the most important lesson your...,Have you found yourself embracing the tried an...
55798,55799,2024-01-01 23:00:43,Thousands of ‘legacy’ asylum cases awaiting de...,The Home Office is yet to make decisions on th...
55799,55800,2024-01-01 23:46:19,Jürgen Klopp thrilled by Salah’s impact … and ...,Jürgen Klopp praised Mohamed Salah’s gamechang...


In [4]:
# Load the annotated dataset (the frame later passed to process_data) and preview its last five rows.
data = pd.DataFrame(data=load_compressed_msgpack('resource/dataset.msg.gz'))
data.tail(n=5)


2024-12-20 14:03:18 | INFO    | messagepack:load_compressed_msgpack:L49 - resource/dataset.msg.gz is loaded.


Unnamed: 0,id,timestamp,title,body,is_related,percentage_related,summary,keypoints,issue_category,issues,countries
1348,53246,2023-12-11 03:07:51,Qatar says ‘narrowing window’ for freeing more...,3.07am GMT Summary of the day so far It’s ...,True,60.0,The article discusses the ongoing Israel-Gaza ...,[The UN agency for Palestinian refugees (UNRWA...,"{'Shelter': 50, 'Food': 30, 'Health': 20}","{'Lack of housing': 15, 'Overcrowded camps': 1...","{'Israel': ['Gaza', 'Tel Aviv'], 'Palestine': ..."
1349,53247,2023-12-11 03:07:51,Qatar says ‘narrowing window’ for freeing more...,3.07am GMT Summary of the day so far It’s ...,True,35.0,The article discusses the ongoing conflict bet...,[The conflict between Israel and Hamas has led...,"{'Shelter': 65, 'Health': 15, 'Food': 15, 'Fam...","{'Lack of housing': 20, 'Overcrowded camps': 1...","{'Israel': ['Gaza', 'Tel Aviv'], 'Palestine': ..."
1350,53268,2023-12-11 07:00:13,Ireland's housing crisis is a disaster for its...,Ireland is in a dark place. Riots in Dublin la...,True,30.0,The article discusses how Ireland's housing cr...,[Ireland's housing crisis is being used by far...,"{'Shelter': 70, 'Security': 30}","{'Lack of housing': 40, 'Discrimination and xe...",{'Ireland': ['Dublin']}
1351,53343,2023-12-11 15:35:00,Tory MPs given ‘outdated’ analysis in push for...,Ministers are using an “outdated and flawed” H...,True,70.0,The article discusses the UK government's effo...,[The UK government is using outdated Home Offi...,"{'Legal Issues': 65, 'Family Reunification': 1...","{'Refugee status processing delays': 25, 'Depo...","{'United Kingdom': ['Dover', 'Kent'], 'Rwanda'..."
1352,53353,2023-12-11 16:21:27,Russia to hold presidential election in annexe...,3.48pm GMT Closing summaryUkraine’s president...,False,,,,,,


In [5]:
def assign_strings_equally(df, column_name, strings):
    """
    Assigns strings from a list to the rows of a DataFrame in contiguous blocks.

    Each string receives ``len(df) // len(strings)`` consecutive rows, in list
    order. When the division is not exact, all leftover rows receive the last
    string (so the last block can be larger than the others). When the
    DataFrame has fewer rows than there are strings, each row receives one
    distinct string, taken from the front of the list.

    Parameters:
        df (pd.DataFrame): The DataFrame to which strings are assigned.
        column_name (str): The name of the column to store the assigned strings.
        strings (list): The list of strings to distribute. Must not be empty.

    Returns:
        pd.DataFrame: A copy of ``df`` with the strings assigned to the
        specified column. The input DataFrame is not modified.

    Raises:
        ValueError: If ``strings`` is empty.
    """
    strings = list(strings)
    if not strings:
        raise ValueError("`strings` must contain at least one item to assign.")

    num_rows = len(df)
    # Clamp to 1: with fewer rows than strings the integer division yields 0,
    # which would raise ZeroDivisionError in the index computation below.
    rows_per_string = max(num_rows // len(strings), 1)
    last_index = len(strings) - 1

    # Rows past the final full block fall back to the last string.
    assigned_strings = [
        strings[min(i // rows_per_string, last_index)]
        for i in range(num_rows)
    ]

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()
    df[column_name] = assigned_strings

    return df


def process_data(
    data,
    system_prompts,
    user_prompts,
    output_path,
    mode,
):
    """
    Builds a prompt/response table for one task and writes it to a parquet file.

    Steps:
        1. Distribute ``system_prompts`` and ``user_prompts`` over the rows via
           ``assign_strings_equally`` (columns ``SYSTEM_PROMPTS`` / ``USER_PROMPTS``).
        2. Append each row's ``body`` to its user prompt.
        3. Build the ``OUTPUT`` column from the label column(s) of ``mode``.
        4. Keep only ``id``, ``timestamp``, ``SYSTEM_PROMPTS``, ``USER_PROMPTS``
           and ``OUTPUT`` and write the result to ``output_path``.

    Parameters:
        data (pd.DataFrame): Source rows. Must contain ``id``, ``timestamp``,
            ``body`` and the label column(s) for ``mode``.
        system_prompts (list): System prompts to distribute over the rows.
        user_prompts (list): User prompts to distribute over the rows.
        output_path (str): Destination of the parquet file. For a local path,
            the parent directory is created if it does not exist.
        mode (str): One of 'classification', 'summary', 'highlight',
            'country_tagging', 'issues'.

    Returns:
        pd.DataFrame: The table that was written.

    Raises:
        KeyError: If ``mode`` is not one of the supported values.

    Notes:
        - ``OUTPUT`` is the ``str()`` of the label value (for 'classification',
          of a dict), wrapped for the other modes as ``{"<key>": "<value>"}``.
          The result is therefore not guaranteed to be valid JSON.
        - The wrapper key is 'countries' for 'country_tagging' and the mode name
          otherwise, so 'highlight' rows use the key "highlight" although the
          values come from the ``keypoints`` column.
        - For every mode except 'classification', rows with a missing value in
          ANY column are dropped, and this happens after the prompts were
          distributed over all rows.
    """
    import os  # local import so the function does not depend on another cell

    columns_by_mode = {
        'classification': ['is_related', 'percentage_related'],
        'summary': ['summary'],
        'highlight': ['keypoints'],
        'country_tagging': ['countries'],
        'issues': ['issues'],
    }

    output_columns = [
        'id',
        'timestamp',
        'SYSTEM_PROMPTS',
        'USER_PROMPTS',
        'OUTPUT',
    ]

    if mode not in columns_by_mode:
        # Same exception type as the plain dict lookup, with an actionable message.
        raise KeyError(
            f"Unknown mode {mode!r}; expected one of {sorted(columns_by_mode)}"
        )
    input_columns = columns_by_mode[mode]

    data = assign_strings_equally(data, 'SYSTEM_PROMPTS', system_prompts)
    data = assign_strings_equally(data, 'USER_PROMPTS', user_prompts)
    # Positional zip instead of a row-wise apply: same strings, no per-row
    # Series construction, and well-defined for an empty frame.
    data['USER_PROMPTS'] = [
        f"{prompt}{body}"
        for prompt, body in zip(data['USER_PROMPTS'], data['body'])
    ]

    if mode == 'classification':
        # A missing percentage is treated as "not related".
        data['OUTPUT'] = data[input_columns].apply(
            lambda x: {"is_related": False} if pd.isna(x['percentage_related'])
            else {"is_related": x['is_related'], 'percentage_related': int(x['percentage_related'])}, axis=1
        ).astype(str)
    else:
        data = data.dropna().reset_index(drop=True)
        output_key = 'countries' if mode == 'country_tagging' else mode
        data['OUTPUT'] = data[input_columns[0]].astype(str).apply(
            lambda x: f'''{{"{output_key}": "{x}"}}'''
        )

    data = data[output_columns]

    # Create the target directory for local paths so a fresh checkout can run
    # the notebook top to bottom; URL-style paths are left to the parquet engine.
    if isinstance(output_path, (str, os.PathLike)):
        path_text = os.fspath(output_path)
        parent_dir = os.path.dirname(path_text)
        if parent_dir and "://" not in path_text:
            os.makedirs(parent_dir, exist_ok=True)

    data.to_parquet(output_path, engine="pyarrow")

    return data
    



In [6]:
# Build one prompt/response table per task; each is also written to
# processed_dataset/<task>.parquet by process_data.
# The loop variable has a real name: assigning to `_` in a notebook shadows
# IPython's "last output" variable and conventionally marks a value as unused.
dataset = {}

for task in USER_PROMPTS:
    dataset[task] = process_data(
        data,
        SYSTEM_PROMPTS[task],
        USER_PROMPTS[task],
        f'processed_dataset/{task}.parquet',
        task,
    )


In [7]:
# Show the prompt pair and expected output of the second row of the 'issues' table.
sample_row = dataset['issues'].iloc[1].to_dict()
for field in ('SYSTEM_PROMPTS', 'USER_PROMPTS', 'OUTPUT'):
    print(sample_row[field])

You are a refugee expert, skilled in analyzing articles to identify key issues and their relevance percentages.
Issues Mapping Task: Analyze the given article to determine its relevance to the provided issues. Assign a relevancy percentage for each identified issue that reflects its significance within the article.

issues:[
 'Access to medical care',
 'Disease outbreaks',
 'Mental health support',
 'Maternal and child health',
 'Chronic illnesses',
 'Vaccination shortages',
 'Lack of housing',
 'Overcrowded camps',
 'Unsafe living conditions',
 'Long-term resettlement issues',
 'Lack of sanitation facilities',
 'Exposure to extreme weather',
 'Malnutrition',
 'Food shortages',
 'Access to clean water',
 'Food distribution inefficiencies',
 'Poor quality of food aid',
 'Lack of schooling for children',
 'Access to higher education',
 'Language barriers in schools',
 'Vocational training opportunities',
 'Dropout rates due to displacement',
 'Physical safety threats',
 'Armed conflicts'