# GPT for Linguistic Annotation

In [1]:
# imports for this project
import gspread
import pandas as pd
import os
from utils import append_data, lm_completion

### Sys-admin

In [2]:
# Google Sheets API: which spreadsheet and tab to read, plus the client used to reach them.
SHEET_ID = "1e_KpDnyNriLSNWMt-qIvcqGqtH_JC9YrzoMDHNiqdNA"
SHEET_NAME = "NwayMain"

# Authenticate with the service-account key file; the path is relative to the
# directory the notebook is run from.
gc = gspread.service_account(filename="fignews-7b178eec49aa.json")

Get Sample Data from Google Sheets

In [3]:
# Open the configured spreadsheet/tab and load all of its records into a DataFrame.
spreadsheet = gc.open_by_key(SHEET_ID)
worksheet = spreadsheet.worksheet(SHEET_NAME)
rows = worksheet.get_all_records()
df = pd.DataFrame(data=rows)

# Sanity check: print the first five rows, then show summary statistics as the
# cell's result.
print(df.head())
df.describe()

  Batch Source Language  ID  Type  \
0   B01         English   1  MAIN   
1   B01         English   2  MAIN   
2   B01         English   4  MAIN   
3   B01         English   7  MAIN   
4   B01         English   8  MAIN   

                                                Text  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                          English MT  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                           Arabic MT  Annotator ID_1  \
0  خاض الحوثيون في اليمن الحرب 

Unnamed: 0,ID,Annotator ID_1,Annotator ID_2,Annotator ID_3,Annotator ID_4,N-way count bias,N-way count propaganda
count,1800.0,1800.0,1800.0,1800.0,1800.0,1800.0,1800.0
mean,2641.267222,1.0,2.0,3.0,4.0,0.413889,0.413333
std,4519.388845,0.0,0.0,0.0,0.0,0.492666,0.492568
min,1.0,1.0,2.0,3.0,4.0,0.0,0.0
25%,286.75,1.0,2.0,3.0,4.0,0.0,0.0
50%,957.0,1.0,2.0,3.0,4.0,0.0,0.0
75%,3500.75,1.0,2.0,3.0,4.0,1.0,1.0
max,42210.0,1.0,2.0,3.0,4.0,1.0,1.0


In [None]:
def apply_tag(df: pd.DataFrame, completion_fn=None) -> pd.DataFrame:
    """Add a 'GPT-4 Bias proposal' column holding the model's bias label per text.

    Every value of the 'English MT' column is sent to the completion function
    together with the bias-labelling instructions below; the returned value is
    stored in the new column.

    Args:
        df: DataFrame with an 'English MT' column. It is modified in place.
        completion_fn: Optional callable that takes a list of
            ``{"role": ..., "content": ...}`` message dicts and returns the
            completion. Defaults to ``lm_completion`` from ``utils``.

    Returns:
        The same DataFrame object that was passed in, with the new column added.
    """
    prompt: str = """
    You are an expert in linguistic media bias detection. Evaluate the text based solely on the language used to describe actions or parties and assign the appropriate label according to the following criteria:
    Unbiased: Label as "Unbiased" if the text reports events in a straightforward and factual manner without using language that emotionally charges or slants the description towards either approval or disapproval. The language should not imply moral judgments.
    Biased against Palestine: Label as "Biased against Palestine" only if the text uses language that specifically denigrates Palestine or Palestinians, such as using pejorative terms, unfounded accusations, or portraying actions without equivalent descriptions of similar actions by others. This bias should be clear and direct, not inferred from neutral descriptions of conflict actions. Talking about Hamas does not automatically imply Bias against Palestine.
    Biased against Israel: Use this label if the language portrays Israel or Israelis negatively through the use of loaded language or misrepresentation. This could include terms that unfairly categorize Israeli actions or policies, such as using "genocide" without factual backing or negative descriptors for Israeli military actions not used for similar actions by others.
    Biased against both Palestine and Israel: Use this label if the text uses language that negatively portrays both sides, suggesting hostility or illegitimacy towards both, often using a tone that ridicules or condemns the conflict or both parties involved.
    Biased against others: Apply this label if the bias is directed towards other entities not directly related to the Israel-Palestine conflict. If multiple entities including Israel and/or Palestine are portrayed negatively, choose the most specific label relevant to the content.
    Unclear: Choose "Unclear" if the text lacks sufficient context to determine the presence of bias or if the language is ambiguous without clear indications of leaning towards or against any party.
    Not Applicable: Select "Not Applicable" for texts that are irrelevant to the task or unrelated to any conflict involving Israel and Palestine.

    Clarify that descriptions of conflict or military actions in themselves do not constitute bias unless coupled with language that unjustly portrays one side in a morally negative light compared to the other.
    Encourage checking if comparable actions by different sides are described differently; this disparity can indicate bias. Check the label defintions above for more guidance.
    Respond with the label only, without any prefix or additional explanation.
    Example for Analysis:
    """
    # Resolve the default at call time so the function can be defined (and
    # tested) without the project helper being importable.
    complete = lm_completion if completion_fn is None else completion_fn

    def propose_label(text):
        # Instructions go in the system message; the text under analysis is
        # the user message. NOTE(review): assumes the completion helper
        # forwards these dicts to a chat-style API -- confirm in utils.
        return complete([
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ])

    df['GPT-4 Bias proposal'] = df['English MT'].apply(propose_label)
    return df

# Run the bias tagger over the sheet data. apply_tag writes its column into
# `df` and returns that same DataFrame, so `df_annotated` and `df` are one object.
df_annotated = apply_tag(df)
print(df_annotated.head(5))

In [None]:
# Show summary statistics of the annotated frame as the cell's result.
df_annotated.describe()

In [None]:
# Keep a local Excel copy of the full annotated frame.
df_annotated.to_excel("bias_annotation_gpt4.xlsx")

# Hand only the model's bias column to append_data together with the worksheet.
# NOTE(review): presumably this writes the column back to the sheet, with 2
# selecting the target position -- confirm against utils.append_data.
bias_proposals = df_annotated[["GPT-4 Bias proposal"]]
append_data(bias_proposals, worksheet, 2)

In [None]:
def tag_connotations(df: pd.DataFrame, completion_fn=None) -> pd.DataFrame:
    """Add a 'GPT-4 connotation' column with the model's inline-tagged text.

    Every value of the 'English MT' column is sent to the completion function
    together with the tagging instructions below, which ask for
    [negative]/[positive]/[factuality] tags placed after the words they apply
    to; the returned value is stored in the new column.

    Args:
        df: DataFrame with an 'English MT' column. It is modified in place.
        completion_fn: Optional callable that takes a list of
            ``{"role": ..., "content": ...}`` message dicts and returns the
            completion. Defaults to ``lm_completion`` from ``utils``.

    Returns:
        The same DataFrame object that was passed in, with the new column added.
    """
    prompt: str = """
    You are an expert in detecting linguistic media bias through word connotations and implications about factuality in texts. Your task is to label specific types of words in a text according to the following criteria:

    [negative]: Use this label for words that clearly carry a negative connotation, influencing the reader's perception negatively.
    [positive]: Use this label for words that clearly have a positive connotation, influencing the reader's perception positively.
    [factuality]: Use this label for words that cast doubt or imply uncertainty about the factuality of the information presented.
    Focus on tagging verbs, adjectives, and adverbs. Avoid tagging named entities or neutral words. Label directly after the word it applies to. For example:

    This is an awful[negative] text.
    Do not change anything else about the text. Just add the tags where applicable. Here is the text for analysis:
    """
    # Resolve the default at call time so the function can be defined (and
    # tested) without the project helper being importable.
    complete = lm_completion if completion_fn is None else completion_fn

    def tag_text(text):
        # Instructions go in the system message; the text under analysis is
        # the user message. NOTE(review): assumes the completion helper
        # forwards these dicts to a chat-style API -- confirm in utils.
        return complete([
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ])

    df['GPT-4 connotation'] = df['English MT'].apply(tag_text)
    return df

# Run the connotation tagger. tag_connotations writes its column into `df` and
# returns that same DataFrame, so this name refers to the same object as `df`.
df_annotated_connotations = tag_connotations(df)
print(df_annotated_connotations.head(5))

In [None]:
# Save the connotation-tagged frame to a local Excel file.
df_annotated_connotations.to_excel("annotated_connotations_full_run.xlsx")

Tag Propaganda and Update Google Sheets with our Results

In [4]:
def apply_propaganda_tag(df: pd.DataFrame, completion_fn=None) -> pd.DataFrame:
    """Add a 'GPT-4 Propaganda proposal' column with the model's label per text.

    Every value of the 'English MT' column is sent to the completion function
    together with the propaganda-labelling instructions below; the returned
    value is stored in the new column.

    Args:
        df: DataFrame with an 'English MT' column. It is modified in place.
        completion_fn: Optional callable that takes a list of
            ``{"role": ..., "content": ...}`` message dicts and returns the
            completion. Defaults to ``lm_completion`` from ``utils``.

    Returns:
        The same DataFrame object that was passed in, with the new column added.
    """
    prompt: str = """
    You are an expert in linguistic media propaganda detection. Carefully evaluate the text based on the language used and the context provided, and assign the appropriate label according to the refined criteria below. Respond with the label only, without any prefix or additional explanation.
    Definitions and Criteria:
    Propaganda: Use this label for texts that actively promote a specific political or ideological agenda through highly charged emotional language, overt promotional statements, or a selective presentation of facts that clearly aim to manipulate public perception. Examples include texts that celebrate military actions with terms like "win" against "terror," implying a righteous cause, or that frame conflict participants in a manner that clearly supports one side over another.
    Not Propaganda: Apply this label to texts that, while potentially using biased language, do not combine this with overt calls to action or explicit valorization/demonization that manipulates public perception. A statement reporting expected military actions or describing events without additional commentary or emotional framing should be considered under this category, even if it includes terms like "terror attacks" which may reflect some level of bias but do not by themselves constitute propaganda.
    Unclear: This label should be used for texts where the intent to inform or persuade is not clear-cut, particularly in brief texts or statements where contextual cues are minimal.
    Not Applicable: Use this label for texts that do not engage in any form of persuasive communication about controversial or conflict-related issues.
    Additional Guidance for Annotators:
    Evaluating the Intensity of Language: Focus on how intensely the language used in the text promotes one side or demeans another. Propaganda typically involves a strong bias towards one perspective, often accompanied by language that seeks to evoke a specific emotional response from the audience.
    Contextual Sensitivity: Consider the broader media context and the usual reporting style of the source. This can provide clues about whether a statement is part of a pattern of propaganda or a more isolated instance of biased reporting.
    Example for Analysis:
    """
    # Resolve the default at call time so the function can be defined (and
    # tested) without the project helper being importable.
    complete = lm_completion if completion_fn is None else completion_fn

    def propose_label(text):
        # Instructions go in the system message; the text under analysis is
        # the user message. NOTE(review): assumes the completion helper
        # forwards these dicts to a chat-style API -- confirm in utils.
        return complete([
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ])

    df['GPT-4 Propaganda proposal'] = df['English MT'].apply(propose_label)
    return df

# Run the propaganda tagger. apply_propaganda_tag writes its column into `df`
# and returns that same DataFrame, so `df_annotated` and `df` are one object.
df_annotated = apply_propaganda_tag(df)
print(df_annotated.head(5))

  Batch Source Language  ID  Type  \
0   B01         English   1  MAIN   
1   B01         English   2  MAIN   
2   B01         English   4  MAIN   
3   B01         English   7  MAIN   
4   B01         English   8  MAIN   

                                                Text  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                          English MT  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                           Arabic MT  Annotator ID_1  \
0  خاض الحوثيون في اليمن الحرب 

In [9]:
# Save the propaganda-annotated frame to a local Excel file.
# NOTE(review): the file name spells "propanda"; left unchanged in case other
# tooling reads this exact path.
df_annotated.to_excel("annotated_propanda.xlsx")

In [8]:
# The selected column must match the name created by apply_propaganda_tag
# ("GPT-4 Propaganda proposal"); the earlier spelling "Propanda" names a column
# that does not exist and raises KeyError.
# NOTE(review): presumably append_data writes the column to the worksheet, with
# 2 selecting the target position -- confirm against utils.append_data.
append_data(df_annotated[["GPT-4 Propaganda proposal"]], worksheet, 2)