# GPT for Linguistic Annotation

In [16]:
# imports for this project
import gspread
import pandas as pd
import os
from utils import append_data, lm_completion

### Setup: Google Sheets credentials and sheet selection

In [19]:
# Google Sheets API
# The service-account key file can be overridden through the
# GSPREAD_SERVICE_ACCOUNT_FILE environment variable, so the notebook does not
# depend on one specific key file name in the working directory. When the
# variable is unset, the original file name is used.
SERVICE_ACCOUNT_FILE = os.environ.get(
    "GSPREAD_SERVICE_ACCOUNT_FILE", 'fignews-7b178eec49aa.json'
)
gc = gspread.service_account(filename=SERVICE_ACCOUNT_FILE)
# Key of the spreadsheet and name of the worksheet (tab) read by the cells below.
SHEET_ID = "1e_KpDnyNriLSNWMt-qIvcqGqtH_JC9YrzoMDHNiqdNA"
SHEET_NAME = "NwayMain"

### Get Sample Data from Google Sheets

In [22]:
# Open the configured spreadsheet/worksheet and load every record into a DataFrame.
spreadsheet = gc.open_by_key(SHEET_ID)
worksheet = spreadsheet.worksheet(SHEET_NAME)
rows = worksheet.get_all_records()
df = pd.DataFrame(rows)

# Fail early with a clear message: the annotation cells below read this column,
# and each of them issues one language-model call per row.
assert "English MT" in df.columns, "worksheet is missing the 'English MT' column"

print(df.head())
df.describe()

  Batch Source Language  ID  Type  \
0   B01         English   1  MAIN   
1   B01         English   2  MAIN   
2   B01         English   4  MAIN   
3   B01         English   7  MAIN   
4   B01         English   8  MAIN   

                                                Text  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                          English MT  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                           Arabic MT  Annotator ID_1  \
0  خاض الحوثيون في اليمن الحرب 

Unnamed: 0,ID,Annotator ID_1,Annotator ID_2,Annotator ID_3,Annotator ID_4,N-way count bias,N-way count propaganda
count,1800.0,1800.0,1800.0,1800.0,1800.0,1800.0,1800.0
mean,2641.267222,1.0,2.0,3.0,4.0,0.263333,0.262778
std,4519.388845,0.0,0.0,0.0,0.0,0.440564,0.440265
min,1.0,1.0,2.0,3.0,4.0,0.0,0.0
25%,286.75,1.0,2.0,3.0,4.0,0.0,0.0
50%,957.0,1.0,2.0,3.0,4.0,0.0,0.0
75%,3500.75,1.0,2.0,3.0,4.0,1.0,1.0
max,42210.0,1.0,2.0,3.0,4.0,1.0,1.0


In [11]:
def apply_tag(df, completion_fn=None):
    """Label each row's English translation with a media-bias category.

    Every value of the ``English MT`` column is sent to the language model
    together with the labelling instructions, and the reply is stored in a new
    ``GPT proposal`` column.

    Args:
        df: DataFrame with an ``English MT`` column holding the text to label.
        completion_fn: Optional callable that receives a list of chat messages
            (dicts with ``role`` and ``content`` keys) and returns the model's
            reply. Defaults to ``lm_completion`` imported from ``utils``.

    Returns:
        The same DataFrame object that was passed in, with the ``GPT proposal``
        column added (the input frame is modified in place).
    """
    if completion_fn is None:
        completion_fn = lm_completion

    prompt: str = """
    You are an expert for linguistic media bias detection. You are asked to label the following text with one of the following labels:
    Unbiased
    Biased against Palestine
    Biased against Israel
    Biased against both Palestine and Israel
    Biased against others
    Unclear
    Not Applicable.
    There is not more context to include that what is given to you, so please make your best judgment based on the text alone. Just reply with the label. Here is the text:
    """

    def label_text(text):
        # The instructions belong in the system message; the text to classify
        # is the user turn. The original sent both as "system".
        # NOTE(review): assumes lm_completion forwards these messages to a
        # chat-completion style API - confirm in utils.
        return completion_fn([
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ])

    df['GPT proposal'] = df['English MT'].apply(label_text)
    return df

# Label only a random subset of the sheet: every row costs one language-model call.
# NOTE(review): `sample` was not defined in any remaining cell, so this cell
# raised NameError on a fresh kernel. The original sample size is unknown -
# adjust SAMPLE_SIZE as needed. The seed keeps the subset reproducible.
SAMPLE_SIZE = 50
SAMPLE_SEED = 42
sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

df_annotated = apply_tag(sample)
print(df_annotated.head())

     Batch Source Language     ID  Type  \
10     B01         English     21  MAIN   
838    B01           Hindi    907  MAIN   
548    B01          Hebrew     13  MAIN   
75     B01         English    417  MAIN   
1257   B02          Arabic  37691  MAIN   

                                                   Text  \
10    SENDING A CLEAR MESSAGE: Americans joined toge...   
838   मेरी बहन और उनकी 2 बच्चियों को हमास अगवा किया ...   
548   ״היי חנוך. בן זוגי הי״ד סרן אור יוסף רן נהרג ב...   
75    Hamas has invited Elon Musk to witness in pers...   
1257  رؤى طالبة في كلية الطب بالجامعة الإسلامية في غ...   

                                             English MT  \
10    SENDING A CLEAR MESSAGE: Americans joined toge...   
838   My sister and her two daughters have been kidn...   
548   "Hey Enoch. My partner in the military, Capt. ...   
75    Hamas has invited Elon Musk to witness in pers...   
1257  Ruaa is a student at the Faculty of Medicine a...   

                               

In [5]:
# Persist the bias-labelled sample next to the notebook as an Excel workbook.
df_annotated.to_excel(excel_writer="annotated.xlsx")

In [21]:
def tag_connotations(df, completion_fn=None):
    """Ask the language model to tag connotation-bearing words in each text.

    Every value of the ``English MT`` column is sent to the language model
    together with the tagging instructions, and the reply is stored in a new
    ``GPT connotation`` column.

    Args:
        df: DataFrame with an ``English MT`` column holding the text to tag.
        completion_fn: Optional callable that receives a list of chat messages
            (dicts with ``role`` and ``content`` keys) and returns the model's
            reply. Defaults to ``lm_completion`` imported from ``utils``.

    Returns:
        The same DataFrame object that was passed in, with the
        ``GPT connotation`` column added (the input frame is modified in place).
    """
    if completion_fn is None:
        completion_fn = lm_completion

    prompt: str = """
    <prompt>
You are an expert for linguistic media bias detection. You are asked to label words in the following text with one of the following labels:
[positive]: meaning that the word has strong positive emotional or biased connotations.
[negative]: meaning that the word has strong negative emotional or biased connotations.
[factuality]: meaning that the word adds doubt to the factuality or veracity of the text.

Directly apply the tag behind the word it applies to, for example:
This is an awful[negative] text.

Do not change anything else about the text, just add the tags where applicable.

Here is the text: 
</prompt>
    """

    def tag_text(text):
        # The instructions belong in the system message; the text to tag is
        # the user turn. The original sent both as "system".
        # NOTE(review): assumes lm_completion forwards these messages to a
        # chat-completion style API - confirm in utils.
        return completion_fn([
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ])

    df['GPT connotation'] = df['English MT'].apply(tag_text)
    return df

# tag_connotations adds its result column to the frame it receives, so pass a
# copy: the frame loaded from the sheet stays untouched and this cell can be
# re-run without side effects on `df`.
df_annotated_connotations = tag_connotations(df.copy())
print(df_annotated_connotations.head())

  Batch Source Language  ID  Type  \
0   B01         English   1  MAIN   
1   B01         English   2  MAIN   
2   B01         English   4  MAIN   
3   B01         English   7  MAIN   
4   B01         English   8  MAIN   

                                                Text  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                          English MT  \
0  Yemen's Houthis have waded into the Israel-Ham...   
1             Isreal - Hamas Conflict | Face to Face   
2  Videos show how armed men from Gaza stormed a ...   
3  Protest in Aligarh Muslim University in suppor...   
4  IDF releases audio recording about misfired ro...   

                                           Arabic MT  Annotator ID_1  \
0  خاض الحوثيون في اليمن الحرب 

In [23]:
# Persist the connotation-tagged frame next to the notebook as an Excel workbook.
df_annotated_connotations.to_excel(excel_writer="annotated_connotations.xlsx")

### Update Google Sheets with our Results

In [25]:
# Send only the model's connotation output back to the worksheet.
# NOTE(review): the meaning of the trailing 2 is defined by utils.append_data
# and is not visible here - confirm before changing it.
connotation_column = df_annotated_connotations.loc[:, ["GPT connotation"]]
append_data(connotation_column, worksheet, 2)