# GPT for Linguistic Annotation

In [1]:
# imports for this project
import gspread
import pandas as pd
import os
from utils import append_data, lm_completion

### Sys-admin

In [2]:
# Google Sheets API
gc = gspread.service_account(filename='fignews-7b178eec49aa.json')
SHEET_ID = "1e_KpDnyNriLSNWMt-qIvcqGqtH_JC9YrzoMDHNiqdNA"
SHEET_NAME = "NwayMain"

Get Sample Data from Google Sheets

In [3]:
spreadsheet = gc.open_by_key(SHEET_ID)
worksheet = spreadsheet.worksheet(SHEET_NAME)
rows = worksheet.get_all_records()
df = pd.DataFrame(rows)
print(df.describe())

                 ID  Annotator ID_1  Annotator ID_2  Annotator ID_3  \
count   1800.000000          1800.0          1800.0          1800.0   
mean    2641.267222             1.0             2.0             3.0   
std     4519.388845             0.0             0.0             0.0   
min        1.000000             1.0             2.0             3.0   
25%      286.750000             1.0             2.0             3.0   
50%      957.000000             1.0             2.0             3.0   
75%     3500.750000             1.0             2.0             3.0   
max    42210.000000             1.0             2.0             3.0   

       Annotator ID_4  N-way count bias  N-way count propaganda  
count          1800.0       1800.000000             1800.000000  
mean              4.0          0.255556                0.255000  
std               0.0          0.436294                0.435982  
min               4.0          0.000000                0.000000  
25%               4.0         

In [4]:
def apply_tag(df):
    
    prompt: str = """
    You are an expert for linguistic media bias detection. You are asked to label the following text with one of the following labels:
    Unbiased
    Biased against Palestine
    Biased against Israel
    Biased against both Palestine and Israel
    Biased against others
    Unclear
    Not Applicable.
    There is not more context to include that what is given to you, so please make your best judgment based on the text alone. Just reply with the label. Here is the text:
    """
    df['GPT proposal'] = df['English MT'].apply(lambda row: lm_completion([
        {"role": "system", "content": prompt},
        {"role": "system", "content": row}]))
    return df

df_annotated = apply_tag(sample)
print(df_annotated.head())

      Batch Source Language     ID  Type  \
2149    B03          Arabic  14521  MAIN   
1833    B03         English    673  MAIN   
309     B01          Arabic   3530  MAIN   
11796   B14         English   3727  MAIN   
12011   B14          Arabic   9325  MAIN   

                                                    Text  \
2149   "فتح": تصريحات المدعو حماد تعبر عن انحطاط وطني...   
1833   HAPPENING NOW: Pres. Biden delivers remarks af...   
309    بدأ عملها بخطف جلعاد شاليط، وكانت المسؤولة عن ...   
11796  The Republican presidential candidates all cal...   
12011  منذ 15 عاماً.. 4 حروب اندلعت بين الفلسطينيين و...   

                                              English MT  \
2149   "Fatah": The statements of the so-called Hamma...   
1833   HAPPENING NOW: Pres. Biden delivers remarks af...   
309    Her work began with the kidnapping of Gilad Sh...   
11796  The Republican presidential candidates all cal...   
12011  15 years ago... 4 wars broke out between Pales...   

             

In [5]:
df_annotated.to_excel("annotated.xlsx")

In [7]:
def tag_connotations(df):
    prompt: str = """
    You are an expert for linguistic media bias detection. You are asked to label words in the following text with one of the following labels:
    [negative]: meaning that the word has a negative connotation
    [positive]: meaning that the word has a positive connotation
    [factuality]: meaning that the word implies a doubt about the factuality of the statement
    Focus on tagging verbs and adjectives and proper nouns. Do not tag named entities.
    Directly apply the tag behind the word it applies to, for example:
    This is an awful[negative] text.
    Do not change anything else about the text, just add the tags, where applicable. Here is the text:
    """
    df['GPT connotation'] = df['English MT'].apply(lambda row: lm_completion([
        {"role": "system", "content": prompt},
        {"role": "system", "content": row}]))
    return df

df_annotated_connotations = tag_connotations(df.sample(5))
print(df_annotated_connotations.describe())

                ID  Annotator ID_1  Annotator ID_2  Annotator ID_3  \
count     5.000000             5.0             5.0             5.0   
mean   2085.400000             1.0             2.0             3.0   
std    3926.991762             0.0             0.0             0.0   
min      78.000000             1.0             2.0             3.0   
25%      99.000000             1.0             2.0             3.0   
50%     178.000000             1.0             2.0             3.0   
75%     995.000000             1.0             2.0             3.0   
max    9077.000000             1.0             2.0             3.0   

       Annotator ID_4  N-way count bias  N-way count propaganda  
count             5.0          5.000000                5.000000  
mean              4.0          0.400000                0.400000  
std               0.0          0.547723                0.547723  
min               4.0          0.000000                0.000000  
25%               4.0          0.000000

In [6]:
df_annotated_connotations.to_excel("annotated_connotations.xlsx")

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Using cached openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


Update Google Sheets with our Results

In [8]:
append_data(df_annotated_connotations[["GPT proposal", "GPT connotation"]], worksheet, 2)