# Importing libraries

In [None]:
import os
import time
from tqdm import tqdm
from collections import Counter

import pandas as pd

import openai
openai.api_key = "ENTER-YOUR-OPEN-AI-KEY-HERE"
from openai.api_resources import embedding

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator

# Queries

In [65]:
# Search keywords: the base topic on its own plus five domain-specific variants.
keywords = [
    "Authorship Attribution",
    "Authorship Attribution Darknet",
    "Authorship Attribution Dark Web",
    "Authorship Attribution Cybercrime",
    "Authorship Attribution Human Trafficking",
    "Authorship Attribution Forensic Analysis",
]
keywords

['Authorship Attribution',
 'Authorship Attribution Darknet',
 'Authorship Attribution Dark Web',
 'Authorship Attribution Cybercrime',
 'Authorship Attribution Human Trafficking',
 'Authorship Attribution Forensic Analysis']

# Getting all the pdf documents

In [22]:
# Absolute path of every entry found in the local "pdfs" directory.
all_docs = [
    os.path.join(os.getcwd(), "pdfs", file_name)
    for file_name in os.listdir("pdfs/")
]

# Drop the documents flagged as faulty. The deletions run one after the other,
# so the second position refers to the list as it is after the first removal.
# NOTE(review): os.listdir gives no ordering guarantee, so these positions
# identify specific files only for the listing order of the original run.
del all_docs[55]
del all_docs[58]

# Getting list of all questions

The guidelines presented stem from the four foundational concepts outlined in our research paper on authorship attribution (AA). "Privacy" pertains to the principles of privacy and data protection, ensuring that individual rights are safeguarded. "Bias" addresses discrimination and unintended biases, highlighting the need for equitable and impartial AA applications. "Transparency" encompasses transparency and fairness, advocating for clear and open practices throughout the AA life cycle. Lastly, "Risk" examines the societal impact, focusing on the broader implications of AA research and its applications on society. These guidelines are crafted to facilitate the identification and balancing of the benefits and potential ethical issues inherent in AA research, promoting the responsible use of AA tools.

In [3]:
# Prompts sent to the language model, grouped by guideline category.
# Keys are the category names ("privacy", "bias", "transparency", "risk");
# each value is the ordered list of question prompts for that category.
# Every prompt ends with the same instruction: reply "Info not available" when
# the paper lacks the details, otherwise answer yes or no.
# NOTE(review): several prompts spell "Atttribution" with three t's. The text
# is left untouched here because it is the exact prompt sent to the model;
# correct it only if the answers are going to be regenerated.
# NOTE(review): some prompts are open-ended ("What ...", "how ...") while the
# closing instruction still requests a yes/no answer.
question_dictionary = {
    # Privacy and data protection (7 prompts).
    "privacy" :  [
    "Does the Authorship Atttribution research/application involve a high level of risk, necessitating a Data Protection Impact Assessment (DPIA)? High-risk scenarios may include biometric identification, law enforcement, or justice system usage. If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Does the Authorship Atttribution processing encompass extensive automated processing leading to decisions with legal or significant effects on individuals? Are measures in place to prevent identity disclosure, reputational damage, or legal consequences? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Is there a scientific purpose or objective justifying exemptions from GDPR provisions? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Is data processing confined to the original purpose for which it was collected? Is there periodic assessment and review to ensure ongoing relevance? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Does the dataset contain information enabling the identification of individuals? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Have adequate safeguards like anonymization, encryption, data minimization, and security procedures been implemented to minimize risks and protect individual rights? Are these measures in line with guidelines from research and academic organizations, with ethical oversight? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Is the information provided to individuals about data processing clear, complete, and correct? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
],
    
    # Discrimination and unintended bias (9 prompts).
    "bias" : [
    "Is there a specific label or target for each data instance, and how were these labels obtained? For manually annotated data, please provide details about the number of annotators, their backgrounds, and any measures taken to mitigate label bias. If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Does the training dataset sufficiently represent the entire authorship landscape, and what steps were taken to mitigate selection bias? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Are there correlations between authors in the dataset and specific demographic attributes or population characteristics? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Does the dataset cover multiple text genres or domains, and if not, what actions were taken to prevent biases related to domain and genre? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Is there a class imbalance in the dataset, and what measures were implemented to avoid over-representing certain authors? Describe any sampling strategies used to address potential sampling bias. If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "What feature extraction techniques were employed during training, and was fine-tuning performed on the target data? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "What precautions were taken to prevent overfitting and underfitting during model training? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Do the chosen evaluation metrics align with the primary task objectives, and what insights can be provided about model generalization and robustness? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Were independent blind assessments conducted by external evaluators, and can information about their backgrounds and diversity be provided? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. "
],
    
    # Transparency and fairness (a single prompt).
    "transparency" : ["Were any experiments conducted to gain insights into the model’s decision-making processes (XAI experiments)? If so, what were the key findings and outcomes of these experiments? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no."],
    
    # Societal impact and risk (8 prompts).
    "risk" : [
    "Is there a disclaimer to alert readers to potentially harmful content in the research? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Does the Authorship Atttribution processing encompass systematic and extensive automated processing that leads to decisions with legal or significant effects on individuals? Are measures in place to prevent identity disclosure, reputational damage, or legal ramifications? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Does the scope of the Authorship Atttribution model align with its intended purpose to minimize potential misuse? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Is there a mechanism to mitigate the risk of potential misuse and abuse, including scenarios involving targeted harassment, social engineering, or the creation of deceptive content falsely attributed to others? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Are there mechanisms for human oversight and intervention to review and reject content with ethical concerns? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "What measures are in place to minimize the potential trauma experienced by individuals during the design, development, and deployment stages? Are there regular check-ins among team members to ensure clear communication and provide essential support for maintaining a healthy and safe working environment? Is mental health and psychological support offered to team members dealing with harmful text? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Is there a routine schedule of audits and updates to the Authorship Atttribution model to anticipate and address potential ethical challenges? Do these audits and updates help ensure the model remains aligned with evolving ethical standards and societal expectations? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. ",
    "Are efficient algorithms and training strategies given priority to minimize the carbon footprint and energy consumption? Is carbon tracking employed to monitor and quantify carbon emissions during Authorship Atttribution model training, aiding in optimization and offsetting strategies? If the relevant details are not available, just say Info not available. Otherwise answer it as yes or no. "
]
}

To ensure clarity, it's important to define the responses "NA", "Yes", and "No" within the context of assessing research paper compliance with established responsible guidelines:

1. "NA" (Not Available): This response indicates that the information necessary to determine compliance is missing or not provided in the research paper.
2. "Yes": This response signifies that the research paper adheres to and is compliant with the established responsible guidelines.
3. "No": This response denotes that the research paper fails to meet or is not compliant with the established responsible guidelines.

These responses are used to quickly identify the status of a research paper's adherence to required compliance standards.

Please be aware that while we recognize the potential for enhancing the system's effectiveness by subdividing the transparency and fairness guideline into more detailed subquestions, we must adhere to the Program Committee's directive that prohibits altering or introducing new experiments at this stage. Consequently, we have chosen to maintain the current structure without modification. Given an opportunity, we are prepared to advance our system by refining the guidelines related to transparency and fairness for improved efficacy.

# Loading model

In [4]:
# Chat model that answers the guideline questions for each document.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo-1106",
    openai_api_key=openai.api_key,
)

  warn_deprecated(


# Generating embeddings

In [5]:
# Embedding model handed to the per-document vector index, using the same API key.
embeddings_model = OpenAIEmbeddings(
    openai_api_key=openai.api_key,
)

  warn_deprecated(


# Loading Documents

In [None]:
# Ask every guideline question against every PDF and collect the raw answers.
#
# `responses` maps each category name to a list holding one entry per
# processed document; each entry is that document's answers in question order.
#
# The cell is safe to re-run after an interruption: `responses` is created only
# when it does not exist yet, and processing continues with the first document
# that has no complete set of answers (instead of a hard-coded offset).

# Fail before spending any API calls if an unexpected category was added.
allowed_categories = {"privacy", "bias", "transparency", "risk"}
if not set(question_dictionary) <= allowed_categories:
    raise Exception("Question categories can only be amongst privacy, bias, transparency, and risk")

# Reuse answers from an earlier (interrupted) run when they are still in memory.
responses = globals().get("responses", {})
for category in question_dictionary:
    responses.setdefault(category, [])

# A document counts as done only if every category has an entry for it; trim
# any partial leftovers so all category lists stay aligned with `all_docs`.
processed_count = min(
    (len(responses[category]) for category in question_dictionary), default=0
)
for category in question_dictionary:
    del responses[category][processed_count:]

pending_docs = all_docs[processed_count:]

# The context manager closes the progress bar even if a request fails.
with tqdm(total=len(pending_docs)) as pbar:
    for position, doc in enumerate(pending_docs):
        # Before every 5th document of this run, pause for two minutes.
        if (position + 1) % 5 == 0:
            print("Taking a power nap for 2 mins. Kindly wait! ")
            time.sleep(120)  # Sleep for 120 seconds (2 minutes)

        loader = PyPDFLoader(doc)
        # Named `doc_index` so it does not shadow the loop counter.
        doc_index = VectorstoreIndexCreator(embedding=embeddings_model).from_loaders([loader])

        # Gather all answers for this document first, so a failure halfway
        # through never leaves the category lists with different lengths.
        doc_answers = {
            category: [doc_index.query(llm=llm, question=question) for question in questions]
            for category, questions in question_dictionary.items()
        }
        for category, answers in doc_answers.items():
            responses[category].append(answers)

        pbar.update(1)

# Unpack the responses into separate variables
privacy_response, bias_response, transparency_response, risk_response = (
    responses["privacy"], responses["bias"], responses["transparency"], responses["risk"])

# Converting data to pandas dataframe format

In [24]:
# One label per document path in `all_docs`: "DOC-0", "DOC-1", ...
column_names = [f"DOC-{position}" for position in range(len(all_docs))]

In [26]:
# Row labels "Q-0", "Q-1", ... for each category, one per prompt in that
# category's list inside `question_dictionary`.
privacy_questions = [
    f"Q-{number}" for number in range(len(question_dictionary['privacy']))
]

bias_questions = [
    f"Q-{number}" for number in range(len(question_dictionary['bias']))
]

transparency_questions = [
    f"Q-{number}" for number in range(len(question_dictionary['transparency']))
]

risk_questions = [
    f"Q-{number}" for number in range(len(question_dictionary['risk']))
]

# Creating privacy dataframe

In [46]:
# Build the privacy table: one column per document, holding that document's
# answers in question order, then transpose to one row per document.
privacy_df = pd.DataFrame(
    {column: privacy_response[position] for position, column in enumerate(column_names)}
)
privacy_df.index = privacy_questions
privacy_df = privacy_df.T

# Collapse the free-text answers into the labels "Yes", "No" and "NA".
privacy_df = privacy_df.replace("Info not available.", "NA")
privacy_df = privacy_df.replace(to_replace=r'^(Yes).*', value='Yes', regex=True)
# The word boundary stops answers that merely begin with the letters "No"
# ("Not ...", "None ...") from being counted as a "No".
privacy_df = privacy_df.replace(to_replace=r'^No\b.*', value='No', regex=True)
# Catch remaining "not available" phrasings (any capitalisation, with or
# without the trailing period), matching the rule the bias table applies.
privacy_df = privacy_df.replace(to_replace=r'(?i).*not available.*', value='NA', regex=True)

# Make sure the output folder exists before writing.
os.makedirs("data", exist_ok=True)
privacy_df.to_csv("data/privacy.csv")

In [56]:
privacy_df

Unnamed: 0,Q-0,Q-1,Q-2,Q-3,Q-4,Q-5,Q-6
DOC-0,,Yes,,,Yes,,
DOC-1,,Yes,,,Yes,,
DOC-2,,Yes,,,Yes,,
DOC-3,Yes,Yes,,,,,
DOC-4,Yes,Yes,,,,,
...,...,...,...,...,...,...,...
DOC-62,Yes,Yes,Yes,Yes,Yes,,No
DOC-63,Yes,Yes,Yes,Yes,Yes,,No
DOC-64,Yes,Yes,Yes,,Yes,,No
DOC-65,Yes,Yes,Yes,Yes,Yes,,Yes


# Creating bias dataframe

In [55]:
# Build the bias table: one column per document, holding that document's
# answers in question order, then transpose to one row per document.
bias_df = pd.DataFrame(
    {column: bias_response[position] for position, column in enumerate(column_names)}
)
bias_df.index = bias_questions
bias_df = bias_df.T

# Collapse the free-text answers into the labels "Yes", "No" and "NA".
bias_df = bias_df.replace("Info not available.", "NA")
bias_df = bias_df.replace(to_replace=r'^(Yes).*', value='Yes', regex=True)
# The word boundary stops answers that merely begin with the letters "No"
# ("Not available ...", "None ...") from being turned into "No" before the
# "not available" rule below has a chance to see them.
bias_df = bias_df.replace(to_replace=r'^No\b.*', value='No', regex=True)
# Catch remaining "not available" phrasings regardless of capitalisation.
bias_df = bias_df.replace(to_replace=r'(?i).*not available.*', value='NA', regex=True)

# Make sure the output folder exists before writing.
os.makedirs("data", exist_ok=True)
bias_df.to_csv("data/bias.csv")

In [57]:
bias_df

Unnamed: 0,Q-0,Q-1,Q-2,Q-3,Q-4,Q-5,Q-6,Q-7,Q-8
DOC-0,Yes,Yes,,Yes,Yes,,Yes,Yes,
DOC-1,Yes,Yes,,Yes,Yes,,Yes,,
DOC-2,Yes,Yes,,Yes,Yes,,Yes,,
DOC-3,,,,,,,,,
DOC-4,Yes,,,Yes,Yes,,,,
...,...,...,...,...,...,...,...,...,...
DOC-62,Yes,Yes,Yes,,Yes,,Yes,,
DOC-63,Yes,Yes,Yes,,Yes,,Yes,,
DOC-64,Yes,Yes,Yes,,Yes,,Yes,,
DOC-65,Yes,Yes,Yes,,Yes,,Yes,,


# Creating transparency dataframe

In [58]:
# Build the transparency table: one column per document, holding that
# document's answers in question order, then transpose to one row per document.
transparency_df = pd.DataFrame(
    {column: transparency_response[position] for position, column in enumerate(column_names)}
)
transparency_df.index = transparency_questions
transparency_df = transparency_df.T

# Collapse the free-text answers into the labels "Yes", "No" and "NA".
transparency_df = transparency_df.replace("Info not available.", "NA")
transparency_df = transparency_df.replace(to_replace=r'^(Yes).*', value='Yes', regex=True)
# The word boundary stops answers that merely begin with the letters "No"
# ("Not ...", "None ...") from being counted as a "No".
transparency_df = transparency_df.replace(to_replace=r'^No\b.*', value='No', regex=True)
# Catch remaining "not available" phrasings (any capitalisation, with or
# without the trailing period), matching the rule the bias table applies.
transparency_df = transparency_df.replace(to_replace=r'(?i).*not available.*', value='NA', regex=True)

# Make sure the output folder exists before writing.
os.makedirs("data", exist_ok=True)
transparency_df.to_csv("data/transparency.csv")

In [63]:
transparency_df

Unnamed: 0,Q-0
DOC-0,Yes
DOC-1,Yes
DOC-2,Yes
DOC-3,Yes
DOC-4,Yes
...,...
DOC-62,Yes
DOC-63,Yes
DOC-64,Yes
DOC-65,Yes


# Creating Risk dataframe

In [62]:
# Build the risk table: one column per document, holding that document's
# answers in question order, then transpose to one row per document.
risk_df = pd.DataFrame(
    {column: risk_response[position] for position, column in enumerate(column_names)}
)
risk_df.index = risk_questions
risk_df = risk_df.T

# Collapse the free-text answers into the labels "Yes", "No" and "NA".
risk_df = risk_df.replace("Info not available.", "NA")
risk_df = risk_df.replace(to_replace=r'^(Yes).*', value='Yes', regex=True)
# The word boundary stops answers that merely begin with the letters "No"
# ("Not ...", "None ...") from being counted as a "No".
risk_df = risk_df.replace(to_replace=r'^No\b.*', value='No', regex=True)
# Catch remaining "not available" phrasings (any capitalisation, with or
# without the trailing period), matching the rule the bias table applies.
risk_df = risk_df.replace(to_replace=r'(?i).*not available.*', value='NA', regex=True)

# Make sure the output folder exists before writing.
os.makedirs("data", exist_ok=True)
risk_df.to_csv("data/risk.csv")

In [64]:
risk_df

Unnamed: 0,Q-0,Q-1,Q-2,Q-3,Q-4,Q-5,Q-6,Q-7
DOC-0,Yes,,Yes,,,,,
DOC-1,Yes,,Yes,Yes,,,,
DOC-2,Yes,,Yes,Yes,Yes,,,
DOC-3,,,,,,,,
DOC-4,,,Yes,,,,,
...,...,...,...,...,...,...,...,...
DOC-62,Yes,Yes,Yes,Yes,Yes,,,
DOC-63,Yes,Yes,Yes,Yes,Yes,,,
DOC-64,Yes,Yes,Yes,Yes,Yes,,,
DOC-65,Yes,Yes,Yes,Yes,Yes,,,


# Calculate data stats

In [20]:
# Loading data
#
# keep_default_na=False keeps the literal "NA" label as a string. With the
# pandas default, "NA" is parsed as a missing value (NaN), which blanks the
# cells and puts NaN keys into the answer counts.
# The first CSV column (written without a header, hence 'Unnamed: 0') holds the
# document labels and becomes the index.
privacy_df = pd.read_csv("data/privacy.csv", keep_default_na=False).set_index('Unnamed: 0')
bias_df = pd.read_csv("data/bias.csv", keep_default_na=False).set_index('Unnamed: 0')
transparency_df = pd.read_csv("data/transparency.csv", keep_default_na=False).set_index('Unnamed: 0')
risk_df = pd.read_csv("data/risk.csv", keep_default_na=False).set_index('Unnamed: 0')

In [27]:
def generate_stats(df):
    """Count how often each cell value occurs across all columns of *df*.

    Returns a plain dict mapping every distinct value to its total number of
    occurrences over all columns, with keys in first-seen order.
    """
    totals = Counter()
    for column_name in df.columns:
        # Add this column's per-value counts to the running totals.
        totals.update(df[column_name])
    return dict(totals)

In [28]:
# Share of "Yes" answers among all answers in the privacy table.
privacy_stat = generate_stats(privacy_df)
print("Privacy Compliance Ratio:", privacy_stat['Yes'] / sum(privacy_stat.values()))

Privacy Compliance Ratio: 0.48187633262260127


In [30]:
# Count every answer in the bias table.
bias_stat = generate_stats(bias_df)
# Share of "Yes" answers among all answers in the bias table.
print("Bias Compliance Ratio:", bias_stat['Yes'] / sum(bias_stat.values()))

Bias Compliance Ratio: 0.5008291873963516


In [31]:
# Count every answer in the transparency table.
transparency_stat = generate_stats(transparency_df)
# Share of "Yes" answers among all answers in the transparency table.
print("Transparency Compliance Ratio:", transparency_stat['Yes'] / sum(transparency_stat.values()))

Transparency Compliance Ratio: 1.0


In [32]:
# Count every answer in the risk table.
risk_stat = generate_stats(risk_df)
# Share of "Yes" answers among all answers in the risk table.
print("Risk Compliance Ratio:", risk_stat['Yes'] / sum(risk_stat.values()))

Risk Compliance Ratio: 0.36007462686567165
