In [None]:
import requests
import csv
import time

"""
Script used to find entries of Stack Exchange sites that contain the acronym "IANAL" and
save them to a CSV file. The script searches for questions and answers containing "IANAL"
before saving the data.
"""

# --- Settings ---------------------------------------------------------------
# Stack Exchange site to query; change to another Stack Exchange site as needed.
SITE = "softwareengineering"
# CSV file the collected posts are written to.
OUTPUT_FILE = "ianal_posts.csv"

# --- Stack Exchange API v2.3 endpoints --------------------------------------
# "{id}" is a str.format placeholder that the script fills with a post id.
_API_ROOT = "https://api.stackexchange.com/2.3"
QUESTION_SEARCH_URL = _API_ROOT + "/search/advanced"
QUESTION_ANSWERS_URL = _API_ROOT + "/questions/{id}/answers"
ANSWER_DETAIL_URL = _API_ROOT + "/answers/{id}"
QUESTION_DETAIL_URL = _API_ROOT + "/questions/{id}"
EXCERPT_SEARCH_URL = _API_ROOT + "/search/excerpts"

# Seconds to wait for any single HTTP response. Without a timeout a stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30


def _get_json(url, params):
    """GET `url` with `params` and return the decoded Stack Exchange JSON wrapper.

    When the wrapper carries error fields they are printed, so an API failure
    is distinguishable from a legitimately empty result. When the wrapper
    carries a `backoff` value, the function sleeps that many seconds before
    returning, as the API's throttling rules require.

    Network errors (including timeouts) and undecodable bodies propagate to
    the caller.
    """
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    payload = response.json()
    if 'error_id' in payload:
        print(f"API error {payload.get('error_id')} ({payload.get('error_name')}): "
              f"{payload.get('error_message')}")
    backoff = payload.get('backoff')
    if backoff:
        time.sleep(backoff)
    return payload


# We'll track answer IDs that we have already processed so we don't duplicate them.
processed_answer_ids = set()
# Parent questions fetched during part 2, keyed by question id, so several
# matching answers to the same question cost only one question lookup.
question_cache = {}

# Open the CSV file for writing.
with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # CSV header row
    writer.writerow(["post_type", "post_id", "creation_date", "question_id", "question_title", "question_body", "answer_body"])

    ###########################################################################
    # PART 1: Process questions that contain IANAL using /search/advanced
    ###########################################################################
    print("Processing questions that contain IANAL...")
    page = 1
    while True:
        params = {
            'q': 'IANAL',       # Search term
            'site': SITE,
            'pagesize': 100,
            'page': page,
            'filter': 'withbody'  # Built-in filter to include the full body
        }
        data = _get_json(QUESTION_SEARCH_URL, params)
        if 'items' not in data:
            print("No more question items or an error occurred.")
            break

        items = data['items']
        print(f"Page {page} | Returned Items: {len(items)} | Has More: {data.get('has_more')}")

        for question in items:
            question_body = question.get('body', '')
            # Double-check that the question body contains "IANAL" (case-insensitive)
            if "ianal" not in question_body.lower():
                continue

            question_id = question.get('question_id')
            creation_date = question.get('creation_date')
            title = question.get('title')
            # Write a row for the question itself
            writer.writerow(["question", question_id, creation_date, question_id, title, question_body, ""])

            # Retrieve the answers for this question (first 100 only; no paging).
            ans_params = {
                'site': SITE,
                'pagesize': 100,
                'filter': 'withbody'
            }
            ans_data = _get_json(QUESTION_ANSWERS_URL.format(id=question_id), ans_params)

            # Keep every answer that also contains "IANAL".
            for answer in ans_data.get('items', []):
                answer_body = answer.get('body', '')
                if "ianal" not in answer_body.lower():
                    continue
                answer_id = answer.get('answer_id')
                # Only add if not already processed
                if answer_id in processed_answer_ids:
                    continue
                writer.writerow(["answer", answer_id, answer.get('creation_date'), question_id, title, question_body, answer_body])
                processed_answer_ids.add(answer_id)

            # Pause briefly between question and its answers
            time.sleep(0.2)
        # End processing of one page of questions

        if not data.get('has_more'):
            break

        page += 1
        time.sleep(0.5)

    ##################################################################################
    # PART 2: Process answers that contain IANAL (but whose parent question did not)
    #         using /search/excerpts to catch answers that weren't returned in part 1.
    ##################################################################################
    print("Processing answers that contain IANAL...")
    page = 1
    while True:
        params = {
            'q': 'IANAL',
            'site': SITE,
            'pagesize': 100,
            'page': page
        }
        data = _get_json(EXCERPT_SEARCH_URL, params)
        if 'items' not in data:
            print("No more excerpt items or an error occurred.")
            break

        items = data['items']
        print(f"Page {page} | Returned Items: {len(items)} | Has More: {data.get('has_more')}")

        for item in items:
            # We're only interested in answers from the excerpt search.
            if item.get('item_type') != 'answer':
                continue

            # The excerpt item's `answer_id` field identifies the answer.
            answer_id = item.get('answer_id')
            # Skip if we've already processed this answer.
            if answer_id is None or answer_id in processed_answer_ids:
                continue

            # Fetch the full answer details to get the complete body.
            ans_params = {
                'site': SITE,
                'filter': 'withbody'
            }
            ans_detail_data = _get_json(ANSWER_DETAIL_URL.format(id=answer_id), ans_params)
            # Pause after every answer lookup, including ones rejected below,
            # so consecutive rejections never issue back-to-back requests.
            time.sleep(0.2)

            answer_items = ans_detail_data.get('items') or []
            if not answer_items:
                continue
            answer_detail = answer_items[0]
            answer_body = answer_detail.get('body', '')
            # Ensure the answer body indeed contains IANAL.
            if "ianal" not in answer_body.lower():
                continue
            creation_date_ans = answer_detail.get('creation_date')
            question_id = answer_detail.get('question_id')

            # Fetch the parent question for context, reusing an earlier lookup
            # when possible. Only successful lookups are cached, so a transient
            # failure is retried for the next answer to the same question.
            question_detail = question_cache.get(question_id)
            if question_detail is None:
                quest_params = {
                    'site': SITE,
                    'filter': 'withbody'
                }
                quest_detail_data = _get_json(QUESTION_DETAIL_URL.format(id=question_id), quest_params)
                quest_items = quest_detail_data.get('items') or []
                if quest_items:
                    question_detail = quest_items[0]
                    question_cache[question_id] = question_detail
            if question_detail is None:
                continue

            title = question_detail.get('title')
            question_body = question_detail.get('body')
            # Write a row for this answer along with its question context.
            writer.writerow(["answer", answer_id, creation_date_ans, question_id, title, question_body, answer_body])
            processed_answer_ids.add(answer_id)
        # End processing one page of excerpts

        if not data.get('has_more'):
            break

        page += 1
        time.sleep(0.5)

print("Data saved to", OUTPUT_FILE)


Processing questions that contain IANAL...
Page 1 | Returned Items: 12 | Has More: False
Processing answers that contain IANAL...
Page 1 | Returned Items: 100 | Has More: True
Page 2 | Returned Items: 100 | Has More: True


KeyboardInterrupt: 

In [None]:
import requests
import csv
import time

"""
Script used to find entries of Stack Exchange sites that contain the acronym "IANAL" and
save them to a CSV file. The script searches for questions and answers containing "IANAL"
before saving the data.
"""

import os

# Configuration
SITE = 'softwareengineering'  # Change to another Stack Exchange site as needed
OUTPUT_FILE = 'ianal_posts.csv'
# The API key is read from the STACKEXCHANGE_API_KEY environment variable rather
# than being hard-coded in the notebook. It is None when the variable is unset
# or empty, in which case add_api_key() leaves requests unkeyed.
API_KEY = os.environ.get("STACKEXCHANGE_API_KEY") or None

# API endpoints ("{id}" is a str.format placeholder for a question/answer id)
QUESTION_SEARCH_URL = "https://api.stackexchange.com/2.3/search/advanced"
QUESTION_ANSWERS_URL = "https://api.stackexchange.com/2.3/questions/{id}/answers"
ANSWER_DETAIL_URL = "https://api.stackexchange.com/2.3/answers/{id}"
QUESTION_DETAIL_URL = "https://api.stackexchange.com/2.3/questions/{id}"
EXCERPT_SEARCH_URL = "https://api.stackexchange.com/2.3/search/excerpts"

def add_api_key(params):
    """Attach the configured API key to a request-parameter dict.

    When the module-level API_KEY is truthy, `params['key']` is set to it
    (the dict is modified in place). When API_KEY is empty or unset, `params`
    is left untouched.

    Returns:
        The same `params` object that was passed in.
    """
    if not API_KEY:
        # No key configured: send the request unkeyed.
        return params
    params["key"] = API_KEY
    return params

# Seconds to wait for any single HTTP response. Without a timeout a stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30


def _get_json(url, params):
    """GET `url` with `params` and return the decoded Stack Exchange JSON wrapper.

    When the wrapper carries error fields they are printed, so an API failure
    is distinguishable from a legitimately empty result. When the wrapper
    carries a `backoff` value, the function sleeps that many seconds before
    returning, as the API's throttling rules require.

    Network errors (including timeouts) and undecodable bodies propagate to
    the caller.
    """
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    payload = response.json()
    if 'error_id' in payload:
        print(f"API error {payload.get('error_id')} ({payload.get('error_name')}): "
              f"{payload.get('error_message')}")
    backoff = payload.get('backoff')
    if backoff:
        time.sleep(backoff)
    return payload


# We'll track answer IDs that we have already processed so we don't duplicate them.
processed_answer_ids = set()
# Parent questions fetched during part 2, keyed by question id, so several
# matching answers to the same question cost only one question lookup.
question_cache = {}

# Open the CSV file for writing.
with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # CSV header row
    writer.writerow(["post_type", "post_id", "creation_date", "question_id", "question_title", "question_body", "answer_body"])

    ###########################################################################
    # PART 1: Process questions that contain IANAL using /search/advanced
    ###########################################################################
    print("Processing questions that contain IANAL...")
    page = 1
    while True:
        params = add_api_key({
            'q': 'IANAL',       # Search term
            'site': SITE,
            'pagesize': 100,
            'page': page,
            'filter': 'withbody'  # Built-in filter to include the full body
        })
        data = _get_json(QUESTION_SEARCH_URL, params)
        if 'items' not in data:
            print("No more question items or an error occurred.")
            break

        items = data['items']
        print(f"Page {page} | Returned Items: {len(items)} | Has More: {data.get('has_more')}")

        for question in items:
            question_body = question.get('body', '')
            # Double-check that the question body contains "IANAL" (case-insensitive)
            if "ianal" not in question_body.lower():
                continue

            question_id = question.get('question_id')
            creation_date = question.get('creation_date')
            title = question.get('title')
            # Write a row for the question itself
            writer.writerow(["question", question_id, creation_date, question_id, title, question_body, ""])

            # Retrieve the answers for this question (first 100 only; no paging).
            ans_params = add_api_key({
                'site': SITE,
                'pagesize': 100,
                'filter': 'withbody'
            })
            ans_data = _get_json(QUESTION_ANSWERS_URL.format(id=question_id), ans_params)

            # Keep every answer that also contains "IANAL".
            for answer in ans_data.get('items', []):
                answer_body = answer.get('body', '')
                if "ianal" not in answer_body.lower():
                    continue
                answer_id = answer.get('answer_id')
                # Only add if not already processed
                if answer_id in processed_answer_ids:
                    continue
                writer.writerow(["answer", answer_id, answer.get('creation_date'), question_id, title, question_body, answer_body])
                processed_answer_ids.add(answer_id)

            # Pause briefly between question and its answers
            time.sleep(0.2)
        # End processing of one page of questions

        if not data.get('has_more'):
            break

        page += 1
        time.sleep(0.5)

    ##################################################################################
    # PART 2: Process answers that contain IANAL (but whose parent question did not)
    #         using /search/excerpts to catch answers that weren't returned in part 1.
    ##################################################################################
    print("Processing answers that contain IANAL...")
    page = 1
    while True:
        params = add_api_key({
            'q': 'IANAL',
            'site': SITE,
            'pagesize': 100,
            'page': page
        })
        data = _get_json(EXCERPT_SEARCH_URL, params)
        if 'items' not in data:
            print("No more excerpt items or an error occurred.")
            break

        items = data['items']
        print(f"Page {page} | Returned Items: {len(items)} | Has More: {data.get('has_more')}")

        for item in items:
            # We're only interested in answers from the excerpt search.
            if item.get('item_type') != 'answer':
                continue

            # The excerpt item's `answer_id` field identifies the answer.
            answer_id = item.get('answer_id')
            # Skip if we've already processed this answer.
            if answer_id is None or answer_id in processed_answer_ids:
                continue

            # Fetch the full answer details to get the complete body.
            ans_params = add_api_key({
                'site': SITE,
                'filter': 'withbody'
            })
            ans_detail_data = _get_json(ANSWER_DETAIL_URL.format(id=answer_id), ans_params)
            # Pause after every answer lookup, including ones rejected below,
            # so consecutive rejections never issue back-to-back requests.
            time.sleep(0.2)

            answer_items = ans_detail_data.get('items') or []
            if not answer_items:
                continue
            answer_detail = answer_items[0]
            answer_body = answer_detail.get('body', '')
            # Ensure the answer body indeed contains IANAL.
            if "ianal" not in answer_body.lower():
                continue
            creation_date_ans = answer_detail.get('creation_date')
            question_id = answer_detail.get('question_id')

            # Fetch the parent question for context, reusing an earlier lookup
            # when possible. Only successful lookups are cached, so a transient
            # failure is retried for the next answer to the same question.
            question_detail = question_cache.get(question_id)
            if question_detail is None:
                quest_params = add_api_key({
                    'site': SITE,
                    'filter': 'withbody'
                })
                quest_detail_data = _get_json(QUESTION_DETAIL_URL.format(id=question_id), quest_params)
                quest_items = quest_detail_data.get('items') or []
                if quest_items:
                    question_detail = quest_items[0]
                    question_cache[question_id] = question_detail
            if question_detail is None:
                continue

            title = question_detail.get('title')
            question_body = question_detail.get('body')
            # Write a row for this answer along with its question context.
            writer.writerow(["answer", answer_id, creation_date_ans, question_id, title, question_body, answer_body])
            processed_answer_ids.add(answer_id)
        # End processing one page of excerpts

        if not data.get('has_more'):
            break

        page += 1
        time.sleep(0.5)

print("Data saved to", OUTPUT_FILE)


Processing questions that contain IANAL...
Page 1 | Returned Items: 12 | Has More: False
Processing answers that contain IANAL...
Page 1 | Returned Items: 100 | Has More: True
Page 2 | Returned Items: 100 | Has More: True
Page 3 | Returned Items: 76 | Has More: False
Data saved to ianal_posts.csv


In [None]:
import requests
import csv
import time

"""
Script used to find entries of Stack Exchange sites that contain the acronym "IANAL" and
retrieve all answers for those questions, along with upvotes and number of answers per question.
"""

import os

# Configuration
SITE = 'devops'  # Change to another Stack Exchange site as needed
OUTPUT_FILE = 'devops.csv'
# Optional API key (higher rate limits), read from the STACKEXCHANGE_API_KEY
# environment variable rather than being hard-coded in the notebook. It is None
# when the variable is unset or empty; requests omits query parameters whose
# value is None, so the 'key' parameter is then simply not sent.
API_KEY = os.environ.get("STACKEXCHANGE_API_KEY") or None

# API endpoints ("{id}" is a str.format placeholder for a question id)
ANSWER_SEARCH_URL = "https://api.stackexchange.com/2.3/search/excerpts"
QUESTION_ANSWERS_URL = "https://api.stackexchange.com/2.3/questions/{id}/answers"
QUESTION_DETAIL_URL = "https://api.stackexchange.com/2.3/questions/{id}"

def make_request(url, params, retries=3, timeout=30):
    """Make a Stack Exchange API request with retries and error handling.

    Args:
        url: Endpoint to GET.
        params: Query-string parameters handed to ``requests.get``.
        retries: Maximum number of attempts (default 3).
        timeout: Seconds to wait for the server on each attempt (default 30).
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded JSON body on HTTP 200, or ``None`` when the API answers
        with a non-retryable error or every attempt fails.

    Retried conditions: network errors, undecodable bodies, HTTP 429, and the
    API's own throttle error (``error_id`` 502 in the JSON body). When a
    successful response carries a ``backoff`` value, the function sleeps that
    many seconds before returning so the next call respects the API's request.
    """
    for attempt in range(1, retries + 1):
        is_last_attempt = attempt == retries

        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            if not is_last_attempt:
                time.sleep(2)
            continue

        # Decode once up front: error responses also carry a JSON body whose
        # fields tell us whether the failure is a throttle and why it happened.
        try:
            payload = response.json()
        except ValueError:
            payload = None

        if response.status_code == 200:
            if payload is None:
                print("Error: response body was not valid JSON")
                if not is_last_attempt:
                    time.sleep(2)
                continue
            backoff = payload.get('backoff') if isinstance(payload, dict) else None
            if backoff:
                print(f"API requested a backoff of {backoff} seconds...")
                time.sleep(backoff)
            return payload

        error = payload if isinstance(payload, dict) else {}
        if response.status_code == 429 or error.get('error_id') == 502:
            print("Rate limit exceeded. Waiting before retrying...")
            # No point sleeping when there is no further attempt to make.
            if not is_last_attempt:
                time.sleep(5)
            continue

        print(f"Error: Received status code {response.status_code}")
        if error.get('error_message'):
            print(f"API error {error.get('error_id')} ({error.get('error_name')}): {error['error_message']}")
        return None

    return None

# Questions already written to the CSV. A question whose answers match on
# several result pages must still produce exactly one row.
seen_question_ids = set()

with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["question_id", "question_title", "question_body", "num_answers", "answers_with_upvotes", "majority"])

    print("Searching for answers that explicitly contain 'IANAL'...")
    page = 1
    while True:
        # NOTE(review): 'filter' and 'item_type' are sent unchanged from the
        # original request; confirm /search/excerpts honours both. Items are
        # filtered by item_type again on the client side below regardless.
        params = {
            'q': 'IANAL',
            'site': SITE,
            'pagesize': 100,
            'page': page,
            'filter': 'withbody',
            'item_type': 'answer',
            'key': API_KEY
        }
        data = make_request(ANSWER_SEARCH_URL, params)
        if not data or 'items' not in data:
            break

        # Parent questions of the matching answers on this page, minus the
        # questions already handled on earlier pages.
        question_ids = {
            item.get('question_id')
            for item in data['items']
            if item.get('item_type') == 'answer' and item.get('question_id') is not None
        }
        question_ids -= seen_question_ids

        for question_id in question_ids:
            # Fetch question details
            quest_params = {'site': SITE, 'filter': 'withbody', 'key': API_KEY}
            quest_data = make_request(QUESTION_DETAIL_URL.format(id=question_id), quest_params)

            # Fetch the answers for the question (first 100 only; no paging).
            ans_params = {'site': SITE, 'pagesize': 100, 'filter': 'withbody', 'key': API_KEY}
            ans_data = make_request(QUESTION_ANSWERS_URL.format(id=question_id), ans_params)

            # An empty 'items' list for the question (not just a missing key)
            # must also be skipped, otherwise indexing [0] below would raise.
            if not quest_data or not quest_data.get('items') or not ans_data or 'items' not in ans_data:
                continue

            question_detail = quest_data['items'][0]
            question_title = question_detail.get('title', '')
            question_body = question_detail.get('body', '')
            answers = ans_data['items']
            num_answers = len(answers)

            # Store all answers and their upvotes in a structured format
            answers_with_upvotes = []
            # Score of the best answer seen so far. None (rather than 0) means
            # "no answer seen yet", so a top answer scoring 0 or below is still
            # recognised as the top answer.
            top_score = None
            top_answer_mentions_ianal = False

            for answer in answers:
                answer_id = answer.get('answer_id')
                answer_body = answer.get('body', '').replace('\n', ' ').replace('\r', ' ')
                upvotes = answer.get('score', 0)
                answers_with_upvotes.append(f"[Answer ID: {answer_id}, Upvotes: {upvotes}] {answer_body}")

                # Track the highest-scoring answer; on ties the first one wins.
                if top_score is None or upvotes > top_score:
                    top_score = upvotes
                    top_answer_mentions_ianal = "ianal" in answer_body.lower()

            # "majority" is NA for single-answer questions, otherwise whether
            # the highest-scoring answer contains "IANAL".
            if num_answers == 1:
                majority = "NA"
            else:
                majority = "True" if top_answer_mentions_ianal else "False"

            writer.writerow([question_id, question_title, question_body, num_answers, " | ".join(answers_with_upvotes), majority])
            seen_question_ids.add(question_id)

            time.sleep(0.2)

        if not data.get('has_more'):
            break

        page += 1
        time.sleep(1)

print(f"Data saved to {OUTPUT_FILE}")


Searching for answers that explicitly contain 'IANAL'...
Data saved to devops.csv


In [None]:
import os
import pandas as pd

def compile_csv_files(directory_path, output_file="uvcompiled.csv"):
    """Concatenate every ``.csv`` file in a directory into one CSV file.

    Args:
        directory_path: Folder whose ``.csv`` files are read.
        output_file: Path of the combined CSV to write.

    Raises:
        ValueError: If the directory contains no CSV files to combine.

    Files are read in sorted name order so the combined row order is the same
    on every run. If ``output_file`` itself lives inside ``directory_path`` it
    is excluded from the inputs, so re-running does not fold the previous
    result back into the new one.
    """
    output_path = os.path.abspath(output_file)

    # List all CSV files in the provided directory, leaving out the output file.
    csv_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.endswith(".csv")
        and os.path.abspath(os.path.join(directory_path, name)) != output_path
    )
    if not csv_files:
        raise ValueError(f"No CSV files found in '{directory_path}'.")

    # Read each CSV file into its own DataFrame.
    dataframes = [pd.read_csv(os.path.join(directory_path, name)) for name in csv_files]

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save the combined DataFrame to a new CSV file
    combined_df.to_csv(output_file, index=False)
    print(f"Compiled {len(csv_files)} CSV files into '{output_file}'.")


if __name__ == "__main__":
    # Folder containing the CSV files to merge; point this at your own folder.
    directory = "/content/uv"
    compile_csv_files(directory_path=directory, output_file="uvcompiled.csv")


Compiled 11 CSV files into 'uvcompiled.csv'.


In [None]:
import pandas as pd

# Load the dataset (Replace with your actual file path if running locally)
file_path = "uvcompiled.csv"
df = pd.read_csv(file_path)

# Name of the column to tally; adjust if your dataset uses a different name.
# Keeping it in one variable ensures the lookup and the "not found" message
# always refer to the same column.
column = "majority"

# Check if the column exists
if column in df.columns:
    flags = df[column]
    # Count occurrences of True, False, and NA values. `== True` / `== False`
    # are deliberate element-wise comparisons on the Series (not identity
    # checks), so missing entries match neither and are counted by isna().
    true_count = (flags == True).sum()
    false_count = (flags == False).sum()
    na_count = flags.isna().sum()

    # Print the results
    print(f"True Count: {true_count}")
    print(f"False Count: {false_count}")
    print(f"NA Count: {na_count}")

else:
    print(f"Column '{column}' not found in dataset. Please check the column names.")


True Count: 128
False Count: 214
NA Count: 180
