# Imports

In [None]:
import os
import re
import time

import pandas as pd
import requests
from openpyxl import load_workbook


# First Data Collection Run (failed because the API quota was exhausted)

In [None]:

# Query settings; BASE_URL and EXCEL_FILE are reused by the later collection cells.
API_KEY = "rl_iqe6KXFPAWpJYip5t6ZyHpq6H"  # Your API key
BASE_URL = "https://api.stackexchange.com/2.3/questions"
EXCEL_FILE = "nlp_stackoverflow_dataset.xlsx"
posts = []

# Load existing data if file exists
if os.path.exists(EXCEL_FILE):
    existing_df = pd.read_excel(EXCEL_FILE, sheet_name="Raw_Data")
    posts = existing_df.to_dict("records")
    # Pages hold 100 questions each, so resume on the first page that is not
    # fully covered by the rows already saved.
    # NOTE(review): when the saved row count is not a multiple of 100 the
    # resume page overlaps rows already on disk, so duplicates are possible.
    start_page = (len(posts) // 100) + 1
    print(f"Resuming from page {start_page}, {len(posts)} posts already collected")
else:
    start_page = 1

# Collect posts
page = start_page
limit_reached = False
while len(posts) < 20000 and not limit_reached:
    params = {
        "page": page,
        "pagesize": 100,
        "order": "desc",
        "sort": "creation",
        "tagged": "nlp",
        "site": "stackoverflow",
        "key": API_KEY,
        "filter": "withbody"
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()

    if "error_id" in data:  # Handle quota errors
        print(f"Error at page {page}: {data}")
        limit_reached = True
        break
    if "items" not in data:
        print(f"Error at page {page}: {data}")
        break
    if not data["items"]:
        print("No more posts available.")
        break

    # Most recent quota figure reported by the API. It is refreshed after every
    # answers request because each question on the page costs one more request.
    quota_remaining = data.get("quota_remaining")

    for item in data["items"]:
        post_id = item["question_id"]
        accepted_answer_id = item.get("accepted_answer_id", None)
        accepted_answer_text = "No accepted answer"
        more_answers_text = "No additional answers"

        answers_url = f"https://api.stackexchange.com/2.3/questions/{post_id}/answers"
        answers_params = {
            "site": "stackoverflow",
            "key": API_KEY,
            "filter": "withbody",
            "order": "desc",
            "sort": "votes",
            # Request the largest page so the accepted answer is less likely to
            # fall outside the single response that is inspected.
            "pagesize": 100
        }
        answers_response = requests.get(answers_url, params=answers_params)
        answers_data = answers_response.json()

        if "error_id" in answers_data:
            print(f"Error fetching answers for post {post_id}: {answers_data}")
            limit_reached = True
            break
        if "items" not in answers_data:
            print(f"Error fetching answers for post {post_id}: {answers_data}")
            continue
        quota_remaining = answers_data.get("quota_remaining", quota_remaining)

        # Answers arrive sorted by votes. Keep the accepted answer and the
        # highest-voted other answer. The scan must not stop at the first
        # non-accepted answer, because the accepted one may be ranked lower.
        found_accepted = False
        found_other = False
        for answer in answers_data["items"]:
            if answer["answer_id"] == accepted_answer_id:
                accepted_answer_text = answer["body"]
                found_accepted = True
            elif not found_other:
                more_answers_text = answer["body"]
                found_other = True
            if found_other and (found_accepted or accepted_answer_id is None):
                break

        posts.append({
            "title": item["title"],
            "description": item["body"],
            "tags": ";".join(item["tags"]),
            "accepted_answer": accepted_answer_text,
            "more_accepted_answers": more_answers_text
        })

    print(f"Collected page {page}, total posts: {len(posts)}")
    page += 1
    time.sleep(1)

    # Save after every page so an interrupted run loses at most one page.
    initial_df = pd.DataFrame(posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
    with pd.ExcelWriter(EXCEL_FILE, engine="xlsxwriter") as writer:
        initial_df.to_excel(writer, sheet_name="Raw_Data", index=False)
    print(f"Saved page {page - 1} to {EXCEL_FILE}")

    # Stop before the next page if the latest reported quota is too low.
    if quota_remaining is not None and quota_remaining < 100:
        print(f"Quota nearly exhausted ({quota_remaining} remaining), moving to preprocessing...")
        limit_reached = True
        break

# Final save
initial_df = pd.DataFrame(posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
with pd.ExcelWriter(EXCEL_FILE, engine="xlsxwriter") as writer:
    initial_df.to_excel(writer, sheet_name="Raw_Data", index=False)
print(f"Collected {len(initial_df)} posts, saved to {EXCEL_FILE}")

Collected page 1, total posts: 100
Saved page 1 to nlp_stackoverflow_dataset.xlsx
Collected page 2, total posts: 200
Saved page 2 to nlp_stackoverflow_dataset.xlsx
Collected page 3, total posts: 300
Saved page 3 to nlp_stackoverflow_dataset.xlsx
Collected page 4, total posts: 400
Saved page 4 to nlp_stackoverflow_dataset.xlsx
Collected page 5, total posts: 500
Saved page 5 to nlp_stackoverflow_dataset.xlsx
Collected page 6, total posts: 600
Saved page 6 to nlp_stackoverflow_dataset.xlsx
Collected page 7, total posts: 700
Saved page 7 to nlp_stackoverflow_dataset.xlsx
Collected page 8, total posts: 800
Saved page 8 to nlp_stackoverflow_dataset.xlsx
Collected page 9, total posts: 900
Saved page 9 to nlp_stackoverflow_dataset.xlsx
Collected page 10, total posts: 1000
Saved page 10 to nlp_stackoverflow_dataset.xlsx
Collected page 11, total posts: 1100
Saved page 11 to nlp_stackoverflow_dataset.xlsx
Collected page 12, total posts: 1200
Saved page 12 to nlp_stackoverflow_dataset.xlsx
Collect

# Second Data Collection Run (failed with an IllegalCharacterError)

In [None]:
API_KEY = "rl_kUCy4GWQMRSP2AHqViJ1AsvUU"  # new API

# Load existing data if file exists
if os.path.exists(EXCEL_FILE):
    existing_df = pd.read_excel(EXCEL_FILE, sheet_name="Raw_Data")
    posts = existing_df.to_dict("records")
    start_page = (len(posts) // 100) + 1  # Resume from next page
    print(f"Resuming from page {start_page}, {len(posts)} posts already collected")
else:
    print("No existing file found, starting fresh.")
    posts = []
    start_page = 1

# Collect posts
page = start_page
limit_reached = False
while len(posts) < 20000 and not limit_reached:
    params = {
        "page": page,
        "pagesize": 100,
        "order": "desc",
        "sort": "creation",
        "tagged": "nlp",
        "site": "stackoverflow",
        "key": API_KEY,
        "filter": "withbody"
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()

    if "error_id" in data:
        print(f"Error at page {page}: {data}")
        limit_reached = True
        break
    if "items" not in data:
        print(f"Error at page {page}: {data}")
        break
    if not data["items"]:
        print("No more posts available.")
        break

    # Most recent quota figure reported by the API. It is refreshed after every
    # answers request because each question on the page costs one more request.
    quota_remaining = data.get("quota_remaining")

    new_posts = []  # Collect new posts for this page
    for item in data["items"]:
        post_id = item["question_id"]
        accepted_answer_id = item.get("accepted_answer_id", None)
        accepted_answer_text = "No accepted answer"
        more_answers_text = "No additional answers"

        answers_url = f"https://api.stackexchange.com/2.3/questions/{post_id}/answers"
        answers_params = {
            "site": "stackoverflow",
            "key": API_KEY,
            "filter": "withbody",
            "order": "desc",
            "sort": "votes",
            # Request the largest page so the accepted answer is less likely to
            # fall outside the single response that is inspected.
            "pagesize": 100
        }
        answers_response = requests.get(answers_url, params=answers_params)
        answers_data = answers_response.json()

        if "error_id" in answers_data:
            print(f"Error fetching answers for post {post_id}: {answers_data}")
            limit_reached = True
            break
        if "items" not in answers_data:
            print(f"Error fetching answers for post {post_id}: {answers_data}")
            continue
        quota_remaining = answers_data.get("quota_remaining", quota_remaining)

        # Answers arrive sorted by votes. Keep the accepted answer and the
        # highest-voted other answer. The scan must not stop at the first
        # non-accepted answer, because the accepted one may be ranked lower.
        found_accepted = False
        found_other = False
        for answer in answers_data["items"]:
            if answer["answer_id"] == accepted_answer_id:
                accepted_answer_text = answer["body"]
                found_accepted = True
            elif not found_other:
                more_answers_text = answer["body"]
                found_other = True
            if found_other and (found_accepted or accepted_answer_id is None):
                break

        # NOTE(review): the text is written without sanitising; the recorded
        # output of this run ends in openpyxl's IllegalCharacterError.
        new_posts.append({
            "title": item["title"],
            "description": item["body"],
            "tags": ";".join(item["tags"]),
            "accepted_answer": accepted_answer_text,
            "more_accepted_answers": more_answers_text
        })

    # Append new posts to existing list
    posts.extend(new_posts)
    print(f"Collected page {page}, total posts: {len(posts)}")
    page += 1
    time.sleep(1)

    # Append to existing Excel file
    new_df = pd.DataFrame(new_posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
    if os.path.exists(EXCEL_FILE):
        # Row 0 is the header and each earlier post fills one row, so this
        # page starts right after them. Deriving the offset from `posts`
        # moves it forward every page, so earlier appends are not overwritten,
        # and it works when the run started without an existing file.
        start_row = len(posts) - len(new_posts) + 1
        with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
            new_df.to_excel(writer, sheet_name="Raw_Data", startrow=start_row, index=False, header=False)
    else:
        # Create new file if it doesn’t exist (first run)
        with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl") as writer:
            new_df.to_excel(writer, sheet_name="Raw_Data", index=False)
    print(f"Saved page {page - 1} to {EXCEL_FILE}, appended {len(new_posts)} new posts")

    # Stop before the next page if the latest reported quota is too low.
    if quota_remaining is not None and quota_remaining < 100:
        print(f"Quota nearly exhausted ({quota_remaining} remaining), moving to preprocessing...")
        limit_reached = True
        break

# Final save
initial_df = pd.DataFrame(posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl") as writer:
    initial_df.to_excel(writer, sheet_name="Raw_Data", index=False)
print(f"Collected {len(initial_df)} posts, saved to {EXCEL_FILE}")

Resuming from page 99, 9806 posts already collected
Collected page 99, total posts: 9906
Saved page 99 to /content/nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 100, total posts: 10006
Saved page 100 to /content/nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 101, total posts: 10106
Saved page 101 to /content/nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 102, total posts: 10206
Saved page 102 to /content/nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 103, total posts: 10306
Saved page 103 to /content/nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 104, total posts: 10406
Saved page 104 to /content/nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 105, total posts: 10506
Saved page 105 to /content/nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 106, total posts: 10606
Saved page 106 to /content/nlp_stackoverflow_dataset.xlsx, append

IllegalCharacterError: <p>"Another systemic problem with Naive Bayes is that
features are assumed to be independent. As a result,
even when words are dependent, each word contributes
evidence individually. Thus the magnitude of
the weights for classes with strong word dependencies
is larger than for classes with weak word dependencies.
To keep classes with more dependencies from dominating,
we normalize the classication weights." (<a href="http://www.aaai.org/Papers/ICML/2003/ICML03-081.pdf" rel="nofollow">Reference</a> )</p>

<p>What does this exactly mean? Is there any example that explains it better?</p>
 cannot be used in worksheets.

# Third Data Collection Run (completed successfully, but the Excel sheet was missing entries from pages 99 to 174)

In [None]:
API_KEY = "rl_kUCy4GWQMRSP2AHqViJ1AsvUU"  # new API key

# Control characters stripped before text is written to Excel: the C0 controls
# except tab, line feed and carriage return, plus DEL and the C1 controls
# (\x7F-\x9F). Compiled once because clean_text runs on every field of every post.
_ILLEGAL_CHARS_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')


# Function to remove illegal characters
def clean_text(text):
    """Return *text* with control characters removed.

    Strings have every character matched by ``_ILLEGAL_CHARS_RE`` deleted.
    Tab, newline and carriage return are kept. Any non-string value is
    returned unchanged.
    """
    if isinstance(text, str):
        return _ILLEGAL_CHARS_RE.sub('', text)
    return text

# Load existing data and determine start page
if os.path.exists(EXCEL_FILE):
    existing_df = pd.read_excel(EXCEL_FILE, sheet_name="Raw_Data")
    posts = existing_df.to_dict("records")
    start_page = (len(posts) // 100) + 1
    print(f"Resuming from page {start_page}, {len(posts)} posts already collected")
else:
    print("No existing file found, starting fresh.")
    posts = []
    start_page = 1

# Collect posts
page = start_page
limit_reached = False
while len(posts) < 20000 and not limit_reached:
    params = {
        "page": page,
        "pagesize": 100,
        "order": "desc",
        "sort": "creation",
        "tagged": "nlp",
        "site": "stackoverflow",
        "key": API_KEY,
        "filter": "withbody"
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()

    if "error_id" in data:
        print(f"Error at page {page}: {data}")
        limit_reached = True
        break
    if "items" not in data:
        print(f"Error at page {page}: {data}")
        break
    if not data["items"]:
        print("No more posts available.")
        break

    # Most recent quota figure reported by the API. It is refreshed after every
    # answers request because each question on the page costs one more request.
    quota_remaining = data.get("quota_remaining")

    new_posts = []
    for item in data["items"]:
        # Reset first so the error message below never shows the previous
        # item's id or hits an unbound name.
        post_id = None
        try:
            post_id = item["question_id"]
            accepted_answer_id = item.get("accepted_answer_id", None)
            accepted_answer_text = "No accepted answer"
            more_answers_text = "No additional answers"

            answers_url = f"https://api.stackexchange.com/2.3/questions/{post_id}/answers"
            answers_params = {
                "site": "stackoverflow",
                "key": API_KEY,
                "filter": "withbody",
                "order": "desc",
                "sort": "votes",
                # Request the largest page so the accepted answer is less
                # likely to fall outside the single response inspected.
                "pagesize": 100
            }
            answers_response = requests.get(answers_url, params=answers_params)
            answers_data = answers_response.json()

            if "error_id" in answers_data:
                print(f"Error fetching answers for post {post_id}: {answers_data}")
                limit_reached = True
                break
            if "items" not in answers_data:
                print(f"Error fetching answers for post {post_id}: {answers_data}")
                continue
            quota_remaining = answers_data.get("quota_remaining", quota_remaining)

            # Answers arrive sorted by votes. Keep the accepted answer and the
            # highest-voted other answer. The scan must not stop at the first
            # non-accepted answer, because the accepted one may be ranked lower.
            found_accepted = False
            found_other = False
            for answer in answers_data["items"]:
                if answer["answer_id"] == accepted_answer_id:
                    accepted_answer_text = answer["body"]
                    found_accepted = True
                elif not found_other:
                    more_answers_text = answer["body"]
                    found_other = True
                if found_other and (found_accepted or accepted_answer_id is None):
                    break

            # clean_text strips control characters before the row goes to Excel.
            new_posts.append({
                "title": clean_text(item["title"]),
                "description": clean_text(item["body"]),
                "tags": clean_text(";".join(item["tags"])),
                "accepted_answer": clean_text(accepted_answer_text),
                "more_accepted_answers": clean_text(more_answers_text)
            })
        except Exception as e:
            # Best effort: one bad post must not abort the whole page.
            print(f"Skipping post {post_id} due to error: {e}")
            continue

    posts.extend(new_posts)
    print(f"Collected page {page}, total posts: {len(posts)}")
    page += 1
    time.sleep(1)

    # Append to existing Excel file
    new_df = pd.DataFrame(new_posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
    if os.path.exists(EXCEL_FILE):
        # Row 0 is the header and each earlier post fills one row, so this
        # page starts right after them. Deriving the offset from `posts`
        # moves it forward every page, so earlier appends are not overwritten,
        # and it works when the run started without an existing file.
        start_row = len(posts) - len(new_posts) + 1
        with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
            new_df.to_excel(writer, sheet_name="Raw_Data", startrow=start_row, index=False, header=False)
    else:
        with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl") as writer:
            new_df.to_excel(writer, sheet_name="Raw_Data", index=False)
    print(f"Saved page {page - 1} to {EXCEL_FILE}, appended {len(new_posts)} new posts")

    # Stop before the next page if the latest reported quota is too low.
    if quota_remaining is not None and quota_remaining < 100:
        print(f"Quota nearly exhausted ({quota_remaining} remaining), moving to preprocessing...")
        limit_reached = True
        break

# Final save
initial_df = pd.DataFrame(posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl") as writer:
    initial_df.to_excel(writer, sheet_name="Raw_Data", index=False)
print(f"Collected {len(initial_df)} posts, saved to {EXCEL_FILE}")

Loaded 10006 posts from existing file
Starting collection from page 174
Collected page 174, total posts: 10106
Saved page 174 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 175, total posts: 10206
Saved page 175 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 176, total posts: 10306
Saved page 176 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 177, total posts: 10406
Saved page 177 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 178, total posts: 10506
Saved page 178 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 179, total posts: 10606
Saved page 179 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 180, total posts: 10706
Saved page 180 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 181, total posts: 10804
Saved page 181 to nlp_stackoverflow_dataset.xlsx, appended 98 new posts
Collected page 182, total posts: 

# Fourth Data Collection Run (successfully completed)

In [None]:
API_KEY = "rl_kUCy4GWQMRSP2AHqViJ1AsvUU"  # new API key

# Load existing data
if os.path.exists(EXCEL_FILE):
    existing_df = pd.read_excel(EXCEL_FILE, sheet_name="Raw_Data")
    posts = existing_df.to_dict("records")
    print(f"Loaded {len(posts)} posts from existing file")
else:
    print("No existing file found, starting fresh.")
    posts = []

# Set explicit page range: 99 to 174
start_page = 99
end_page = 174
print(f"Collecting from page {start_page} to {end_page}")

# Collect posts
page = start_page
limit_reached = False
while page <= end_page and len(posts) < 50000 and not limit_reached:
    params = {
        "page": page,
        "pagesize": 100,
        "order": "desc",
        "sort": "creation",
        "tagged": "nlp",
        "site": "stackoverflow",
        "key": API_KEY,
        "filter": "withbody"
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()

    if "error_id" in data:
        print(f"Error at page {page}: {data}")
        limit_reached = True
        break
    if "items" not in data:
        print(f"Error at page {page}: {data}")
        break
    if not data["items"]:
        print(f"No more posts available at page {page}.")
        break

    # Most recent quota figure reported by the API. It is refreshed after every
    # answers request because each question on the page costs one more request.
    quota_remaining = data.get("quota_remaining")

    new_posts = []
    for item in data["items"]:
        # Reset first so the error message below never shows the previous
        # item's id or hits an unbound name.
        post_id = None
        try:
            post_id = item["question_id"]
            accepted_answer_id = item.get("accepted_answer_id", None)
            accepted_answer_text = "No accepted answer"
            more_answers_text = "No additional answers"

            answers_url = f"https://api.stackexchange.com/2.3/questions/{post_id}/answers"
            answers_params = {
                "site": "stackoverflow",
                "key": API_KEY,
                "filter": "withbody",
                "order": "desc",
                "sort": "votes",
                # Request the largest page so the accepted answer is less
                # likely to fall outside the single response inspected.
                "pagesize": 100
            }
            answers_response = requests.get(answers_url, params=answers_params)
            answers_data = answers_response.json()

            if "error_id" in answers_data:
                print(f"Error fetching answers for post {post_id}: {answers_data}")
                limit_reached = True
                break
            if "items" not in answers_data:
                print(f"Error fetching answers for post {post_id}: {answers_data}")
                continue
            quota_remaining = answers_data.get("quota_remaining", quota_remaining)

            # Answers arrive sorted by votes. Keep the accepted answer and the
            # highest-voted other answer. The scan must not stop at the first
            # non-accepted answer, because the accepted one may be ranked lower.
            found_accepted = False
            found_other = False
            for answer in answers_data["items"]:
                if answer["answer_id"] == accepted_answer_id:
                    accepted_answer_text = answer["body"]
                    found_accepted = True
                elif not found_other:
                    more_answers_text = answer["body"]
                    found_other = True
                if found_other and (found_accepted or accepted_answer_id is None):
                    break

            # clean_text strips control characters before the row goes to Excel.
            new_posts.append({
                "title": clean_text(item["title"]),
                "description": clean_text(item["body"]),
                "tags": clean_text(";".join(item["tags"])),
                "accepted_answer": clean_text(accepted_answer_text),
                "more_accepted_answers": clean_text(more_answers_text)
            })
        except Exception as e:
            # Best effort: one bad post must not abort the whole page.
            print(f"Skipping post {post_id} due to error: {e}")
            continue

    posts.extend(new_posts)
    print(f"Collected page {page}, total posts: {len(posts)}, posts this page: {len(new_posts)}")
    page += 1
    time.sleep(1)

    # Append to existing Excel file
    new_df = pd.DataFrame(new_posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
    if os.path.exists(EXCEL_FILE):
        # Row 0 is the header and each earlier post fills one row, so this
        # page starts right after them. Deriving the offset from `posts`
        # moves it forward every page, so earlier appends are not overwritten,
        # and it works when the run started without an existing file.
        start_row = len(posts) - len(new_posts) + 1
        with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
            new_df.to_excel(writer, sheet_name="Raw_Data", startrow=start_row, index=False, header=False)
    else:
        with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl") as writer:
            new_df.to_excel(writer, sheet_name="Raw_Data", index=False)
    print(f"Saved page {page - 1} to {EXCEL_FILE}, appended {len(new_posts)} new posts")

    # Stop before the next page if the latest reported quota is too low.
    if quota_remaining is not None and quota_remaining < 100:
        print(f"Quota nearly exhausted ({quota_remaining} remaining), stopping...")
        limit_reached = True
        break

# Final save
initial_df = pd.DataFrame(posts, columns=["title", "description", "tags", "accepted_answer", "more_accepted_answers"])
with pd.ExcelWriter(EXCEL_FILE, engine="openpyxl") as writer:
    initial_df.to_excel(writer, sheet_name="Raw_Data", index=False)
print(f"Collected {len(initial_df)} posts, saved to {EXCEL_FILE}")

Loaded 13657 posts from existing file
Collecting from page 99 to 174
Collected page 99, total posts: 13757, posts this page: 100
Saved page 99 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 100, total posts: 13857, posts this page: 100
Saved page 100 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 101, total posts: 13957, posts this page: 100
Saved page 101 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 102, total posts: 14057, posts this page: 100
Saved page 102 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 103, total posts: 14157, posts this page: 100
Saved page 103 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 104, total posts: 14257, posts this page: 100
Saved page 104 to nlp_stackoverflow_dataset.xlsx, appended 100 new posts
Collected page 105, total posts: 14357, posts this page: 100
Saved page 105 to nlp_stackoverflow_dataset.xlsx, appended 100 new p