# 1. Crawling

Stack Exchange는 Q&A가 site(주제)별로 구분되어 있으므로, Stack Exchange API를 이용하여 site 단위로 데이터를 수집합니다.
site 주제들 중 law를 이용하여 질문과 답변, 코멘트 데이터를 수집하도록 하겠습니다. 추가적으로 tag정보도 수집하여 사용하겠습니다.

API_KEY는 https://stackapps.com/apps/oauth/register 에서 새 애플리케이션을 등록 후 얻을 수 있습니다. 아래 정보를 입력해서 KEY를 얻습니다.
OAUTH Domain: stackexchange.com
Application Website: https://stackapps.com 

In [None]:
import os
import itertools
import requests
import json
import re
import glob
import tempfile


from tqdm import tqdm
from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the key is read.
load_dotenv()
# Stack Exchange API key; stays None when the variable is not defined.
STACKEXCHANGE_API_KEY = os.getenv('STACKEXCHANGE_API_KEY')

In [None]:
def get_requests(api_url, params=None, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        api_url: URL to request.
        params: Optional query-string parameters. The object passed in is
            never modified; when an API key is configured a copy is made and
            the key is added to that copy.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises; ``None`` waits indefinitely.

    Returns:
        The parsed JSON payload when the response status is 200, else None.
    """
    query = params
    if STACKEXCHANGE_API_KEY:
        # Work on a copy so the caller's dict does not silently gain a 'key'
        # entry (and so a shared dict can be reused across calls).
        query = dict(params or {})
        query['key'] = STACKEXCHANGE_API_KEY
    response = requests.get(api_url, params=query, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.json()

### stackexchange에 있는 주제 리스트업

아래 코드는 사용 가능한 API site parameter들을 text 파일로 저장한 뒤 그 내용을 출력하는 코드입니다.

In [None]:

# List the api_site_parameter of every Stack Exchange site and save them to a
# text file, one per line. The /sites endpoint is paginated, so pages are
# requested until the response reports has_more as false.
api_url = "https://api.stackexchange.com/2.3/sites"
site_parameters = []
page = 1
while True:
    data = get_requests(api_url, params={"pagesize": 100, "page": page})
    if data is None:
        # get_requests returns None for any non-200 response; fail loudly
        # instead of crashing later on a None subscript.
        raise RuntimeError(f"Request to {api_url} failed (page {page})")
    site_parameters.extend(
        site['api_site_parameter'] for site in data.get('items', [])
    )
    if not data.get('has_more', False):
        break
    page += 1

with open("api_site_parameter.txt", 'w', encoding="utf-8") as f:
    f.writelines(name + '\n' for name in site_parameters)
# Read the file back to show exactly what was written.
with open("api_site_parameter.txt", 'r', encoding="utf-8") as f:
    print(f.read())

stackoverflow
serverfault
superuser
meta
webapps
webapps.meta
gaming
gaming.meta
webmasters
webmasters.meta
cooking
cooking.meta
gamedev
gamedev.meta
photo
photo.meta
stats
stats.meta
math
math.meta
diy
diy.meta
meta.superuser
meta.serverfault
gis
gis.meta
tex
tex.meta
askubuntu
meta.askubuntu
money
money.meta
english
english.meta
stackapps
ux
ux.meta
unix
unix.meta
wordpress
wordpress.meta
cstheory
cstheory.meta
apple
apple.meta
rpg
rpg.meta
bicycles
bicycles.meta
softwareengineering
softwareengineering.meta
electronics
electronics.meta
android
android.meta
boardgames
boardgames.meta
physics
physics.meta
homebrew
homebrew.meta
security
security.meta
writing
writing.meta
video
video.meta
graphicdesign
graphicdesign.meta
dba
dba.meta
scifi
scifi.meta
codereview
codereview.meta
codegolf
codegolf.meta
quant
quant.meta
pm
pm.meta
skeptics
skeptics.meta
fitness
fitness.meta
drupal
drupal.meta
mechanics
mechanics.meta
parenting
parenting.meta
sharepoint
sharepoint.meta
music
music.meta
sqa
s

In [None]:
# Stack Exchange site whose data is collected in the cells below.
SITE = 'law'
# Ask the /info endpoint about SITE and display the raw response.
info_url = f"https://api.stackexchange.com/2.3/info?site={SITE}"
data = get_requests(info_url)
data

{'items': [{'new_active_users': 1,
   'total_users': 43005,
   'badges_per_minute': 0.02,
   'total_badges': 84686,
   'total_votes': 319063,
   'total_comments': 181008,
   'answers_per_minute': 0.01,
   'questions_per_minute': 0.01,
   'total_answers': 42491,
   'total_accepted': 12072,
   'total_unanswered': 2745,
   'total_questions': 28672,
   'api_revision': '2023.11.8.42135'}],
 'has_more': False,
 'quota_max': 10000,
 'quota_remaining': 6683}

아래 코드들은 question, comment, answer를 stackexchange api [사이트](https://api.stackexchange.com/)에서 가져옵니다.
filter부분에 필요한 속성의 데이터를 선택하여 api주소를 만들어 사용하였습니다. [Question API](https://api.stackexchange.com/docs/questions)

가져온 데이터에서 실제로 필요하다고 판단되는 데이터들의 key를 아래 변수에 저장하여서 해당 key만 추출하여 json형태로 저장하는 코드입니다.

In [None]:
import time

# Download the questions of SITE page by page and store each page's raw JSON
# response as ../data/<SITE>_stackexchange/raw/<page_num>.json.
json_path_template = f"../data/{SITE}_stackexchange/raw/" + "{page_num}.json"
os.makedirs(os.path.dirname(json_path_template), exist_ok=True)

# Page to resume after. Per the original note 287 is the total number of
# pages already fetched, so by default only later pages are requested.
# Set this to 0 to fetch everything again starting from page 1.
page_num = 287
has_more = True
while has_more:
    page_num += 1
    # Going through get_requests keeps the API key optional (the request still
    # works, unauthenticated, when no key is configured) and lets requests
    # build and escape the query string.
    data = get_requests(
        "https://api.stackexchange.com/2.3/questions",
        params={
            "page": page_num,
            "pagesize": 100,
            "order": "desc",
            "sort": "activity",
            "site": SITE,
            "filter": "!*Mg4PjfgUgTmRZV1",
        },
    )
    if data is None:
        # Non-200 response (e.g. throttled or bad request): stop crawling.
        print("FAILED", page_num)
        break
    if data.get("items"):
        page_path = json_path_template.format(page_num=page_num)
        with open(page_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        print("DONE", page_path)
    has_more = data.get("has_more", False)
    # The API may ask clients to pause via a "backoff" field (seconds);
    # honor it before requesting the next page.
    backoff = data.get("backoff")
    if has_more and backoff:
        time.sleep(backoff)

In [None]:
# Walk over every saved raw page and tally the answered questions:
#   num_samples_valid    - answered questions that carry a non-empty answer list
#   num_accepted_samples - answered questions that have an accepted answer id
num_samples_valid = 0
num_accepted_samples = 0
for raw_file in glob.glob(f"../data/{SITE}_stackexchange/raw/*.json"):
    with open(raw_file, 'r') as fp:
        data = json.load(fp)

    for question in data['items']:
        # Skip questions that are unanswered or report no answers.
        if not question['is_answered'] or question['answer_count'] <= 0:
            continue
        answers = question['answers']
        # Booleans add as 0/1, so each condition bumps its counter by one.
        num_accepted_samples += question.get('accepted_answer_id') is not None
        num_samples_valid += len(answers) > 0
print("samples answered", num_samples_valid, 'accepted', num_accepted_samples)