In [None]:
"""This Jupyter Notebook is designed to scrape first 100 solutions sorted by most voted for programing problems on Leetcode """

In [1]:
import os
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests

In [2]:
def scrape(query, variables, url='https://leetcode.com/graphql/', timeout=30):
    """
    Sends a POST request to the LeetCode GraphQL API to retrieve data based on the provided query and variables.

    Args:
        query (str): The GraphQL query string to be executed.
        variables (dict): A dictionary of variables to be passed along with the query.
        url (str, optional): The URL of the GraphQL endpoint. Defaults to 'https://leetcode.com/graphql/'.
        timeout (float or tuple, optional): Timeout in seconds handed to ``requests.post``.
            Defaults to 30. Without a timeout a stalled connection blocks the calling
            thread indefinitely, which would pin a worker of the thread pool forever.

    Returns:
        dict: The JSON response from the API containing the requested data.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails, times out,
            or returns an error status code.
    """
    # NOTE(review): 'accept-encoding' advertises br and zstd; confirm the environment
    # can decode those encodings, otherwise response.json() may fail on a compressed body.
    headers = {
        'authority': 'leetcode.com',
        'method': 'POST',
        'path': '/graphql/',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'origin': 'https://leetcode.com',
        'referer': 'https://leetcode.com/problems/spiral-matrix/solutions/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...',
    }
    data = {"query": query, "variables": variables}
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Turn 4xx/5xx answers into exceptions so callers can log and skip them.
    response.raise_for_status()
    return response.json()

In [3]:
""" Trial Run """
# GraphQL document for the `ugcArticleSolutionArticles` operation: lists the solution
# articles of one question (selected by $questionSlug) with paging/ordering variables.
sol_list_query = "\n    query ugcArticleSolutionArticles($questionSlug: String!, $orderBy: ArticleOrderByEnum, $userInput: String, $tagSlugs: [String!], $skip: Int, $before: String, $after: String, $first: Int, $last: Int, $isMine: Boolean) {\n  ugcArticleSolutionArticles(\n    questionSlug: $questionSlug\n    orderBy: $orderBy\n    userInput: $userInput\n    tagSlugs: $tagSlugs\n    skip: $skip\n    first: $first\n    before: $before\n    after: $after\n    last: $last\n    isMine: $isMine\n  ) {\n    totalNum\n    pageInfo {\n      hasNextPage\n    }\n    edges {\n      node {\n        ...ugcSolutionArticleFragment\n      }\n    }\n  }\n}\n    \n    fragment ugcSolutionArticleFragment on SolutionArticleNode {\n  uuid\n  title\n  slug\n  summary\n  author {\n    realName\n    userAvatar\n    userSlug\n    userName\n    nameColor\n    certificationLevel\n    activeBadge {\n      icon\n      displayName\n    }\n  }\n  articleType\n  thumbnail\n  summary\n  createdAt\n  updatedAt\n  status\n  isLeetcode\n  canSee\n  canEdit\n  isMyFavorite\n  chargeType\n  myReactionType\n  topicId\n  hitCount\n  hasVideoArticle\n  reactions {\n    count\n    reactionType\n  }\n  title\n  slug\n  tags {\n    name\n    slug\n    tagType\n  }\n  topic {\n    id\n    topLevelCommentCount\n  }\n}\n    "
# GraphQL document for the `ugcArticleSolutionArticle` operation: fetches a single
# solution article (by $articleId or $topicId), adding `content` and a few extra
# fields on top of the same ugcSolutionArticleFragment used by the list query.
sol_query = "\n    query ugcArticleSolutionArticle($articleId: ID, $topicId: ID) {\n  ugcArticleSolutionArticle(articleId: $articleId, topicId: $topicId) {\n    ...ugcSolutionArticleFragment\n    content\n    isSerialized\n    isAuthorArticleReviewer\n    scoreInfo {\n      scoreCoefficient\n    }\n    prev {\n      uuid\n      slug\n      topicId\n      title\n    }\n    next {\n      uuid\n      slug\n      topicId\n      title\n    }\n  }\n}\n    \n    fragment ugcSolutionArticleFragment on SolutionArticleNode {\n  uuid\n  title\n  slug\n  summary\n  author {\n    realName\n    userAvatar\n    userSlug\n    userName\n    nameColor\n    certificationLevel\n    activeBadge {\n      icon\n      displayName\n    }\n  }\n  articleType\n  thumbnail\n  summary\n  createdAt\n  updatedAt\n  status\n  isLeetcode\n  canSee\n  canEdit\n  isMyFavorite\n  chargeType\n  myReactionType\n  topicId\n  hitCount\n  hasVideoArticle\n  reactions {\n    count\n    reactionType\n  }\n  title\n  slug\n  tags {\n    name\n    slug\n    tagType\n  }\n  topic {\n    id\n    topLevelCommentCount\n  }\n}\n    "
# Smoke test: fetch one article by topicId; as the cell's last expression the
# returned dict is displayed as the cell output.
scrape(sol_query, {"topicId": 20571})

{'data': {'ugcArticleSolutionArticle': {'uuid': 'unzC2lUz',
   'title': '1-liner in Python + Ruby',
   'slug': '1-liner-in-python-ruby-by-stefanpochmann-rqep',
   'summary': 'Take the first row plus the spiral order of the rotated remaining matrix. Inefficient for large matrices, but here I got it accepted in 40 ms, one of the fastes',
   'author': {'realName': 'Stefan Pochmann',
    'userAvatar': 'https://assets.leetcode.com/users/avatars/avatar_1648919055.png',
    'userSlug': 'stefanpochmann',
    'userName': 'StefanPochmann',
    'nameColor': None,
    'certificationLevel': 'NORMAL',
    'activeBadge': {'icon': 'https://assets.leetcode.com/static_assets/marketing/lg365.png',
     'displayName': '365 Days Badge'}},
   'articleType': 'SOLUTION',
   'thumbnail': '',
   'createdAt': '2015-07-17T16:27:43+00:00',
   'updatedAt': '2018-10-26T23:18:53.558760+00:00',
   'status': 'OPEN',
   'isLeetcode': False,
   'canSee': True,
   'canEdit': False,
   'isMyFavorite': False,
   'chargeType

In [None]:
def process_question(slug):
    """
    Scrape the most-voted free solutions of one question and save them as a CSV file.

    Requests the first 100 solutions ordered by "MOST_VOTES", keeps those whose
    ``canSee`` is truthy and whose ``chargeType`` is 'FREE', fetches the full
    article for each one and writes the collected rows to
    ``./corrected_sols/{slug}.csv``. No file is written when nothing was collected.

    Args:
        slug (str): Question slug sent as ``questionSlug`` in the list query.

    Returns:
        None. Failures are printed with a ``[{slug}]`` prefix: a failing list
        request ends the function, a failing article request skips that article.
    """
    sol_list_variables = {
        "questionSlug": slug,
        "skip": 0,
        "first": 100,
        "orderBy": "MOST_VOTES",
        "userInput": "",
        "tagSlugs": []
    }

    try:
        sol_list = scrape(sol_list_query, sol_list_variables)
        sol_list = sol_list['data']['ugcArticleSolutionArticles']['edges']
    except Exception as e:
        print(f"[{slug}] Error during solution list scraping: {e}")
        return

    solutions = []
    for sol in sol_list:
        sol = sol['node']
        if not (sol['canSee'] and sol['chargeType'] == 'FREE'):
            continue

        # Vote counts come from the list entry; a missing reaction type stays at 0.
        upvotes, downvotes = 0, 0
        for reaction in sol['reactions']:
            if reaction['reactionType'] == 'UPVOTE':
                upvotes = reaction['count']
            elif reaction['reactionType'] == 'THUMBS_DOWN':
                downvotes = reaction['count']
        tags = [tag['name'] for tag in sol['tags']]
        topic_id = sol['topicId']
        comments = sol['topic']['topLevelCommentCount']

        try:
            sol_content = scrape(sol_query, {"topicId": topic_id})
            sol_content = sol_content['data']['ugcArticleSolutionArticle']
            # A null article would otherwise raise TypeError on the subscripts
            # below, outside this try, and abort the whole question.
            if sol_content is None:
                raise ValueError(f"no article returned for topicId {topic_id}")
        except Exception as e:
            print(f"[{slug}] Error during solution content scraping: {e}")
            continue

        solutions.append({
            'question_slug': slug,
            'title': sol_content['title'],
            'slug': sol_content['slug'],
            'summary': sol_content['summary'],
            'author': sol_content['author']['userSlug'],
            'certification': sol_content['author']['certificationLevel'],
            'created_at': sol_content['createdAt'],
            'updated_at': sol_content['updatedAt'],
            'hit_count': sol_content['hitCount'],
            'has_video': sol_content['hasVideoArticle'],
            'content': sol_content['content'],
            'upvotes': upvotes,
            'downvotes': downvotes,
            'tags': tags,
            'comments': comments
        })

    if solutions:
        # Create the output directory on demand so a fresh checkout does not lose
        # a finished scrape to a missing folder; exist_ok tolerates concurrent workers.
        os.makedirs('./corrected_sols', exist_ok=True)
        sol_df = pd.DataFrame(solutions)
        sol_df.to_csv(f'./corrected_sols/{slug}.csv', index=False)

# Tags used by this notebook's own status messages; they are not question slugs.
NON_SLUG_TAGS = {'PROGRESS', 'THREAD ERROR', 'ERROR', 'INFO'}

# Rebuild the list of slugs to retry from the log file: the bracketed tag at the
# start of a line is taken as a slug.
slugs = []
with open('./logs.txt', 'r') as f:
    for line in f:
        line = line.strip()
        # Lines without a leading "[tag]" (blank lines, continuation text) carry
        # no slug; previously such a line was appended whole, newline included.
        if not line.startswith('['):
            continue
        tag, closing, _ = line[1:].partition(']')
        if not closing or not tag or tag in NON_SLUG_TAGS:
            continue
        slugs.append(tag)

# Remove duplicates while keeping first-seen order, so the submission order is
# the same on every run (a set would not guarantee that).
slugs = list(dict.fromkeys(slugs))

# Run process_question for every slug on a pool of 20 worker threads and report
# each slug as soon as its task finishes.
print(f"[INFO] Starting threaded processing for {len(slugs)} slugs...")

with ThreadPoolExecutor(max_workers=20) as pool:
    # Remember which slug each future belongs to, for the status messages.
    slug_by_future = {}
    for queued_slug in slugs:
        slug_by_future[pool.submit(process_question, queued_slug)] = queued_slug

    finished_count = 0
    for finished in as_completed(slug_by_future):
        finished_count += 1
        slug = slug_by_future[finished]
        try:
            # result() re-raises anything the worker raised.
            finished.result()
            print(f"[PROGRESS] ✔ {finished_count}/{len(slugs)} done - {slug}")
        except Exception as err:
            print(f"[ERROR] ❌ {slug} | {err}")


In [None]:
# Load the problem list; later cells read its 'paidOnly', 'titleSlug' and 'title' columns.
questions = pd.read_csv('./problem_set.csv')

In [None]:
""" No Multi-threading """
# Sequential variant: for every non-paid question, fetch the first 100 most-voted
# solutions, download each visible free article and write one CSV per question.

# Create the output directory up front so to_csv cannot fail on a missing folder.
os.makedirs('./sols', exist_ok=True)

for _, row in questions.iterrows():
    if row['paidOnly']:
        continue
    slug = row['titleSlug']
    sol_list_variables = {
        "questionSlug": slug,
        "skip": 0,
        "first": 100,
        "orderBy": "MOST_VOTES",
        "userInput": "",
        "tagSlugs": []
    }
    sol_list = scrape(sol_list_query, sol_list_variables)
    sol_list = sol_list['data']['ugcArticleSolutionArticles']['edges']

    solutions = []
    for sol in sol_list:
        sol = sol['node']
        if not (sol['canSee'] and sol['chargeType'] == 'FREE'):
            continue

        # Vote counts come from the list entry; a missing reaction type stays at 0.
        upvotes, downvotes = 0, 0
        for reaction in sol['reactions']:
            if reaction['reactionType'] == 'UPVOTE':
                upvotes = reaction['count']
            elif reaction['reactionType'] == 'THUMBS_DOWN':
                downvotes = reaction['count']
        tags = [tag['name'] for tag in sol['tags']]
        topic_id = sol['topicId']
        comments = sol['topic']['topLevelCommentCount']

        sol_content = scrape(sol_query, {"topicId": topic_id})
        sol_content = sol_content['data']['ugcArticleSolutionArticle']
        # A null article would raise TypeError on the subscripts below and stop
        # the whole run; skip just this article instead.
        if sol_content is None:
            print(f"[{slug}] Error during solution content scraping: no article returned for topicId {topic_id}")
            continue

        solutions.append({
            'question_slug': slug,
            'title': sol_content['title'],
            'slug': sol_content['slug'],
            'summary': sol_content['summary'],
            'author': sol_content['author']['userSlug'],
            'certification': sol_content['author']['certificationLevel'],
            'created_at': sol_content['createdAt'],
            'updated_at': sol_content['updatedAt'],
            'hit_count': sol_content['hitCount'],
            'has_video': sol_content['hasVideoArticle'],
            'content': sol_content['content'],
            'upvotes': upvotes,
            'downvotes': downvotes,
            'tags': tags,
            'comments': comments
        })

    # Report how many solutions were collected (the message used to print the
    # DataFrame row index here).
    print(f"Scraped {len(solutions)} solutions for {row['title']}")
    sol_df = pd.DataFrame(solutions)
    sol_df.to_csv(f'./sols/{slug}.csv', index=False)

In [None]:
""" Using Multi-threading """
def process_question(row):
    """
    Scrape the most-voted free solutions of one problem-set row and save them as CSV.

    Rows whose ``paidOnly`` value is truthy are skipped. Otherwise the first 100
    solutions ordered by "MOST_VOTES" are requested for ``row['titleSlug']``,
    entries with a truthy ``canSee`` and ``chargeType`` 'FREE' are fetched in
    full, and the collected rows are written to ``./sols/{titleSlug}.csv``.
    No file is written when nothing was collected.

    Args:
        row: Mapping-like record providing the 'paidOnly' and 'titleSlug' keys
            (the driver passes rows from ``questions.iterrows()``).

    Returns:
        None. Failures are printed with a ``[{titleSlug}]`` prefix: a failing list
        request ends the function, a failing article request skips that article.
    """
    if row['paidOnly']:
        return

    slug = row['titleSlug']
    sol_list_variables = {
        "questionSlug": slug,
        "skip": 0,
        "first": 100,
        "orderBy": "MOST_VOTES",
        "userInput": "",
        "tagSlugs": []
    }

    try:
        sol_list = scrape(sol_list_query, sol_list_variables)
        sol_list = sol_list['data']['ugcArticleSolutionArticles']['edges']
    except Exception as e:
        print(f"[{slug}] Error during solution list scraping: {e}")
        return

    solutions = []
    for sol in sol_list:
        sol = sol['node']
        if not (sol['canSee'] and sol['chargeType'] == 'FREE'):
            continue

        # Vote counts come from the list entry; a missing reaction type stays at 0.
        upvotes, downvotes = 0, 0
        for reaction in sol['reactions']:
            if reaction['reactionType'] == 'UPVOTE':
                upvotes = reaction['count']
            elif reaction['reactionType'] == 'THUMBS_DOWN':
                downvotes = reaction['count']
        tags = [tag['name'] for tag in sol['tags']]
        topic_id = sol['topicId']
        comments = sol['topic']['topLevelCommentCount']

        try:
            sol_content = scrape(sol_query, {"topicId": topic_id})
            sol_content = sol_content['data']['ugcArticleSolutionArticle']
            # A null article would otherwise raise TypeError on the subscripts
            # below, outside this try, and abort the whole question.
            if sol_content is None:
                raise ValueError(f"no article returned for topicId {topic_id}")
        except Exception as e:
            print(f"[{slug}] Error during solution content scraping: {e}")
            continue

        solutions.append({
            'question_slug': slug,
            'title': sol_content['title'],
            'slug': sol_content['slug'],
            'summary': sol_content['summary'],
            'author': sol_content['author']['userSlug'],
            'certification': sol_content['author']['certificationLevel'],
            'created_at': sol_content['createdAt'],
            'updated_at': sol_content['updatedAt'],
            'hit_count': sol_content['hitCount'],
            'has_video': sol_content['hasVideoArticle'],
            'content': sol_content['content'],
            'upvotes': upvotes,
            'downvotes': downvotes,
            'tags': tags,
            'comments': comments
        })

    if solutions:
        # Create the output directory on demand so a fresh checkout does not lose
        # a finished scrape to a missing folder; exist_ok tolerates concurrent workers.
        os.makedirs('./sols', exist_ok=True)
        sol_df = pd.DataFrame(solutions)
        sol_df.to_csv(f'./sols/{slug}.csv', index=False)

""" Find problem_set.csv on Hugging Face """

# Load the problem list and process every row on a pool of 50 worker threads,
# printing one progress line per finished task.
questions = pd.read_csv('./problem_set.csv')

with ThreadPoolExecutor(max_workers=50) as pool:
    futures = []
    for _, question_row in questions.iterrows():
        futures.append(pool.submit(process_question, question_row))

    completed = 0
    for finished in as_completed(futures):
        completed += 1
        try:
            # result() re-raises anything the worker raised.
            finished.result()
        except Exception as err:
            print(f"[THREAD ERROR] {err}")
        print(f"[PROGRESS] {completed}/{len(futures)} complete")