In [1]:
import requests,json,re
from requests_toolbelt import MultipartEncoder

In [2]:
# One HTTP session object shared by the request helpers in this notebook.
session = requests.Session()

# Browser-style User-Agent header value sent with each request.
user_agent = (
    r'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    r'AppleWebKit/537.36 (KHTML, like Gecko) '
    r'Chrome/44.0.2403.157 Safari/537.36'
)

In [3]:
def get_problems():
    """Return the title slugs of every problem that is not marked paid-only.

    Fetches the problem index from ``/api/problems/all/`` and keeps the
    ``question__title_slug`` of each ``stat_status_pairs`` entry whose
    ``paid_only`` flag is falsy.

    Returns:
        list: slugs, in the order the index lists them.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    url = "https://leetcode.com/api/problems/all/"

    headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}
    resp = session.get(url, headers=headers, timeout=10)
    # Fail loudly on an error response instead of trying to parse it as the index.
    resp.raise_for_status()

    # Same decoding call the other request helpers in this notebook use.
    question_list = resp.json()

    return [
        question['stat']['question__title_slug']
        for question in question_list['stat_status_pairs']
        if not question['paid_only']
    ]

def get_problem_by_slug(slug):
    """Look up one problem through the GraphQL endpoint.

    Posts the ``getQuestionDetail`` query for ``slug`` and returns the pair
    ``(questionFrontendId, content)`` read from the ``question`` object of the
    JSON response. The other fields named in the query are requested but not
    returned.
    """
    graphql_query = '''query getQuestionDetail($titleSlug: String!) {
            question(titleSlug: $titleSlug) {
                questionId
                questionFrontendId
                questionTitle
                questionTitleSlug
                content
                difficulty
                stats
                similarQuestions
                categoryTitle
                topicTags {
                        name
                        slug
                }
            }
        }'''
    payload = {
        'operationName': "getQuestionDetail",
        'variables': {'titleSlug': slug},
        'query': graphql_query,
    }
    body = json.dumps(payload).encode('utf8')

    request_headers = {
        'User-Agent': user_agent,
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Referer': 'https://leetcode.com/problems/' + slug,
    }
    reply = session.post(
        "https://leetcode.com/graphql",
        data=body,
        headers=request_headers,
        timeout=10,
    )

    # Problem details: only the front-end id and the statement are handed back.
    question = reply.json()['data']['question']
    return question['questionFrontendId'], question['content']
    

In [4]:
def get_sol_by_slug(slug,start,page_size=15):
    """Fetch one page of community solutions tagged ``cpp`` for a problem.

    Args:
        slug: title slug of the problem.
        start: zero-based page number; the query skips ``start * page_size``
            entries.
        page_size: number of solutions requested per page. Defaults to 15,
            the value that was previously hard-coded.

    Returns:
        The ``solutions`` list of the ``questionSolutions`` response, ordered
        by ``'hot'``; each item carries the fields named in the query below.
        ``[]`` is returned when the request payload cannot be serialised.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    url = f"https://leetcode.com/graphql/{slug}/"
    params = {'operationName': "communitySolutions",
        'variables': {'query': "", 'languageTags': ["cpp"], 'topicTags': [], 'questionSlug': slug, 'skip': start*page_size,'first':page_size,'orderBy':'hot'},
        'query': '''query communitySolutions($questionSlug: String!, $skip: Int!, $first: Int!, $query: String, $orderBy: TopicSortingOption, $languageTags: [String!], $topicTags: [String!]) {
            questionSolutions(filters: {questionSlug: $questionSlug, skip: $skip, first: $first, query: $query, orderBy: $orderBy, languageTags: $languageTags, topicTags: $topicTags}) {
                hasDirectResults
                totalNum
                solutions {
                    id
                    title
                    commentCount
                    topLevelCommentCount
                    viewCount
                    pinned
                    isFavorite
                    solutionTags {
                        name
                        slug
                    }    
                }
            }
        }    
        '''
    }
    try:
        json_data = json.dumps(params).encode('utf8')
    except (TypeError, ValueError):
        # Only serialisation failures mean "no page"; a bare except here would
        # also swallow KeyboardInterrupt and SystemExit.
        return []

    headers = {'User-Agent': user_agent, 'Connection': 
        'keep-alive', 'Content-Type': 'application/json',
        'Referer': 'https://leetcode.com/problems/' + slug+'/solutions'}
    resp = session.post(url, data = json_data, headers = headers, timeout = 10)
    # Surface an error status as such rather than as a KeyError further down.
    resp.raise_for_status()
    content = resp.json()
    return content['data']['questionSolutions']['solutions']

In [5]:
def getSolution(slug,id,t):
    """Download the body of a single community solution post.

    ``id`` is sent as the ``topicId`` variable of the ``communitySolution``
    query. ``slug`` and ``t`` are only used to build the request URL and the
    Referer header. Returns the ``content`` field of the topic's ``post``.
    """
    endpoint = f"https://leetcode.com/graphql/{slug}/solution"
    payload = {'operationName': "communitySolution",
        'variables': {'topicId': id},
        'query': '''query communitySolution($topicId: Int!) {
            topic(id: $topicId) {
                post {
                    id
                    voteCount
                    voteStatus
                    content
                }
            }
        }   
        '''
    }
    body = json.dumps(payload).encode('utf8')

    # The Referer is the solution's page: problem slug, topic id, then t.
    referer = 'https://leetcode.com/problems/' + slug+'/solutions/'+str(id)+'/'+t
    request_headers = {
        'User-Agent': user_agent,
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Referer': referer,
    }
    reply = session.post(endpoint, data=body, headers=request_headers, timeout=10)

    post = reply.json()['data']['topic']['post']
    return post['content']

In [9]:
import csv
def writecsv(htmlItems,t,title,id,counter,leetcodeDesc,path='outputT.csv'):
    """Append one ``[t, leetcodeDesc, htmlItems]`` row to the CSV file at ``path``.

    Args:
        htmlItems: text written to the third column.
        t: value written to the first column.
        title, id, counter: accepted so existing calls keep working; unused.
        leetcodeDesc: text written to the second column.
        path: destination file, ``'outputT.csv'`` unless overridden.

    Returns:
        None. A row that cannot be written is reported on stdout and skipped.
    """
    try:
        # Explicit UTF-8: with the platform default encoding a non-ASCII row
        # can fail to encode, and that failure used to be dropped silently.
        with open(path, 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow([t, leetcodeDesc, htmlItems])
    except (OSError, UnicodeError, csv.Error) as exc:
        # Still best-effort (the caller keeps going), but no longer invisible.
        print(f"writecsv: skipped row for {t!r}: {exc!r}")

In [7]:
def RemoveTag(htmlString):
    """Strip HTML tags from ``htmlString`` and flatten it onto one line.

    Everything from a ``<`` up to the nearest following ``>`` is deleted, then
    every newline is replaced by a single space. Character references such as
    ``&nbsp;`` or ``&lt;`` are left untouched.

    Args:
        htmlString: the HTML text to clean.

    Returns:
        str: the text without tags and without newlines.
    """
    # DOTALL lets "." cross line breaks, so a tag whose attributes wrap onto
    # the next line is removed too instead of leaking into the result.
    without_tags = re.sub(r'<.*?>', '', htmlString, flags=re.DOTALL)
    return without_tags.replace("\n", " ")

In [11]:
# Scrape driver: for every problem slug, walk the first pages of its community
# solutions and append one CSV row per code snippet found in each post.
MAX_SOLUTION_PAGES = 7  # pages requested per problem through get_sol_by_slug


def _slugify_title(raw_title):
    """Lower-case ``raw_title`` and join its runs of word characters with hyphens."""
    # Every non-word character becomes a space first, so split() only ever
    # sees plain spaces; an empty or punctuation-only title yields ''
    # instead of raising IndexError.
    return "-".join(re.sub(r'[^\w]', " ", raw_title).lower().split())


def _class_snippets(post_markdown):
    """Yield, for each ``` fenced block, the text from its first "class" to the block end."""
    # Splitting on the fence marker leaves the fenced bodies at the odd
    # indexes, so a "class" in the surrounding prose is never mistaken for
    # code. A fence that is never closed runs to the end of the post.
    for fenced_body in post_markdown.split("```")[1::2]:
        class_at = fenced_body.find("class")
        if class_at != -1:
            yield fenced_body[class_at:]


# Start a fresh output file containing only the header row.
with open('outputT.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['question', 'context', 'answer'])

titles = get_problems()
for t in titles:
    try:
        # Fetch the problem statement for t (t is the problem's title slug)
        # and strip its tags.
        frontend_id, leetcodeDesc = get_problem_by_slug(t)
        leetcodeDesc = RemoveTag(leetcodeDesc)
    except Exception as exc:
        # One unreachable problem must not abort the whole run. Catching
        # Exception (not a bare except) keeps Ctrl-C working.
        print(f"{t}: problem lookup failed ({exc!r}); skipping problem")
        continue

    for page in range(MAX_SOLUTION_PAGES):
        try:
            solutions = get_sol_by_slug(t, page)
        except Exception as exc:
            print(f"{t}: solutions page {page} failed ({exc!r}); skipping page")
            continue
        if not solutions:
            # An empty page means the listing is exhausted; later pages only
            # skip further, so requesting them would return nothing new.
            break

        for solution in solutions:
            try:
                title = _slugify_title(solution['title'])
                print(f"{t}, {solution['id']}")
                post_body = getSolution(t, solution['id'], title)
                for counter, snippet in enumerate(_class_snippets(post_body), start=1):
                    # Flatten both escaped ("\\n") and real newlines to spaces.
                    snippet = snippet.replace("\\n", " ").replace("\n", " ")
                    writecsv(snippet, frontend_id + t, title, solution['id'], counter, leetcodeDesc)
            except Exception as exc:
                # Skip only the failing post, not the rest of its page.
                print(f"{t}: solution {solution.get('id') if isinstance(solution, dict) else solution!r} failed ({exc!r}); skipping")
                continue
            

divisible-and-non-divisible-sums-difference, 4144364
divisible-and-non-divisible-sums-difference, 4144349
divisible-and-non-divisible-sums-difference, 4145287
divisible-and-non-divisible-sums-difference, 4144754
divisible-and-non-divisible-sums-difference, 4145763
divisible-and-non-divisible-sums-difference, 4146679
divisible-and-non-divisible-sums-difference, 4144201
divisible-and-non-divisible-sums-difference, 4144019
divisible-and-non-divisible-sums-difference, 4144599
divisible-and-non-divisible-sums-difference, 4144597
divisible-and-non-divisible-sums-difference, 4144513
divisible-and-non-divisible-sums-difference, 4144407
divisible-and-non-divisible-sums-difference, 4143960
divisible-and-non-divisible-sums-difference, 4143946
divisible-and-non-divisible-sums-difference, 4143913
divisible-and-non-divisible-sums-difference, 4147362
divisible-and-non-divisible-sums-difference, 4153206
divisible-and-non-divisible-sums-difference, 4150114
divisible-and-non-divisible-sums-difference, 4

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the scraped rows and split them 70/30 into training and evaluation sets.
# NOTE(review): this reads "output.csv" while the scraping cell writes
# "outputT.csv" — confirm which file is meant to feed the split.
SPLIT_SEED = 42  # fixed so that re-running the cell reproduces the same partition

df=pd.read_csv("output.csv")
df_train,df_test=train_test_split(df,test_size=0.3,random_state=SPLIT_SEED)

# One JSON object per line (orient='records', lines=True), one file per split.
df_train.to_json("mydata_train.jsonl", orient='records', lines=True)
df_test.to_json("mydata_eval.jsonl", orient='records', lines=True)

In [None]:
# Plain-text dump of the training split; in the saved output below pandas
# elides the middle rows ("...") and wraps the columns.
print(df_train)

                                                question  \
2199               minimize-the-total-price-of-the-trips   
11511                                       car-fleet-ii   
17973                number-of-visible-people-in-a-queue   
30794                              contains-duplicate-ii   
1820                find-the-losers-of-the-circular-game   
...                                                  ...   
15744    number-of-times-binary-string-is-prefix-aligned   
12537                                      stone-game-vi   
30833                                 contains-duplicate   
34319                                   integer-to-roman   
17704  longest-arithmetic-subsequence-of-given-differ...   

                                                 context  \
2199   <p>There exists an undirected and unrooted tre...   
11511  <p>There are <code>n</code> cars traveling at ...   
17973  <p>There are <code>n</code> people standing in...   
30794  <p>Given an integer array <code>