-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.ts
133 lines (126 loc) · 3.97 KB
/
parser.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import {
DOMParser,
HTMLDocument,
Element,
} from "https://deno.land/x/deno_dom@v0.1.35-alpha/deno-dom-wasm.ts";
import { CRAWLERS } from "./main.ts";
/**
 * Normalises a fragment of scraped post HTML.
 *
 * Steps, in order:
 *  1. drop HTML comments (including multi-line ones),
 *  2. drop the `<br>_________________<br>` separator line,
 *  3. collapse runs of two or more `<br>` (optionally separated by whitespace)
 *     into a single `<br>`,
 *  4. trim surrounding whitespace,
 *  5. strip any leading and trailing run of spaces and `<br>` tags.
 *
 * @param html Raw HTML fragment.
 * @returns The cleaned fragment; an empty string if nothing but
 *          spaces/`<br>` remains.
 */
export function trimHtml(html: string): string {
  return (
    html
      .replace(/<!--[\s\S]*?-->/g, "")
      .replace(/<br>_________________<br>/g, "")
      .replace(/(<br>\s*){2,}/g, "<br>")
      .trim()
      // Strip the edges with two anchored replaces instead of a single
      // `^…(.*?)…$` pattern: `.` does not match line terminators, so the
      // single-pattern form silently did nothing for multi-line content.
      .replace(/^(?: |<br>)+/, "")
      .replace(/(?: |<br>)+$/, "")
  );
}
/**
 * Extracts the cleaned inner HTML of every `.item.text` element in a page.
 *
 * For each such element the function removes, in place:
 *  - direct children carrying the `twoRowsBlock` class,
 *  - every descendant matching `script`, `.quotetitle`, `.quotecontent`,
 *    `.twoRowsBlock` or `.post_signature`,
 *  - every `<b>` whose text starts with
 *    "The OA will be automatically revealed on",
 * and then passes the remaining `innerHTML` through `trimHtml`.
 *
 * Note that the document passed in is mutated.
 *
 * @param document Parsed page.
 * @returns One cleaned HTML string per `.item.text` element, in document order.
 */
export function parseContentsFromDocument(document: HTMLDocument) {
  const unwantedSelector =
    "script, .quotetitle, .quotecontent, .twoRowsBlock, .post_signature";

  return Array.from(document.querySelectorAll(".item.text")).map((item) => {
    const post = item as Element;

    // NOTE(review): only the `twoRowsBlock` children themselves are removed
    // here. If the intent was to also drop every sibling that follows the
    // first such block, this loop needs a flag that persists across
    // iterations — confirm against real pages.
    Array.from(post.children)
      .filter((child) => child.classList.contains("twoRowsBlock"))
      .forEach((child) => {
        child._remove();
      });

    const unwanted = post.querySelectorAll(unwantedSelector);
    unwanted.forEach((node) => {
      node._remove();
    });

    post.getElementsByTagName("b").forEach((bold) => {
      const isRevealNotice = bold.innerText.startsWith(
        "The OA will be automatically revealed on"
      );
      if (isRevealNotice) {
        bold.remove();
      }
    });

    return trimHtml(post.innerHTML);
  });
}
/**
 * Collects the text of every tag link on the page.
 *
 * Tag links are the `<a>` elements that are direct children of `#taglist`
 * and have no `onclick` attribute.
 *
 * @param document Parsed page.
 * @returns The `textContent` of each matching link, in document order.
 */
export function parseTagsFromDocument(document: HTMLDocument) {
  const tagLinks = document.querySelectorAll("#taglist > a:not([onclick])");
  return Array.from(tagLinks, (link) => link.textContent);
}
/**
 * Splits `<br>`-separated content into the question text and its answer
 * choices.
 *
 * Each line is trimmed. A line counts as an answer when it starts with a
 * single letter (case-insensitive) followed by `.`, `:` or `)` — optionally
 * with an opening `(` — and that prefix plus any following whitespace is
 * removed from the stored answer. All other lines are joined back with
 * `<br>` to form the question.
 *
 * @param content HTML fragment using `<br>` as the line separator.
 * @returns `answers` in their original order and the remaining `question`.
 */
export function parseQuestionAndAnswersFromContent(content: string) {
  const answerPrefix = /^([A-Z]\.|[A-Z]:|\(?[A-Z]\))\s*/i;
  const answers: string[] = [];
  const questionLines: string[] = [];

  for (const rawLine of content.split("<br>")) {
    const line = rawLine.trim();
    if (answerPrefix.test(line)) {
      answers.push(line.replace(answerPrefix, ""));
    } else {
      questionLines.push(line);
    }
  }

  return {
    answers,
    question: questionLines.join("<br>"),
  };
}
/**
 * Splits reading-comprehension content into the passage and its
 * sub-questions.
 *
 * The content is expected to contain two `.bbcodeBoxIn` elements: the inner
 * HTML of the first is returned as `question`, and the second is scanned for
 * sub-questions. Inside the second box, every element with the
 * `placeholderTimerRC` class starts a new sub-question; subsequent nodes are
 * appended to the current one (`outerHTML` for elements, `"<br>" + text` for
 * non-element nodes). `<br>` elements are skipped.
 *
 * Robustness:
 *  - If the second box is missing, no sub-questions are returned (and
 *    `question` falls back to `""` when there is no box at all) instead of
 *    handing `undefined` to the HTML parser.
 *  - Nodes that appear before the first `placeholderTimerRC` marker are
 *    ignored, because there is no sub-question to attach them to yet.
 *
 * @param questionContent HTML of the whole post.
 * @returns The passage HTML and the raw HTML of each sub-question.
 */
export function parseSubQuestionsFromQuestion(questionContent: string) {
  const dom = new DOMParser().parseFromString(questionContent, "text/html")!;
  const [question, subQuestionsContent] = Array.from(
    dom.querySelectorAll(".bbcodeBoxIn")
  ).map((box) => (box as Element).innerHTML);

  const subQuestions: string[] = [];
  if (subQuestionsContent === undefined) {
    return { question: question ?? "", subQuestions };
  }

  const questionDiv = new DOMParser().parseFromString(
    subQuestionsContent,
    "text/html"
  )!;

  for (const node of Array.from(questionDiv.body.childNodes)) {
    if (node instanceof Element) {
      if (node.tagName.toUpperCase() === "BR") {
        continue;
      }
      if (node.classList.contains("placeholderTimerRC")) {
        subQuestions.push("");
        continue;
      }
    }
    // Nothing to append to until the first marker has been seen; writing to
    // index -1 would only create a stray "-1" property on the array.
    if (subQuestions.length === 0) {
      continue;
    }
    const piece =
      node instanceof Element ? node.outerHTML : "<br>" + node.textContent;
    subQuestions[subQuestions.length - 1] += piece;
  }

  return { question, subQuestions };
}
/**
 * Turns the raw HTML of a post into a structured question.
 *
 * The content is first cleaned with `trimHtml`. Then:
 *  - for type `"RC"` the result carries the passage as `question` and a
 *    `subQuestions` array, each entry split into its own question/answers;
 *  - for every other type the result carries `question` and `answers`, where
 *    `answers` is the shared `DSAnswers` list when the type is `"DS"` and the
 *    answers parsed from the content otherwise.
 *
 * @param rawContent Untrimmed HTML of the post.
 * @param type Crawler/question type key.
 * @returns An object tagged with `type` in one of the two shapes above.
 */
export function parseQuestionFromRawContent(
  rawContent: string,
  type: keyof typeof CRAWLERS
) {
  const content = trimHtml(rawContent);

  if (type !== "RC") {
    const parsed = parseQuestionAndAnswersFromContent(content);
    return {
      type,
      question: parsed.question,
      answers: type === "DS" ? DSAnswers : parsed.answers,
    };
  }

  const passage = parseSubQuestionsFromQuestion(content);
  return {
    type,
    question: passage.question,
    subQuestions: passage.subQuestions.map((subContent) =>
      parseQuestionAndAnswersFromContent(subContent)
    ),
  };
}
/**
 * Fixed list of answer choices returned by `parseQuestionFromRawContent` for
 * questions of type "DS", in place of answers parsed from the post content.
 *
 * The same array instance is handed to every caller, so it should be treated
 * as read-only.
 *
 * NOTE(review): the last entry ends with a trailing space — presumably
 * unintentional; confirm nothing compares against the exact string before
 * changing it.
 */
export const DSAnswers = [
"Statement (1) ALONE is sufficient but statement (2) ALONE is not sufficient.",
"Statement (2) ALONE is sufficient but statement (1) ALONE is not sufficient.",
"BOTH statements TOGETHER are sufficient, but NEITHER statement ALONE is sufficient.",
"EACH statement ALONE is sufficient.",
"Statements (1) and (2) TOGETHER are not sufficient. ",
];