Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
tcllib/modules/htmlparse/htmlparse.tcl
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1444 lines (1337 sloc)
59.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # htmlparse.tcl -- | |
| # | |
| # This file implements a simple HTML parsing library in Tcl. | |
| # It may take advantage of parsers coded in C in the future. | |
| # | |
| # The functionality here is a subset of the | |
| # | |
| # Simple HTML display library by Stephen Uhler (stephen.uhler@sun.com) | |
| # Copyright (c) 1995 by Sun Microsystems | |
| # Version 0.3 Fri Sep 1 10:47:17 PDT 1995 | |
| # | |
| # The main restriction is that all Tk-related code in the above | |
| # was left out of the code here. It is expected that this code | |
| # will go into a 'tklib' in the future. | |
| # | |
| # Copyright (c) 2001 by ActiveState Tool Corp. | |
| # See the file license.terms. | |
| package require Tcl 8.2 | |
| package require struct::stack | |
| package require cmdline 1.1 | |
| namespace eval ::htmlparse { | |
| namespace export \ | |
| parse \ | |
| debugCallback \ | |
| mapEscapes \ | |
| 2tree \ | |
| removeVisualFluff \ | |
| removeFormDefs | |
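| # Table of named character entities. Maps each entity name to the | |
| # Tcl backslash escape (\xHH or \uHHHH) for its character(s). The | |
| # braces around each table below suppress substitution, so the | |
| # values are stored here as literal escape text. | |
| # See http://htmlhelp.org/reference/html40/entities/ | |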
| variable namedEntities | |
| # I. Latin-1 Entities (HTML 4.01) | |
| array set namedEntities { | |
| nbsp \xa0 iexcl \xa1 cent \xa2 pound \xa3 curren \xa4 | |
| yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9 | |
| ordf \xaa laquo \xab not \xac shy \xad reg \xae | |
| macr \xaf deg \xb0 plusmn \xb1 sup2 \xb2 sup3 \xb3 | |
| acute \xb4 micro \xb5 para \xb6 middot \xb7 cedil \xb8 | |
| sup1 \xb9 ordm \xba raquo \xbb frac14 \xbc frac12 \xbd | |
| frac34 \xbe iquest \xbf Agrave \xc0 Aacute \xc1 Acirc \xc2 | |
| Atilde \xc3 Auml \xc4 Aring \xc5 AElig \xc6 Ccedil \xc7 | |
| Egrave \xc8 Eacute \xc9 Ecirc \xca Euml \xcb Igrave \xcc | |
| Iacute \xcd Icirc \xce Iuml \xcf ETH \xd0 Ntilde \xd1 | |
| Ograve \xd2 Oacute \xd3 Ocirc \xd4 Otilde \xd5 Ouml \xd6 | |
| times \xd7 Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc \xdb | |
| Uuml \xdc Yacute \xdd THORN \xde szlig \xdf agrave \xe0 | |
| aacute \xe1 acirc \xe2 atilde \xe3 auml \xe4 aring \xe5 | |
| aelig \xe6 ccedil \xe7 egrave \xe8 eacute \xe9 ecirc \xea | |
| euml \xeb igrave \xec iacute \xed icirc \xee iuml \xef | |
| eth \xf0 ntilde \xf1 ograve \xf2 oacute \xf3 ocirc \xf4 | |
| otilde \xf5 ouml \xf6 divide \xf7 oslash \xf8 ugrave \xf9 | |
| uacute \xfa ucirc \xfb uuml \xfc yacute \xfd thorn \xfe | |
| yuml \xff | |
| } | |
| # II. Entities for Symbols and Greek Letters (HTML 4.01) | |
| array set namedEntities { | |
| fnof \u192 Alpha \u391 Beta \u392 Gamma \u393 Delta \u394 | |
| Epsilon \u395 Zeta \u396 Eta \u397 Theta \u398 Iota \u399 | |
| Kappa \u39A Lambda \u39B Mu \u39C Nu \u39D Xi \u39E | |
| Omicron \u39F Pi \u3A0 Rho \u3A1 Sigma \u3A3 Tau \u3A4 | |
| Upsilon \u3A5 Phi \u3A6 Chi \u3A7 Psi \u3A8 Omega \u3A9 | |
| alpha \u3B1 beta \u3B2 gamma \u3B3 delta \u3B4 epsilon \u3B5 | |
| zeta \u3B6 eta \u3B7 theta \u3B8 iota \u3B9 kappa \u3BA | |
| lambda \u3BB mu \u3BC nu \u3BD xi \u3BE omicron \u3BF | |
| pi \u3C0 rho \u3C1 sigmaf \u3C2 sigma \u3C3 tau \u3C4 | |
| upsilon \u3C5 phi \u3C6 chi \u3C7 psi \u3C8 omega \u3C9 | |
| thetasym \u3D1 upsih \u3D2 piv \u3D6 bull \u2022 | |
| hellip \u2026 prime \u2032 Prime \u2033 oline \u203E | |
| frasl \u2044 weierp \u2118 image \u2111 real \u211C | |
| trade \u2122 alefsym \u2135 larr \u2190 uarr \u2191 | |
| rarr \u2192 darr \u2193 harr \u2194 crarr \u21B5 | |
| lArr \u21D0 uArr \u21D1 rArr \u21D2 dArr \u21D3 hArr \u21D4 | |
| forall \u2200 part \u2202 exist \u2203 empty \u2205 | |
| nabla \u2207 isin \u2208 notin \u2209 ni \u220B prod \u220F | |
| sum \u2211 minus \u2212 lowast \u2217 radic \u221A | |
| prop \u221D infin \u221E ang \u2220 and \u2227 or \u2228 | |
| cap \u2229 cup \u222A int \u222B there4 \u2234 sim \u223C | |
| cong \u2245 asymp \u2248 ne \u2260 equiv \u2261 le \u2264 | |
| ge \u2265 sub \u2282 sup \u2283 nsub \u2284 sube \u2286 | |
| supe \u2287 oplus \u2295 otimes \u2297 perp \u22A5 | |
| sdot \u22C5 lceil \u2308 rceil \u2309 lfloor \u230A | |
| rfloor \u230B lang \u2329 rang \u232A loz \u25CA | |
| spades \u2660 clubs \u2663 hearts \u2665 diams \u2666 | |
| } | |
| # III. Special Entities (HTML 4.01) | |
| array set namedEntities { | |
| quot \x22 amp \x26 lt \x3C gt \x3E OElig \u152 oelig \u153 | |
| Scaron \u160 scaron \u161 Yuml \u178 circ \u2C6 | |
| tilde \u2DC ensp \u2002 emsp \u2003 thinsp \u2009 | |
| zwnj \u200C zwj \u200D lrm \u200E rlm \u200F ndash \u2013 | |
| mdash \u2014 lsquo \u2018 rsquo \u2019 sbquo \u201A | |
| ldquo \u201C rdquo \u201D bdquo \u201E dagger \u2020 | |
| Dagger \u2021 permil \u2030 lsaquo \u2039 rsaquo \u203A | |
| euro \u20AC | |
| } | |
| # IV. Special Entities (XHTML, XML) | |
| array set namedEntities { | |
| apos \u0027 | |
| } | |
| # HTML5 section 8.5 Named character references (additions only) | |
| # http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html | |
| array set namedEntities { | |
| Abreve \u102 abreve \u103 ac \u223e acd \u223f | |
| acE \u223e\u333 Acy \u410 acy \u430 af \u2061 | |
| Afr \ud835\udd04 afr \ud835\udd1e aleph \u2135 Amacr \u100 | |
| amacr \u101 amalg \u2a3f AMP \u26 andand \u2a55 And \u2a53 | |
| andd \u2a5c andslope \u2a58 andv \u2a5a ange \u29a4 | |
| angle \u2220 angmsdaa \u29a8 angmsdab \u29a9 angmsdac \u29aa | |
| angmsdad \u29ab angmsdae \u29ac angmsdaf \u29ad | |
| angmsdag \u29ae angmsdah \u29af angmsd \u2221 angrt \u221f | |
| angrtvb \u22be angrtvbd \u299d angsph \u2222 angst \uc5 | |
| angzarr \u237c Aogon \u104 aogon \u105 Aopf \ud835\udd38 | |
| aopf \ud835\udd52 apacir \u2a6f ap \u2248 apE \u2a70 | |
| ape \u224a apid \u224b ApplyFunction \u2061 approx \u2248 | |
| approxeq \u224a Ascr \ud835\udc9c ascr \ud835\udcb6 | |
| Assign \u2254 ast \u2a asympeq \u224d awconint \u2233 | |
| awint \u2a11 backcong \u224c backepsilon \u3f6 | |
| backprime \u2035 backsim \u223d backsimeq \u22cd | |
| Backslash \u2216 Barv \u2ae7 barvee \u22bd barwed \u2305 | |
| Barwed \u2306 barwedge \u2305 bbrk \u23b5 bbrktbrk \u23b6 | |
| bcong \u224c Bcy \u411 bcy \u431 becaus \u2235 because \u2235 | |
| Because \u2235 bemptyv \u29b0 bepsi \u3f6 bernou \u212c | |
| Bernoullis \u212c beth \u2136 between \u226c Bfr \ud835\udd05 | |
| bfr \ud835\udd1f bigcap \u22c2 bigcirc \u25ef bigcup \u22c3 | |
| bigodot \u2a00 bigoplus \u2a01 bigotimes \u2a02 | |
| bigsqcup \u2a06 bigstar \u2605 bigtriangledown \u25bd | |
| bigtriangleup \u25b3 biguplus \u2a04 bigvee \u22c1 | |
| bigwedge \u22c0 bkarow \u290d blacklozenge \u29eb | |
| blacksquare \u25aa blacktriangle \u25b4 | |
| blacktriangledown \u25be blacktriangleleft \u25c2 | |
| blacktriangleright \u25b8 blank \u2423 blk12 \u2592 | |
| blk14 \u2591 blk34 \u2593 block \u2588 bne \u3d\u20e5 | |
| bnequiv \u2261\u20e5 bNot \u2aed bnot \u2310 Bopf \ud835\udd39 | |
| bopf \ud835\udd53 bot \u22a5 bottom \u22a5 bowtie \u22c8 | |
| boxbox \u29c9 boxdl \u2510 boxdL \u2555 boxDl \u2556 | |
| boxDL \u2557 boxdr \u250c boxdR \u2552 boxDr \u2553 | |
| boxDR \u2554 boxh \u2500 boxH \u2550 boxhd \u252c | |
| boxHd \u2564 boxhD \u2565 boxHD \u2566 boxhu \u2534 | |
| boxHu \u2567 boxhU \u2568 boxHU \u2569 boxminus \u229f | |
| boxplus \u229e boxtimes \u22a0 boxul \u2518 boxuL \u255b | |
| boxUl \u255c boxUL \u255d boxur \u2514 boxuR \u2558 | |
| boxUr \u2559 boxUR \u255a boxv \u2502 boxV \u2551 | |
| boxvh \u253c boxvH \u256a boxVh \u256b boxVH \u256c | |
| boxvl \u2524 boxvL \u2561 boxVl \u2562 boxVL \u2563 | |
| boxvr \u251c boxvR \u255e boxVr \u255f boxVR \u2560 | |
| bprime \u2035 breve \u2d8 Breve \u2d8 bscr \ud835\udcb7 | |
| Bscr \u212c bsemi \u204f bsim \u223d bsime \u22cd | |
| bsolb \u29c5 bsol \u5c bsolhsub \u27c8 bullet \u2022 | |
| bump \u224e bumpE \u2aae bumpe \u224f Bumpeq \u224e | |
| bumpeq \u224f Cacute \u106 cacute \u107 capand \u2a44 | |
| capbrcup \u2a49 capcap \u2a4b Cap \u22d2 capcup \u2a47 | |
| capdot \u2a40 CapitalDifferentialD \u2145 caps \u2229\ufe00 | |
| caret \u2041 caron \u2c7 Cayleys \u212d ccaps \u2a4d | |
| Ccaron \u10c ccaron \u10d Ccirc \u108 ccirc \u109 | |
| Cconint \u2230 ccups \u2a4c ccupssm \u2a50 Cdot \u10a | |
| cdot \u10b Cedilla \ub8 cemptyv \u29b2 centerdot \ub7 | |
| CenterDot \ub7 cfr \ud835\udd20 Cfr \u212d CHcy \u427 | |
| chcy \u447 check \u2713 checkmark \u2713 circeq \u2257 | |
| circlearrowleft \u21ba circlearrowright \u21bb | |
| circledast \u229b circledcirc \u229a circleddash \u229d | |
| CircleDot \u2299 circledR \uae circledS \u24c8 | |
| CircleMinus \u2296 CirclePlus \u2295 CircleTimes \u2297 | |
| cir \u25cb cirE \u29c3 cire \u2257 cirfnint \u2a10 | |
| cirmid \u2aef cirscir \u29c2 ClockwiseContourIntegral \u2232 | |
| CloseCurlyDoubleQuote \u201d CloseCurlyQuote \u2019 | |
| clubsuit \u2663 colon \u3a Colon \u2237 Colone \u2a74 | |
| colone \u2254 coloneq \u2254 comma \u2c commat \u40 | |
| comp \u2201 compfn \u2218 complement \u2201 complexes \u2102 | |
| congdot \u2a6d Congruent \u2261 conint \u222e Conint \u222f | |
| ContourIntegral \u222e copf \ud835\udd54 Copf \u2102 | |
| coprod \u2210 Coproduct \u2210 COPY \ua9 copysr \u2117 | |
| CounterClockwiseContourIntegral \u2233 cross \u2717 | |
| Cross \u2a2f Cscr \ud835\udc9e cscr \ud835\udcb8 csub \u2acf | |
| csube \u2ad1 csup \u2ad0 csupe \u2ad2 ctdot \u22ef | |
| cudarrl \u2938 cudarrr \u2935 cuepr \u22de cuesc \u22df | |
| cularr \u21b6 cularrp \u293d cupbrcap \u2a48 cupcap \u2a46 | |
| CupCap \u224d Cup \u22d3 cupcup \u2a4a cupdot \u228d | |
| cupor \u2a45 cups \u222a\ufe00 curarr \u21b7 curarrm \u293c | |
| curlyeqprec \u22de curlyeqsucc \u22df curlyvee \u22ce | |
| curlywedge \u22cf curvearrowleft \u21b6 curvearrowright \u21b7 | |
| cuvee \u22ce cuwed \u22cf cwconint \u2232 cwint \u2231 | |
| cylcty \u232d daleth \u2138 Darr \u21a1 dash \u2010 | |
| Dashv \u2ae4 dashv \u22a3 dbkarow \u290f dblac \u2dd | |
| Dcaron \u10e dcaron \u10f Dcy \u414 dcy \u434 ddagger \u2021 | |
| ddarr \u21ca DD \u2145 dd \u2146 DDotrahd \u2911 | |
| ddotseq \u2a77 Del \u2207 demptyv \u29b1 dfisht \u297f | |
| Dfr \ud835\udd07 dfr \ud835\udd21 dHar \u2965 dharl \u21c3 | |
| dharr \u21c2 DiacriticalAcute \ub4 DiacriticalDot \u2d9 | |
| DiacriticalDoubleAcute \u2dd DiacriticalGrave \u60 | |
| DiacriticalTilde \u2dc diam \u22c4 diamond \u22c4 | |
| Diamond \u22c4 diamondsuit \u2666 die \ua8 | |
| DifferentialD \u2146 digamma \u3dd disin \u22f2 div \uf7 | |
| divideontimes \u22c7 divonx \u22c7 DJcy \u402 djcy \u452 | |
| dlcorn \u231e dlcrop \u230d dollar \u24 Dopf \ud835\udd3b | |
| dopf \ud835\udd55 Dot \ua8 dot \u2d9 DotDot \u20dc | |
| doteq \u2250 doteqdot \u2251 DotEqual \u2250 dotminus \u2238 | |
| dotplus \u2214 dotsquare \u22a1 doublebarwedge \u2306 | |
| DoubleContourIntegral \u222f DoubleDot \ua8 | |
| DoubleDownArrow \u21d3 DoubleLeftArrow \u21d0 | |
| DoubleLeftRightArrow \u21d4 DoubleLeftTee \u2ae4 | |
| DoubleLongLeftArrow \u27f8 DoubleLongLeftRightArrow \u27fa | |
| DoubleLongRightArrow \u27f9 DoubleRightArrow \u21d2 | |
| DoubleRightTee \u22a8 DoubleUpArrow \u21d1 | |
| DoubleUpDownArrow \u21d5 DoubleVerticalBar \u2225 | |
| DownArrowBar \u2913 downarrow \u2193 DownArrow \u2193 | |
| Downarrow \u21d3 DownArrowUpArrow \u21f5 DownBreve \u311 | |
| downdownarrows \u21ca downharpoonleft \u21c3 | |
| downharpoonright \u21c2 DownLeftRightVector \u2950 | |
| DownLeftTeeVector \u295e DownLeftVectorBar \u2956 | |
| DownLeftVector \u21bd DownRightTeeVector \u295f | |
| DownRightVectorBar \u2957 DownRightVector \u21c1 | |
| DownTeeArrow \u21a7 DownTee \u22a4 drbkarow \u2910 | |
| drcorn \u231f drcrop \u230c Dscr \ud835\udc9f | |
| dscr \ud835\udcb9 DScy \u405 dscy \u455 dsol \u29f6 | |
| Dstrok \u110 dstrok \u111 dtdot \u22f1 dtri \u25bf | |
| dtrif \u25be duarr \u21f5 duhar \u296f dwangle \u29a6 | |
| DZcy \u40f dzcy \u45f dzigrarr \u27ff easter \u2a6e | |
| Ecaron \u11a ecaron \u11b ecir \u2256 ecolon \u2255 Ecy \u42d | |
| ecy \u44d eDDot \u2a77 Edot \u116 edot \u117 eDot \u2251 | |
| ee \u2147 efDot \u2252 Efr \ud835\udd08 efr \ud835\udd22 | |
| eg \u2a9a egs \u2a96 egsdot \u2a98 el \u2a99 Element \u2208 | |
| elinters \u23e7 ell \u2113 els \u2a95 elsdot \u2a97 | |
| Emacr \u112 emacr \u113 emptyset \u2205 | |
| EmptySmallSquare \u25fb emptyv \u2205 | |
| EmptyVerySmallSquare \u25ab emsp13 \u2004 emsp14 \u2005 | |
| ENG \u14a eng \u14b Eogon \u118 eogon \u119 Eopf \ud835\udd3c | |
| eopf \ud835\udd56 epar \u22d5 eparsl \u29e3 eplus \u2a71 | |
| epsi \u3b5 epsiv \u3f5 eqcirc \u2256 eqcolon \u2255 | |
| eqsim \u2242 eqslantgtr \u2a96 eqslantless \u2a95 Equal \u2a75 | |
| equals \u3d EqualTilde \u2242 equest \u225f Equilibrium \u21cc | |
| equivDD \u2a78 eqvparsl \u29e5 erarr \u2971 erDot \u2253 | |
| escr \u212f Escr \u2130 esdot \u2250 Esim \u2a73 esim \u2242 | |
| excl \u21 Exists \u2203 expectation \u2130 exponentiale \u2147 | |
| ExponentialE \u2147 fallingdotseq \u2252 Fcy \u424 fcy \u444 | |
| female \u2640 ffilig \ufb03 fflig \ufb00 ffllig \ufb04 | |
| Ffr \ud835\udd09 ffr \ud835\udd23 filig \ufb01 | |
| FilledSmallSquare \u25fc FilledVerySmallSquare \u25aa | |
| fjlig \u66\u6a flat \u266d fllig \ufb02 fltns \u25b1 | |
| Fopf \ud835\udd3d fopf \ud835\udd57 ForAll \u2200 fork \u22d4 | |
| forkv \u2ad9 Fouriertrf \u2131 fpartint \u2a0d frac13 \u2153 | |
| frac15 \u2155 frac16 \u2159 frac18 \u215b frac23 \u2154 | |
| frac25 \u2156 frac35 \u2157 frac38 \u215c frac45 \u2158 | |
| frac56 \u215a frac58 \u215d frac78 \u215e frown \u2322 | |
| fscr \ud835\udcbb Fscr \u2131 gacute \u1f5 Gammad \u3dc | |
| gammad \u3dd gap \u2a86 Gbreve \u11e gbreve \u11f | |
| Gcedil \u122 Gcirc \u11c gcirc \u11d Gcy \u413 gcy \u433 | |
| Gdot \u120 gdot \u121 gE \u2267 gEl \u2a8c gel \u22db | |
| geq \u2265 geqq \u2267 geqslant \u2a7e gescc \u2aa9 | |
| ges \u2a7e gesdot \u2a80 gesdoto \u2a82 gesdotol \u2a84 | |
| gesl \u22db\ufe00 gesles \u2a94 Gfr \ud835\udd0a | |
| gfr \ud835\udd24 gg \u226b Gg \u22d9 ggg \u22d9 gimel \u2137 | |
| GJcy \u403 gjcy \u453 gla \u2aa5 gl \u2277 glE \u2a92 | |
| glj \u2aa4 gnap \u2a8a gnapprox \u2a8a gne \u2a88 gnE \u2269 | |
| gneq \u2a88 gneqq \u2269 gnsim \u22e7 Gopf \ud835\udd3e | |
| gopf \ud835\udd58 grave \u60 GreaterEqual \u2265 | |
| GreaterEqualLess \u22db GreaterFullEqual \u2267 | |
| GreaterGreater \u2aa2 GreaterLess \u2277 | |
| GreaterSlantEqual \u2a7e GreaterTilde \u2273 Gscr \ud835\udca2 | |
| gscr \u210a gsim \u2273 gsime \u2a8e gsiml \u2a90 gtcc \u2aa7 | |
| gtcir \u2a7a GT \u3e Gt \u226b gtdot \u22d7 gtlPar \u2995 | |
| gtquest \u2a7c gtrapprox \u2a86 gtrarr \u2978 gtrdot \u22d7 | |
| gtreqless \u22db gtreqqless \u2a8c gtrless \u2277 | |
| gtrsim \u2273 gvertneqq \u2269\ufe00 gvnE \u2269\ufe00 | |
| Hacek \u2c7 hairsp \u200a half \ubd hamilt \u210b | |
| HARDcy \u42a hardcy \u44a harrcir \u2948 harrw \u21ad | |
| Hat \u5e hbar \u210f Hcirc \u124 hcirc \u125 heartsuit \u2665 | |
| hercon \u22b9 hfr \ud835\udd25 Hfr \u210c HilbertSpace \u210b | |
| hksearow \u2925 hkswarow \u2926 hoarr \u21ff homtht \u223b | |
| hookleftarrow \u21a9 hookrightarrow \u21aa hopf \ud835\udd59 | |
| Hopf \u210d horbar \u2015 HorizontalLine \u2500 | |
| hscr \ud835\udcbd Hscr \u210b hslash \u210f Hstrok \u126 | |
| hstrok \u127 HumpDownHump \u224e HumpEqual \u224f | |
| hybull \u2043 hyphen \u2010 ic \u2063 Icy \u418 icy \u438 | |
| Idot \u130 IEcy \u415 iecy \u435 iff \u21d4 ifr \ud835\udd26 | |
| Ifr \u2111 ii \u2148 iiiint \u2a0c iiint \u222d iinfin \u29dc | |
| iiota \u2129 IJlig \u132 ijlig \u133 Imacr \u12a imacr \u12b | |
| ImaginaryI \u2148 imagline \u2110 imagpart \u2111 imath \u131 | |
| Im \u2111 imof \u22b7 imped \u1b5 Implies \u21d2 | |
| incare \u2105 in \u2208 infintie \u29dd inodot \u131 | |
| intcal \u22ba Int \u222c integers \u2124 Integral \u222b | |
| intercal \u22ba Intersection \u22c2 intlarhk \u2a17 | |
| intprod \u2a3c InvisibleComma \u2063 InvisibleTimes \u2062 | |
| IOcy \u401 iocy \u451 Iogon \u12e iogon \u12f | |
| Iopf \ud835\udd40 iopf \ud835\udd5a iprod \u2a3c | |
| iscr \ud835\udcbe Iscr \u2110 isindot \u22f5 isinE \u22f9 | |
| isins \u22f4 isinsv \u22f3 isinv \u2208 it \u2062 | |
| Itilde \u128 itilde \u129 Iukcy \u406 iukcy \u456 Jcirc \u134 | |
| jcirc \u135 Jcy \u419 jcy \u439 Jfr \ud835\udd0d | |
| jfr \ud835\udd27 jmath \u237 Jopf \ud835\udd41 | |
| jopf \ud835\udd5b Jscr \ud835\udca5 jscr \ud835\udcbf | |
| Jsercy \u408 jsercy \u458 Jukcy \u404 jukcy \u454 | |
| kappav \u3f0 Kcedil \u136 kcedil \u137 Kcy \u41a kcy \u43a | |
| Kfr \ud835\udd0e kfr \ud835\udd28 kgreen \u138 KHcy \u425 | |
| khcy \u445 KJcy \u40c kjcy \u45c Kopf \ud835\udd42 | |
| kopf \ud835\udd5c Kscr \ud835\udca6 kscr \ud835\udcc0 | |
| lAarr \u21da Lacute \u139 lacute \u13a laemptyv \u29b4 | |
| lagran \u2112 Lang \u27ea langd \u2991 langle \u27e8 | |
| lap \u2a85 Laplacetrf \u2112 larrb \u21e4 larrbfs \u291f | |
| Larr \u219e larrfs \u291d larrhk \u21a9 larrlp \u21ab | |
| larrpl \u2939 larrsim \u2973 larrtl \u21a2 latail \u2919 | |
| lAtail \u291b lat \u2aab late \u2aad lates \u2aad\ufe00 | |
| lbarr \u290c lBarr \u290e lbbrk \u2772 lbrace \u7b | |
| lbrack \u5b lbrke \u298b lbrksld \u298f lbrkslu \u298d | |
| Lcaron \u13d lcaron \u13e Lcedil \u13b lcedil \u13c lcub \u7b | |
| Lcy \u41b lcy \u43b ldca \u2936 ldquor \u201e ldrdhar \u2967 | |
| ldrushar \u294b ldsh \u21b2 lE \u2266 LeftAngleBracket \u27e8 | |
| LeftArrowBar \u21e4 leftarrow \u2190 LeftArrow \u2190 | |
| Leftarrow \u21d0 LeftArrowRightArrow \u21c6 | |
| leftarrowtail \u21a2 LeftCeiling \u2308 | |
| LeftDoubleBracket \u27e6 LeftDownTeeVector \u2961 | |
| LeftDownVectorBar \u2959 LeftDownVector \u21c3 LeftFloor \u230a | |
| leftharpoondown \u21bd leftharpoonup \u21bc | |
| leftleftarrows \u21c7 leftrightarrow \u2194 | |
| LeftRightArrow \u2194 Leftrightarrow \u21d4 | |
| leftrightarrows \u21c6 leftrightharpoons \u21cb | |
| leftrightsquigarrow \u21ad LeftRightVector \u294e | |
| LeftTeeArrow \u21a4 LeftTee \u22a3 LeftTeeVector \u295a | |
| leftthreetimes \u22cb LeftTriangleBar \u29cf | |
| LeftTriangle \u22b2 LeftTriangleEqual \u22b4 | |
| LeftUpDownVector \u2951 LeftUpTeeVector \u2960 | |
| LeftUpVectorBar \u2958 LeftUpVector \u21bf LeftVectorBar \u2952 | |
| LeftVector \u21bc lEg \u2a8b leg \u22da leq \u2264 | |
| leqq \u2266 leqslant \u2a7d lescc \u2aa8 les \u2a7d | |
| lesdot \u2a7f lesdoto \u2a81 lesdotor \u2a83 lesg \u22da\ufe00 | |
| lesges \u2a93 lessapprox \u2a85 lessdot \u22d6 | |
| lesseqgtr \u22da lesseqqgtr \u2a8b LessEqualGreater \u22da | |
| LessFullEqual \u2266 LessGreater \u2276 lessgtr \u2276 | |
| LessLess \u2aa1 lesssim \u2272 LessSlantEqual \u2a7d | |
| LessTilde \u2272 lfisht \u297c Lfr \ud835\udd0f | |
| lfr \ud835\udd29 lg \u2276 lgE \u2a91 lHar \u2962 | |
| lhard \u21bd lharu \u21bc lharul \u296a lhblk \u2584 | |
| LJcy \u409 ljcy \u459 llarr \u21c7 ll \u226a Ll \u22d8 | |
| llcorner \u231e Lleftarrow \u21da llhard \u296b lltri \u25fa | |
| Lmidot \u13f lmidot \u140 lmoustache \u23b0 lmoust \u23b0 | |
| lnap \u2a89 lnapprox \u2a89 lne \u2a87 lnE \u2268 lneq \u2a87 | |
| lneqq \u2268 lnsim \u22e6 loang \u27ec loarr \u21fd | |
| lobrk \u27e6 longleftarrow \u27f5 LongLeftArrow \u27f5 | |
| Longleftarrow \u27f8 longleftrightarrow \u27f7 | |
| LongLeftRightArrow \u27f7 Longleftrightarrow \u27fa | |
| longmapsto \u27fc longrightarrow \u27f6 LongRightArrow \u27f6 | |
| Longrightarrow \u27f9 looparrowleft \u21ab | |
| looparrowright \u21ac lopar \u2985 Lopf \ud835\udd43 | |
| lopf \ud835\udd5d loplus \u2a2d lotimes \u2a34 lowbar \u5f | |
| LowerLeftArrow \u2199 LowerRightArrow \u2198 lozenge \u25ca | |
| lozf \u29eb lpar \u28 lparlt \u2993 lrarr \u21c6 | |
| lrcorner \u231f lrhar \u21cb lrhard \u296d lrtri \u22bf | |
| lscr \ud835\udcc1 Lscr \u2112 lsh \u21b0 Lsh \u21b0 | |
| lsim \u2272 lsime \u2a8d lsimg \u2a8f lsqb \u5b lsquor \u201a | |
| Lstrok \u141 lstrok \u142 ltcc \u2aa6 ltcir \u2a79 LT \u3c | |
| Lt \u226a ltdot \u22d6 lthree \u22cb ltimes \u22c9 | |
| ltlarr \u2976 ltquest \u2a7b ltri \u25c3 ltrie \u22b4 | |
| ltrif \u25c2 ltrPar \u2996 lurdshar \u294a luruhar \u2966 | |
| lvertneqq \u2268\ufe00 lvnE \u2268\ufe00 male \u2642 | |
| malt \u2720 maltese \u2720 Map \u2905 map \u21a6 | |
| mapsto \u21a6 mapstodown \u21a7 mapstoleft \u21a4 | |
| mapstoup \u21a5 marker \u25ae mcomma \u2a29 Mcy \u41c | |
| mcy \u43c mDDot \u223a measuredangle \u2221 MediumSpace \u205f | |
| Mellintrf \u2133 Mfr \ud835\udd10 mfr \ud835\udd2a mho \u2127 | |
| midast \u2a midcir \u2af0 mid \u2223 minusb \u229f | |
| minusd \u2238 minusdu \u2a2a MinusPlus \u2213 mlcp \u2adb | |
| mldr \u2026 mnplus \u2213 models \u22a7 Mopf \ud835\udd44 | |
| mopf \ud835\udd5e mp \u2213 mscr \ud835\udcc2 Mscr \u2133 | |
| mstpos \u223e multimap \u22b8 mumap \u22b8 Nacute \u143 | |
| nacute \u144 nang \u2220\u20d2 nap \u2249 napE \u2a70\u338 | |
| napid \u224b\u338 napos \u149 napprox \u2249 natural \u266e | |
| naturals \u2115 natur \u266e nbump \u224e\u338 | |
| nbumpe \u224f\u338 ncap \u2a43 Ncaron \u147 ncaron \u148 | |
| Ncedil \u145 ncedil \u146 ncong \u2247 ncongdot \u2a6d\u338 | |
| ncup \u2a42 Ncy \u41d ncy \u43d nearhk \u2924 nearr \u2197 | |
| neArr \u21d7 nearrow \u2197 nedot \u2250\u338 | |
| NegativeMediumSpace \u200b NegativeThickSpace \u200b | |
| NegativeThinSpace \u200b NegativeVeryThinSpace \u200b | |
| nequiv \u2262 nesear \u2928 nesim \u2242\u338 | |
| NestedGreaterGreater \u226b NestedLessLess \u226a NewLine \ua | |
| nexist \u2204 nexists \u2204 Nfr \ud835\udd11 nfr \ud835\udd2b | |
| ngE \u2267\u338 nge \u2271 ngeq \u2271 ngeqq \u2267\u338 | |
| ngeqslant \u2a7e\u338 nges \u2a7e\u338 nGg \u22d9\u338 | |
| ngsim \u2275 nGt \u226b\u20d2 ngt \u226f ngtr \u226f | |
| nGtv \u226b\u338 nharr \u21ae nhArr \u21ce nhpar \u2af2 | |
| nis \u22fc nisd \u22fa niv \u220b NJcy \u40a njcy \u45a | |
| nlarr \u219a nlArr \u21cd nldr \u2025 nlE \u2266\u338 | |
| nle \u2270 nleftarrow \u219a nLeftarrow \u21cd | |
| nleftrightarrow \u21ae nLeftrightarrow \u21ce nleq \u2270 | |
| nleqq \u2266\u338 nleqslant \u2a7d\u338 nles \u2a7d\u338 | |
| nless \u226e nLl \u22d8\u338 nlsim \u2274 nLt \u226a\u20d2 | |
| nlt \u226e nltri \u22ea nltrie \u22ec nLtv \u226a\u338 | |
| nmid \u2224 NoBreak \u2060 NonBreakingSpace \ua0 | |
| nopf \ud835\udd5f Nopf \u2115 Not \u2aec NotCongruent \u2262 | |
| NotCupCap \u226d NotDoubleVerticalBar \u2226 NotElement \u2209 | |
| NotEqual \u2260 NotEqualTilde \u2242\u338 NotExists \u2204 | |
| NotGreater \u226f NotGreaterEqual \u2271 | |
| NotGreaterFullEqual \u2267\u338 NotGreaterGreater \u226b\u338 | |
| NotGreaterLess \u2279 NotGreaterSlantEqual \u2a7e\u338 | |
| NotGreaterTilde \u2275 NotHumpDownHump \u224e\u338 | |
| NotHumpEqual \u224f\u338 notindot \u22f5\u338 | |
| notinE \u22f9\u338 notinva \u2209 notinvb \u22f7 | |
| notinvc \u22f6 NotLeftTriangleBar \u29cf\u338 | |
| NotLeftTriangle \u22ea NotLeftTriangleEqual \u22ec | |
| NotLess \u226e NotLessEqual \u2270 NotLessGreater \u2278 | |
| NotLessLess \u226a\u338 NotLessSlantEqual \u2a7d\u338 | |
| NotLessTilde \u2274 NotNestedGreaterGreater \u2aa2\u338 | |
| NotNestedLessLess \u2aa1\u338 notni \u220c notniva \u220c | |
| notnivb \u22fe notnivc \u22fd NotPrecedes \u2280 | |
| NotPrecedesEqual \u2aaf\u338 NotPrecedesSlantEqual \u22e0 | |
| NotReverseElement \u220c NotRightTriangleBar \u29d0\u338 | |
| NotRightTriangle \u22eb NotRightTriangleEqual \u22ed | |
| NotSquareSubset \u228f\u338 NotSquareSubsetEqual \u22e2 | |
| NotSquareSuperset \u2290\u338 NotSquareSupersetEqual \u22e3 | |
| NotSubset \u2282\u20d2 NotSubsetEqual \u2288 NotSucceeds \u2281 | |
| NotSucceedsEqual \u2ab0\u338 NotSucceedsSlantEqual \u22e1 | |
| NotSucceedsTilde \u227f\u338 NotSuperset \u2283\u20d2 | |
| NotSupersetEqual \u2289 NotTilde \u2241 NotTildeEqual \u2244 | |
| NotTildeFullEqual \u2247 NotTildeTilde \u2249 | |
| NotVerticalBar \u2224 nparallel \u2226 npar \u2226 | |
| nparsl \u2afd\u20e5 npart \u2202\u338 npolint \u2a14 | |
| npr \u2280 nprcue \u22e0 nprec \u2280 npreceq \u2aaf\u338 | |
| npre \u2aaf\u338 nrarrc \u2933\u338 nrarr \u219b nrArr \u21cf | |
| nrarrw \u219d\u338 nrightarrow \u219b nRightarrow \u21cf | |
| nrtri \u22eb nrtrie \u22ed nsc \u2281 nsccue \u22e1 | |
| nsce \u2ab0\u338 Nscr \ud835\udca9 nscr \ud835\udcc3 | |
| nshortmid \u2224 nshortparallel \u2226 nsim \u2241 | |
| nsime \u2244 nsimeq \u2244 nsmid \u2224 nspar \u2226 | |
| nsqsube \u22e2 nsqsupe \u22e3 nsubE \u2ac5\u338 nsube \u2288 | |
| nsubset \u2282\u20d2 nsubseteq \u2288 nsubseteqq \u2ac5\u338 | |
| nsucc \u2281 nsucceq \u2ab0\u338 nsup \u2285 nsupE \u2ac6\u338 | |
| nsupe \u2289 nsupset \u2283\u20d2 nsupseteq \u2289 | |
| nsupseteqq \u2ac6\u338 ntgl \u2279 ntlg \u2278 | |
| ntriangleleft \u22ea ntrianglelefteq \u22ec | |
| ntriangleright \u22eb ntrianglerighteq \u22ed num \u23 | |
| numero \u2116 numsp \u2007 nvap \u224d\u20d2 nvdash \u22ac | |
| nvDash \u22ad nVdash \u22ae nVDash \u22af nvge \u2265\u20d2 | |
| nvgt \u3e\u20d2 nvHarr \u2904 nvinfin \u29de nvlArr \u2902 | |
| nvle \u2264\u20d2 nvlt \u3c\u20d2 nvltrie \u22b4\u20d2 | |
| nvrArr \u2903 nvrtrie \u22b5\u20d2 nvsim \u223c\u20d2 | |
| nwarhk \u2923 nwarr \u2196 nwArr \u21d6 nwarrow \u2196 | |
| nwnear \u2927 oast \u229b ocir \u229a Ocy \u41e ocy \u43e | |
| odash \u229d Odblac \u150 odblac \u151 odiv \u2a38 | |
| odot \u2299 odsold \u29bc ofcir \u29bf Ofr \ud835\udd12 | |
| ofr \ud835\udd2c ogon \u2db ogt \u29c1 ohbar \u29b5 ohm \u3a9 | |
| oint \u222e olarr \u21ba olcir \u29be olcross \u29bb | |
| olt \u29c0 Omacr \u14c omacr \u14d omid \u29b6 ominus \u2296 | |
| Oopf \ud835\udd46 oopf \ud835\udd60 opar \u29b7 | |
| OpenCurlyDoubleQuote \u201c OpenCurlyQuote \u2018 operp \u29b9 | |
| orarr \u21bb Or \u2a54 ord \u2a5d order \u2134 orderof \u2134 | |
| origof \u22b6 oror \u2a56 orslope \u2a57 orv \u2a5b oS \u24c8 | |
| Oscr \ud835\udcaa oscr \u2134 osol \u2298 otimesas \u2a36 | |
| Otimes \u2a37 ovbar \u233d OverBar \u203e OverBrace \u23de | |
| OverBracket \u23b4 OverParenthesis \u23dc parallel \u2225 | |
| par \u2225 parsim \u2af3 parsl \u2afd PartialD \u2202 | |
| Pcy \u41f pcy \u43f percnt \u25 period \u2e pertenk \u2031 | |
| Pfr \ud835\udd13 pfr \ud835\udd2d phiv \u3d5 phmmat \u2133 | |
| phone \u260e pitchfork \u22d4 planck \u210f planckh \u210e | |
| plankv \u210f plusacir \u2a23 plusb \u229e pluscir \u2a22 | |
| plus \u2b plusdo \u2214 plusdu \u2a25 pluse \u2a72 | |
| PlusMinus \ub1 plussim \u2a26 plustwo \u2a27 pm \ub1 | |
| Poincareplane \u210c pointint \u2a15 popf \ud835\udd61 | |
| Popf \u2119 prap \u2ab7 Pr \u2abb pr \u227a prcue \u227c | |
| precapprox \u2ab7 prec \u227a preccurlyeq \u227c | |
| Precedes \u227a PrecedesEqual \u2aaf PrecedesSlantEqual \u227c | |
| PrecedesTilde \u227e preceq \u2aaf precnapprox \u2ab9 | |
| precneqq \u2ab5 precnsim \u22e8 pre \u2aaf prE \u2ab3 | |
| precsim \u227e primes \u2119 prnap \u2ab9 prnE \u2ab5 | |
| prnsim \u22e8 Product \u220f profalar \u232e profline \u2312 | |
| profsurf \u2313 Proportional \u221d Proportion \u2237 | |
| propto \u221d prsim \u227e prurel \u22b0 Pscr \ud835\udcab | |
| pscr \ud835\udcc5 puncsp \u2008 Qfr \ud835\udd14 | |
| qfr \ud835\udd2e qint \u2a0c qopf \ud835\udd62 Qopf \u211a | |
| qprime \u2057 Qscr \ud835\udcac qscr \ud835\udcc6 | |
| quaternions \u210d quatint \u2a16 quest \u3f questeq \u225f | |
| QUOT \u22 rAarr \u21db race \u223d\u331 Racute \u154 | |
| racute \u155 raemptyv \u29b3 Rang \u27eb rangd \u2992 | |
| range \u29a5 rangle \u27e9 rarrap \u2975 rarrb \u21e5 | |
| rarrbfs \u2920 rarrc \u2933 Rarr \u21a0 rarrfs \u291e | |
| rarrhk \u21aa rarrlp \u21ac rarrpl \u2945 rarrsim \u2974 | |
| Rarrtl \u2916 rarrtl \u21a3 rarrw \u219d ratail \u291a | |
| rAtail \u291c ratio \u2236 rationals \u211a rbarr \u290d | |
| rBarr \u290f RBarr \u2910 rbbrk \u2773 rbrace \u7d | |
| rbrack \u5d rbrke \u298c rbrksld \u298e rbrkslu \u2990 | |
| Rcaron \u158 rcaron \u159 Rcedil \u156 rcedil \u157 rcub \u7d | |
| Rcy \u420 rcy \u440 rdca \u2937 rdldhar \u2969 rdquor \u201d | |
| rdsh \u21b3 realine \u211b realpart \u211c reals \u211d | |
| Re \u211c rect \u25ad REG \uae ReverseElement \u220b | |
| ReverseEquilibrium \u21cb ReverseUpEquilibrium \u296f | |
| rfisht \u297d rfr \ud835\udd2f Rfr \u211c rHar \u2964 | |
| rhard \u21c1 rharu \u21c0 rharul \u296c rhov \u3f1 | |
| RightAngleBracket \u27e9 RightArrowBar \u21e5 rightarrow \u2192 | |
| RightArrow \u2192 Rightarrow \u21d2 RightArrowLeftArrow \u21c4 | |
| rightarrowtail \u21a3 RightCeiling \u2309 | |
| RightDoubleBracket \u27e7 RightDownTeeVector \u295d | |
| RightDownVectorBar \u2955 RightDownVector \u21c2 | |
| RightFloor \u230b rightharpoondown \u21c1 rightharpoonup \u21c0 | |
| rightleftarrows \u21c4 rightleftharpoons \u21cc | |
| rightrightarrows \u21c9 rightsquigarrow \u219d | |
| RightTeeArrow \u21a6 RightTee \u22a2 RightTeeVector \u295b | |
| rightthreetimes \u22cc RightTriangleBar \u29d0 | |
| RightTriangle \u22b3 RightTriangleEqual \u22b5 | |
| RightUpDownVector \u294f RightUpTeeVector \u295c | |
| RightUpVectorBar \u2954 RightUpVector \u21be | |
| RightVectorBar \u2953 RightVector \u21c0 ring \u2da | |
| risingdotseq \u2253 rlarr \u21c4 rlhar \u21cc | |
| rmoustache \u23b1 rmoust \u23b1 rnmid \u2aee roang \u27ed | |
| roarr \u21fe robrk \u27e7 ropar \u2986 ropf \ud835\udd63 | |
| Ropf \u211d roplus \u2a2e rotimes \u2a35 RoundImplies \u2970 | |
| rpar \u29 rpargt \u2994 rppolint \u2a12 rrarr \u21c9 | |
| Rrightarrow \u21db rscr \ud835\udcc7 Rscr \u211b rsh \u21b1 | |
| Rsh \u21b1 rsqb \u5d rsquor \u2019 rthree \u22cc | |
| rtimes \u22ca rtri \u25b9 rtrie \u22b5 rtrif \u25b8 | |
| rtriltri \u29ce RuleDelayed \u29f4 ruluhar \u2968 rx \u211e | |
| Sacute \u15a sacute \u15b scap \u2ab8 Sc \u2abc sc \u227b | |
| sccue \u227d sce \u2ab0 scE \u2ab4 Scedil \u15e scedil \u15f | |
| Scirc \u15c scirc \u15d scnap \u2aba scnE \u2ab6 | |
| scnsim \u22e9 scpolint \u2a13 scsim \u227f Scy \u421 | |
| scy \u441 sdotb \u22a1 sdote \u2a66 searhk \u2925 | |
| searr \u2198 seArr \u21d8 searrow \u2198 semi \u3b | |
| seswar \u2929 setminus \u2216 setmn \u2216 sext \u2736 | |
| Sfr \ud835\udd16 sfr \ud835\udd30 sfrown \u2322 sharp \u266f | |
| SHCHcy \u429 shchcy \u449 SHcy \u428 shcy \u448 | |
| ShortDownArrow \u2193 ShortLeftArrow \u2190 shortmid \u2223 | |
| shortparallel \u2225 ShortRightArrow \u2192 ShortUpArrow \u2191 | |
| sigmav \u3c2 simdot \u2a6a sime \u2243 simeq \u2243 | |
| simg \u2a9e simgE \u2aa0 siml \u2a9d simlE \u2a9f | |
| simne \u2246 simplus \u2a24 simrarr \u2972 slarr \u2190 | |
| SmallCircle \u2218 smallsetminus \u2216 smashp \u2a33 | |
| smeparsl \u29e4 smid \u2223 smile \u2323 smt \u2aaa | |
| smte \u2aac smtes \u2aac\ufe00 SOFTcy \u42c softcy \u44c | |
| solbar \u233f solb \u29c4 sol \u2f Sopf \ud835\udd4a | |
| sopf \ud835\udd64 spadesuit \u2660 spar \u2225 sqcap \u2293 | |
| sqcaps \u2293\ufe00 sqcup \u2294 sqcups \u2294\ufe00 | |
| Sqrt \u221a sqsub \u228f sqsube \u2291 sqsubset \u228f | |
| sqsubseteq \u2291 sqsup \u2290 sqsupe \u2292 sqsupset \u2290 | |
| sqsupseteq \u2292 square \u25a1 Square \u25a1 | |
| SquareIntersection \u2293 SquareSubset \u228f | |
| SquareSubsetEqual \u2291 SquareSuperset \u2290 | |
| SquareSupersetEqual \u2292 SquareUnion \u2294 squarf \u25aa | |
| squ \u25a1 squf \u25aa srarr \u2192 Sscr \ud835\udcae | |
| sscr \ud835\udcc8 ssetmn \u2216 ssmile \u2323 sstarf \u22c6 | |
| Star \u22c6 star \u2606 starf \u2605 straightepsilon \u3f5 | |
| straightphi \u3d5 strns \uaf Sub \u22d0 subdot \u2abd | |
| subE \u2ac5 subedot \u2ac3 submult \u2ac1 subnE \u2acb | |
| subne \u228a subplus \u2abf subrarr \u2979 subset \u2282 | |
| Subset \u22d0 subseteq \u2286 subseteqq \u2ac5 | |
| SubsetEqual \u2286 subsetneq \u228a subsetneqq \u2acb | |
| subsim \u2ac7 subsub \u2ad5 subsup \u2ad3 succapprox \u2ab8 | |
| succ \u227b succcurlyeq \u227d Succeeds \u227b | |
| SucceedsEqual \u2ab0 SucceedsSlantEqual \u227d | |
| SucceedsTilde \u227f succeq \u2ab0 succnapprox \u2aba | |
| succneqq \u2ab6 succnsim \u22e9 succsim \u227f SuchThat \u220b | |
| Sum \u2211 sung \u266a Sup \u22d1 supdot \u2abe | |
| supdsub \u2ad8 supE \u2ac6 supedot \u2ac4 Superset \u2283 | |
| SupersetEqual \u2287 suphsol \u27c9 suphsub \u2ad7 | |
| suplarr \u297b supmult \u2ac2 supnE \u2acc supne \u228b | |
| supplus \u2ac0 supset \u2283 Supset \u22d1 supseteq \u2287 | |
| supseteqq \u2ac6 supsetneq \u228b supsetneqq \u2acc | |
| supsim \u2ac8 supsub \u2ad4 supsup \u2ad6 swarhk \u2926 | |
| swarr \u2199 swArr \u21d9 swarrow \u2199 swnwar \u292a | |
| Tab \u9 target \u2316 tbrk \u23b4 Tcaron \u164 tcaron \u165 | |
| Tcedil \u162 tcedil \u163 Tcy \u422 tcy \u442 tdot \u20db | |
| telrec \u2315 Tfr \ud835\udd17 tfr \ud835\udd31 | |
| therefore \u2234 Therefore \u2234 thetav \u3d1 | |
| thickapprox \u2248 thicksim \u223c ThickSpace \u205f\u200a | |
| ThinSpace \u2009 thkap \u2248 thksim \u223c Tilde \u223c | |
| TildeEqual \u2243 TildeFullEqual \u2245 TildeTilde \u2248 | |
| timesbar \u2a31 timesb \u22a0 timesd \u2a30 tint \u222d | |
| toea \u2928 topbot \u2336 topcir \u2af1 top \u22a4 | |
| Topf \ud835\udd4b topf \ud835\udd65 topfork \u2ada tosa \u2929 | |
| tprime \u2034 TRADE \u2122 triangle \u25b5 triangledown \u25bf | |
| triangleleft \u25c3 trianglelefteq \u22b4 triangleq \u225c | |
| triangleright \u25b9 trianglerighteq \u22b5 tridot \u25ec | |
| trie \u225c triminus \u2a3a TripleDot \u20db triplus \u2a39 | |
| trisb \u29cd tritime \u2a3b trpezium \u23e2 Tscr \ud835\udcaf | |
| tscr \ud835\udcc9 TScy \u426 tscy \u446 TSHcy \u40b | |
| tshcy \u45b Tstrok \u166 tstrok \u167 twixt \u226c | |
| twoheadleftarrow \u219e twoheadrightarrow \u21a0 Uarr \u219f | |
| Uarrocir \u2949 Ubrcy \u40e ubrcy \u45e Ubreve \u16c | |
| ubreve \u16d Ucy \u423 ucy \u443 udarr \u21c5 Udblac \u170 | |
| udblac \u171 udhar \u296e ufisht \u297e Ufr \ud835\udd18 | |
| ufr \ud835\udd32 uHar \u2963 uharl \u21bf uharr \u21be | |
| uhblk \u2580 ulcorn \u231c ulcorner \u231c ulcrop \u230f | |
| ultri \u25f8 Umacr \u16a umacr \u16b UnderBar \u5f | |
| UnderBrace \u23df UnderBracket \u23b5 UnderParenthesis \u23dd | |
| Union \u22c3 UnionPlus \u228e Uogon \u172 uogon \u173 | |
| Uopf \ud835\udd4c uopf \ud835\udd66 UpArrowBar \u2912 | |
| uparrow \u2191 UpArrow \u2191 Uparrow \u21d1 | |
| UpArrowDownArrow \u21c5 updownarrow \u2195 UpDownArrow \u2195 | |
| Updownarrow \u21d5 UpEquilibrium \u296e upharpoonleft \u21bf | |
| upharpoonright \u21be uplus \u228e UpperLeftArrow \u2196 | |
| UpperRightArrow \u2197 upsi \u3c5 Upsi \u3d2 UpTeeArrow \u21a5 | |
| UpTee \u22a5 upuparrows \u21c8 urcorn \u231d urcorner \u231d | |
| urcrop \u230e Uring \u16e uring \u16f urtri \u25f9 | |
| Uscr \ud835\udcb0 uscr \ud835\udcca utdot \u22f0 Utilde \u168 | |
| utilde \u169 utri \u25b5 utrif \u25b4 uuarr \u21c8 | |
| uwangle \u29a7 vangrt \u299c varepsilon \u3f5 varkappa \u3f0 | |
| varnothing \u2205 varphi \u3d5 varpi \u3d6 varpropto \u221d | |
| varr \u2195 vArr \u21d5 varrho \u3f1 varsigma \u3c2 | |
| varsubsetneq \u228a\ufe00 varsubsetneqq \u2acb\ufe00 | |
| varsupsetneq \u228b\ufe00 varsupsetneqq \u2acc\ufe00 | |
| vartheta \u3d1 vartriangleleft \u22b2 vartriangleright \u22b3 | |
| vBar \u2ae8 Vbar \u2aeb vBarv \u2ae9 Vcy \u412 vcy \u432 | |
| vdash \u22a2 vDash \u22a8 Vdash \u22a9 VDash \u22ab | |
| Vdashl \u2ae6 veebar \u22bb vee \u2228 Vee \u22c1 | |
| veeeq \u225a vellip \u22ee verbar \u7c Verbar \u2016 | |
| vert \u7c Vert \u2016 VerticalBar \u2223 VerticalLine \u7c | |
| VerticalSeparator \u2758 VerticalTilde \u2240 | |
| VeryThinSpace \u200a Vfr \ud835\udd19 vfr \ud835\udd33 | |
| vltri \u22b2 vnsub \u2282\u20d2 vnsup \u2283\u20d2 | |
| Vopf \ud835\udd4d vopf \ud835\udd67 vprop \u221d vrtri \u22b3 | |
| Vscr \ud835\udcb1 vscr \ud835\udccb vsubnE \u2acb\ufe00 | |
| vsubne \u228a\ufe00 vsupnE \u2acc\ufe00 vsupne \u228b\ufe00 | |
| Vvdash \u22aa vzigzag \u299a Wcirc \u174 wcirc \u175 | |
| wedbar \u2a5f wedge \u2227 Wedge \u22c0 wedgeq \u2259 | |
| Wfr \ud835\udd1a wfr \ud835\udd34 Wopf \ud835\udd4e | |
| wopf \ud835\udd68 wp \u2118 wr \u2240 wreath \u2240 | |
| Wscr \ud835\udcb2 wscr \ud835\udccc xcap \u22c2 xcirc \u25ef | |
| xcup \u22c3 xdtri \u25bd Xfr \ud835\udd1b xfr \ud835\udd35 | |
| xharr \u27f7 xhArr \u27fa xlarr \u27f5 xlArr \u27f8 | |
| xmap \u27fc xnis \u22fb xodot \u2a00 Xopf \ud835\udd4f | |
| xopf \ud835\udd69 xoplus \u2a01 xotime \u2a02 xrarr \u27f6 | |
| xrArr \u27f9 Xscr \ud835\udcb3 xscr \ud835\udccd xsqcup \u2a06 | |
| xuplus \u2a04 xutri \u25b3 xvee \u22c1 xwedge \u22c0 | |
| YAcy \u42f yacy \u44f Ycirc \u176 ycirc \u177 Ycy \u42b | |
| ycy \u44b Yfr \ud835\udd1c yfr \ud835\udd36 YIcy \u407 | |
| yicy \u457 Yopf \ud835\udd50 yopf \ud835\udd6a | |
| Yscr \ud835\udcb4 yscr \ud835\udcce YUcy \u42e yucy \u44e | |
| Zacute \u179 zacute \u17a Zcaron \u17d zcaron \u17e Zcy \u417 | |
| zcy \u437 Zdot \u17b zdot \u17c zeetrf \u2128 | |
| ZeroWidthSpace \u200b zfr \ud835\udd37 Zfr \u2128 ZHcy \u416 | |
| zhcy \u436 zigrarr \u21dd zopf \ud835\udd6b Zopf \u2124 | |
| Zscr \ud835\udcb5 zscr \ud835\udccf | |
| } | |
| # Internal cache for the foreach variable-lists and the | |
| # substitution strings used to split a HTML string into | |
| # incrementally handleable scripts. This should reduce the | |
| # time needed to compute this information for repeated calls with the same | |
| # split-factor. The array is indexed by a combination of the | |
| # numerical split factor and the length of the command prefix and | |
| # maps this to a 2-element list containing variable- and | |
| # subst-string. | |
| variable splitdata | |
| array set splitdata {} | |
| } | |
| # htmlparse::parse -- | |
| # | |
| # This command is the basic parser for HTML. It takes a HTML | |
| # string, parses it and invokes a command prefix for every tag | |
| # encountered. It is not necessary for the HTML to be valid for | |
| # this parser to function. It is the responsibility of the | |
| # command invoked for every tag to check this. Another | |
| # responsibility of the invoked command is the handling of tag | |
| # attributes and character entities (escaped characters). The | |
| # parser provides the un-interpreted tag attributes to the | |
| # invoked command to aid in the former, and the package at large | |
| # provides a helper command, '::htmlparse::mapEscapes', to aid | |
| # in the handling of the latter. The parser *does* ignore | |
| # leading DOCTYPE declarations and all valid HTML comments it | |
| # encounters. | |
| # | |
| # All information beyond the HTML string itself is specified via | |
| # options, these are explained below. | |
| # | |
| # To help understanding the options some more background | |
| # information about the parser. | |
| # | |
| # It is capable to detect incomplete tags in the HTML string | |
| # given to it. Under normal circumstances this will cause the | |
| # parser to throw an error, but if the option '-incvar' is used | |
| # to specify a global (or namespace) variable the parser will | |
| # store the incomplete part of the input into this variable | |
| # instead. This will aid greatly in the handling of | |
| # incrementally arriving HTML as the parser will handle whatever | |
| # he can and defer the handling of the incomplete part until | |
| # more data has arrived. | |
| # | |
| # Another feature of the parser are its two possible modes of | |
| # operation. The normal mode is activated if the option '-queue' | |
| # is not present on the command line invoking the parser. If it | |
| # is present the parser will go into the incremental mode instead. | |
| # | |
| # The main difference is that a parser in normal mode will | |
| # immediately invoke the command prefix for each tag it | |
| # encounters. In incremental mode however the parser will | |
| # generate a number of scripts which invoke the command prefix | |
| # for groups of tags in the HTML string and then store these | |
| # scripts in the specified queue. It is then the responsibility | |
| # of the caller of the parser to ensure the execution of the | |
| # scripts in the queue. | |
| # | |
| # Note: The queue object given to the parser has to provide the | |
| # same interface as the queue defined in tcllib -> struct. This | |
| # does for example mean that all queues created via that part of | |
| # tcllib can be immediately used here. Still, the queue doesn't | |
| # have to come from tcllib -> struct as long as the same | |
| # interface is provided. | |
| # | |
| # In both modes the parser will return an empty string to the | |
| # caller. | |
| # | |
| # To a parser in incremental mode the option '-split' can be | |
| # given and will specify the size of the groups he creates. In | |
| # other words, -split 5 means that each of the generated scripts | |
| # will invoke the command prefix for 5 consecutive tags in the | |
| # HTML string. A parser in normal mode will ignore this option | |
| # and its value. | |
| # | |
| # The option '-vroot' specifies a virtual root tag. A parser in | |
| # normal mode will invoke the command prefix for it immediately | |
| # before and after he processes the tags in the HTML, thus | |
| # simulating that the HTML string is enclosed in a <vroot> | |
| # </vroot> combination. In incremental mode however the parser | |
| # is unable to provide the closing virtual root as he never | |
| # knows when the input is complete. In this case the first | |
| # script generated by each invocation of the parser will contain | |
| # an invocation of the command prefix for the virtual root as | |
| # its first command. | |
| # | |
| # Interface to the command prefix: | |
| # | |
| # In normal mode the parser will invoke the command prefix with | |
| # four arguments appended. See '::htmlparse::debugCallback' for a | |
| # description. In incremental mode however the generated scripts | |
| # will invoke the command prefix with five arguments | |
| # appended. The last four of these are the same which were | |
| # mentioned above. The first however is a placeholder string | |
| # (@win@) for a clientdata value to be supplied later during the | |
| # actual execution of the generated scripts. This could be a tk | |
| # window path, for example. This allows the user of this package | |
| # to preprocess HTML strings without committing them to a | |
| # specific window, object, whatever during parsing. This | |
| # connection can be made later. This also means that it is | |
| # possible to cache preprocessed HTML. Of course, nothing | |
| # prevents the user of the parser to replace the placeholder | |
| # with an empty string. | |
| # | |
| # Arguments: | |
| # args An option/value-list followed by the string to | |
| # parse. Available options are: | |
| # | |
| # -cmd The command prefix to invoke for every tag in | |
| # the HTML string. Defaults to | |
| # '::htmlparse::debugCallback'. | |
| # | |
| # -vroot The virtual root tag to add around the HTML in | |
| # normal mode. In incremental mode it is the | |
| # first tag in each chunk processed by the | |
| # parser, but there will be no closing tags. | |
| # Defaults to 'hmstart'. | |
| # | |
| # -split The size of the groups produced by an | |
| # incremental mode parser. Ignored when in | |
| # normal mode. Defaults to 10. Values <= 0 are | |
| # not allowed. | |
| # | |
| # -incvar The name of the variable where to store any | |
| # incomplete HTML into. Optional. | |
| # | |
| # -queue | |
| # The handle/name of the queue object to store | |
| # the generated scripts into. Activates | |
| # incremental mode. Normal mode is used if this | |
| # option is not present. | |
| # | |
| # After the options the command expects a single argument | |
| # containing the HTML string to parse. | |
| # | |
| # Side Effects: | |
| # In normal mode as of the invoked command. Else none. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::parse {args} {
    # Converts the HTML string into an evaluable command sequence and
    # either runs it directly (normal mode) or cuts it into scripts
    # which are stored in a queue (incremental mode, option -queue).
    #
    # Arguments:
    #   args    Option/value pairs (-cmd, -vroot, -incvar, -split,
    #           -queue) followed by exactly one HTML string.
    #
    # Results:
    #   An empty string. Errors are thrown for bad options, a missing
    #   or surplus HTML argument, and for incomplete HTML when no
    #   -incvar was given.

    variable splitdata

    # Defaults for all options.
    set cmd    ::htmlparse::debugCallback
    set vroot  hmstart
    set incvar ""
    set split  10
    set queue  ""

    while {[set err [cmdline::getopt args {cmd.arg vroot.arg incvar.arg split.arg queue.arg} opt arg]]} {
        if {$err < 0} {
            return -code error "::htmlparse::parse : $arg"
        }
        switch -exact -- $opt {
            cmd -
            vroot -
            incvar -
            queue {
                if {[string length $arg] == 0} {
                    return -code error "::htmlparse::parse : -$opt illegal argument (empty)"
                }
                # Each of these options is stored in the local variable
                # of the same name.
                # FRINK: nocheck
                set $opt $arg
            }
            split {
                # The value is used as a loop bound further down. A
                # non-integer value would be compared as a string there
                # and could keep the loop running forever, so it is
                # rejected here, before the range check.
                if {![string is integer -strict $arg]} {
                    return -code error "::htmlparse::parse : -split illegal argument (not an integer)"
                }
                if {$arg <= 0} {
                    return -code error "::htmlparse::parse : -split illegal argument (<= 0)"
                }
                set split $arg
            }
            default {
                # Cannot happen
            }
        }
    }

    if {[llength $args] > 1} {
        return -code error "::htmlparse::parse : to many arguments behind the options, expected one"
    }
    if {[llength $args] < 1} {
        return -code error "::htmlparse::parse : html string missing"
    }

    set html [PrepareHtml [lindex $args 0]]

    # Look for incomplete HTML from the last iteration and prepend it
    # to the input we just got.
    if {$incvar != {}} {
        upvar $incvar incomplete
    } else {
        set incomplete ""
    }

    # The catch covers a linked -incvar variable which does not exist yet.
    if {[catch {set new $incomplete$html}]} {set new $html}
    set html $new

    # Handle incomplete HTML: an opening '<' behind the last '>' marks
    # an unfinished tag at the end. It is buffered for the next call.
    set end [lindex \{$html\} end]
    if {[set idx [string last < $end]] > [string last > $end]} {
        if {$incvar == {}} {
            return -code error "::htmlparse::parse : HTML is incomplete, option -incvar is missing"
        }
        # 'incomplete' is already linked to the caller's variable, s.a.
        set incomplete [string range $end $idx end]
        incr idx -1
        set html [string range $end 0 $idx]
    } else {
        set incomplete ""
    }

    # Step 1: Rewrite self-closing tags into an open/close pair, then
    # turn every tag into a command invocation. The command itself is
    # a placeholder here ((LF) NUL SOH @ NUL), see step 2.
    regsub -all -- {<([^\s>]+)\s*([^>]*)/>} $html {<\1 \2></\1>} html

    set sub "\}\n\0\1@\0 {\\2} {\\1} {\\3} \{"
    regsub -all -- {<(/?)([^\s>]+)\s*([^>]*)>} $html $sub html

    # Step 2: Replace the placeholder with the real command prefix.
    # Doing this outside of regsub keeps characters in the prefix
    # which are special to regsub from being interpreted by it.
    set html [string map [list \n\0\1@\0 \n$cmd] $html]

    if {$queue == {}} {
        # Normal mode: evaluate the generated script right now, framed
        # by the opening and closing virtual root tag.
        eval "$cmd {$vroot} {} {} \{$html\}"
        eval "$cmd {$vroot} / {} {}"
    } else {
        # Incremental mode: generate scripts handling -split tags each
        # and put them into the queue. The variable list and the subst
        # template depend only on the split factor and the length of
        # the command prefix and are cached under that key.
        set lcmd [llength $cmd]
        set key  $split,$lcmd

        if {![info exists splitdata($key)]} {
            for {set i 0; set group {}} {$i < $split} {incr i} {
                # One extra variable per additional word of the command
                # prefix, placed before the main variable after which
                # the placeholder is inserted.
                for {set j 1} {$j < $lcmd} {incr j} {
                    append group "b${j}_$i "
                }
                append group "a$i c$i d$i e$i f$i\n"
            }
            regsub -all -- {(a[0-9]+)} $group {{$\1} @win@} subgroup
            regsub -all -- {([b-z_0-9]+[0-9]+)} $subgroup {{$\1}} subgroup
            set splitdata($key) [list $group $subgroup]
        }

        foreach {group subgroup} $splitdata($key) break ; # lassign
        foreach $group "$cmd {$vroot} {} {} \{$html\}" {
            $queue put [string trimright [subst $subgroup]]
        }
    }
    return
}
| # htmlparse::PrepareHtml -- | |
| # | |
| # Internal helper command of '::htmlparse::parse'. Removes | |
| # leading DOCTYPE declarations and comments, protects the | |
| # special characters of tcl from evaluation. | |
| # | |
| # Arguments: | |
| # html The HTML string to prepare | |
| # | |
| # Side Effects: | |
| # None. | |
| # | |
| # Results: | |
| # The provided HTML string with the described modifications | |
| # applied to it. | |
proc ::htmlparse::PrepareHtml {html} {
    # Cleans up a HTML string before it is converted into a script.
    #
    # Arguments:
    #   html    The HTML string to prepare.
    #
    # Results:
    #   The HTML with line endings normalized (\r -> \n), everything
    #   up to and including a <!DOCTYPE...> declaration removed, all
    #   comments removed, and the characters special to Tcl (braces,
    #   backslash) replaced by numeric character references.

    set html [string map [list \r \n] $html]

    regsub -- "^.*<!DOCTYPE\[^>\]*>" $html {} html

    # Tcllib SF Bug 861287 - Processing of comments.
    # Recognize the end of a comment by RE ('--', optional whitespace,
    # '>') and mark it with the control characters \001 ... \002, the
    # captured whitespace kept in between.
    regsub -all -- "--(\[ \t\n\]*)>" $html "\001\\1\002" html

    # Broken comment beginnings ('<--' without the '!') are converted
    # to PCDATA. The angle brackets have to become entities, otherwise
    # the text would be taken for a tag again later on. ('\&' is a
    # literal ampersand in a regsub substitution.)
    regsub -all -- "<--(\[^\001\]*)\001(\[^\002\]*)\002" $html {\&lt;--\1--\2\&gt;} html

    # And now recognize true comments, remove them.
    regsub -all -- "<!--\[^\001\]*\001(\[^\002\]*)\002" $html {} html

    # Protect characters special to Tcl (braces, backslash) by
    # converting them to their escape sequences. The parser wraps the
    # text in braces and evaluates it, so none of them may survive raw.
    return [string map [list \
            "\{" "&#123;" \
            "\}" "&#125;" \
            "\\" "&#92;"] $html]
}
| # htmlparse::debugCallback -- | |
| # | |
| # The standard callback used by the parser in | |
| # '::htmlparse::parse' if none was specified by the user. Simply | |
| # dumps its arguments to stdout. This callback can be used for | |
| # both normal and incremental mode of the calling parser. In | |
| # other words, it accepts four or five arguments. The last four | |
| # arguments are described below. The optional fifth argument | |
| # contains the clientdata value given to the callback by a | |
| # parser in incremental mode. All callbacks have to follow the | |
| # signature of this command in the last four arguments, and | |
| # callbacks used in incremental parsing have to follow this | |
| # signature in the last five arguments. | |
| # | |
| # Arguments: | |
| # tag The name of the tag currently | |
| # processed by the parser. | |
| # | |
| # slash Either empty or a slash. Allows us to | |
| # distinguish between opening (slash is | |
| # empty) and closing tags (slash is | |
| # equal to a '/'). | |
| # | |
| # param The un-interpreted list of parameters | |
| # to the tag. | |
| # | |
| # textBehindTheTag The text found by the parser behind | |
| # the tag named in 'tag'. | |
| # | |
| # Side Effects: | |
| # None. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::debugCallback {args} {
    # Default callback: dumps whatever it was called with to stdout.
    # args = ?clientData? tag slash param textBehindTheTag
    puts stdout "==> $args"
    return
}
| # htmlparse::mapEscapes -- | |
| # | |
| # Takes a HTML string, substitutes all escape sequences with | |
| # their actual characters and returns the resulting string. | |
| # HTML not containing escape sequences or invalid escape | |
| # sequences is returned unchanged. | |
| # | |
| # Arguments: | |
| # html The string to modify | |
| # | |
| # Side Effects: | |
| # None. | |
| # | |
| # Results: | |
| # The argument string with all escape sequences replaced with | |
| # their actual characters. | |
proc ::htmlparse::mapEscapes {html} {
    # Replaces escape sequences of the form &xxx(;|EOW) by the result
    # of the matching Do*Map helper.
    #
    # The string is turned into a template for 'subst'. First the
    # characters special to Tcl are quoted so that they pass through
    # unharmed, then each escape sequence is rewritten into a call of
    # the helper responsible for it.

    set quoting  [list \] \\\] \[ \\\[ \$ \\\$ \\ \\\\]
    set template [string map $quoting $html]

    foreach {pattern helper} {
        {&([[:alnum:]]{2,31})(;|\M)}   DoNamedMap
        {&#([[:digit:]]{1,5})(;|\M)}   DoDecMap
        {&#x([[:xdigit:]]{1,4})(;|\M)} DoHexMap
    } {
        regsub -all -- $pattern $template "\[$helper \\1 {\\2}\]" template
    }
    return [subst $template]
}
proc ::htmlparse::DoNamedMap {name endOf} {
    # Maps a named entity to its character. Names not found in the
    # table 'namedEntities' are put back as they were written, i.e.
    # with the leading '&' and the original terminator 'endOf'.
    variable namedEntities

    if {![info exists namedEntities($name)]} {
        return "&$name$endOf"
    }
    return $namedEntities($name)
}
proc ::htmlparse::DoDecMap {dec endOf} {
    # Maps a decimal character reference to its character. Values
    # above 0xFFFD are put back as a reference, using the scanned
    # number and the original terminator 'endOf'.
    scan $dec %d dec

    if {$dec > 0xFFFD} {
        return "&#$dec$endOf"
    }
    return [format %c $dec]
}
proc ::htmlparse::DoHexMap {hex endOf} {
    # Maps a hexadecimal character reference to its character. Values
    # above 0xFFFD are put back as a reference, using the digits as
    # they were written and the original terminator 'endOf'.
    scan $hex %x value

    if {$value > 0xFFFD} {
        return "&#x$hex$endOf"
    }
    return [format %c $value]
}
| # htmlparse::2tree -- | |
| # | |
| # This command is a wrapper around '::htmlparse::parse' which | |
| # takes a HTML string and converts it into a tree containing the | |
| # logical structure of the parsed document. The tree object has | |
| # to be created by the caller. It is also expected that the tree | |
| # object provides the same interface as the tree object from | |
| # tcllib -> struct. It doesn't have to come from that module | |
| # though. The internal callback does some basic checking of HTML | |
| # validity and tries to recover from the most basic errors. | |
| # | |
| # Arguments: | |
| # html The HTML string to parse and convert. | |
| # tree The name of the tree to fill. | |
| # | |
| # Side Effects: | |
| # Creates a tree object (see tcllib -> struct) | |
| # and modifies it. | |
| # | |
| # Results: | |
| # The name of the tree ('tree'), which now holds the parsed document. | |
proc ::htmlparse::2tree {html tree} {
    # Parses 'html' into the tree object 'tree' and returns 'tree'.
    #
    # The callback needs one internal data structure, a stack of the
    # currently open tags. It lives under a fixed name. As this command
    # runs synchronously there can be no second copy active in the same
    # interpreter, so no tricks are needed to make the name unique. A
    # stack left behind by an earlier call is removed first.

    set stack ::htmlparse::tags

    catch {$stack destroy}
    ::struct::stack $stack
    $stack push root

    $tree set root type root

    ::htmlparse::parse -cmd [list ::htmlparse::2treeCallback $tree] $html

    # A bit hackish: correct the ordering of the nodes for the optional
    # tag types, over a larger area than was seen by the parser itself.
    $tree walk root -order post visited {
        ::htmlparse::Reorder $tree $visited
    }

    $stack destroy
    return $tree
}
| # htmlparse::2treeCallback -- | |
| # | |
| # Internal helper command. A special callback to | |
| # '::htmlparse::parse' used by '::htmlparse::2tree' which takes | |
| # the incoming stream of tags and converts them into a tree | |
| # representing the inner structure of the parsed HTML | |
| # document. Recovers from simple HTML errors like missing | |
| # opening tags, missing closing tags and overlapping tags. | |
| # | |
| # Arguments: | |
| # tree The name of the tree to manipulate. | |
| # tag See '::htmlparse::debugCallback'. | |
| # slash See '::htmlparse::debugCallback'. | |
| # param See '::htmlparse::debugCallback'. | |
| # textBehindTheTag See '::htmlparse::debugCallback'. | |
| # | |
| # Side Effects: | |
| # Manipulates the tree object whose name was given as the first | |
| # argument. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::2treeCallback {tree tag slash param textBehindTheTag} {
    # Parser callback used by '::htmlparse::2tree'. Adds one tag, and
    # the text behind it, to 'tree'. The stack '::htmlparse::tags'
    # holds the nodes of the currently open tags, the node 'root' at
    # its bottom.
    #
    # Arguments:
    #   tree              The tree object to extend.
    #   tag               The name of the tag.
    #   slash             '/' for a closing tag, empty otherwise.
    #   param             The un-interpreted attributes of the tag.
    #   textBehindTheTag  The text following the tag.
    #
    # Results:
    #   None.
    #
    # This could be table-driven, but for now the switches work fine.

    # Normalize the tag for the comparisons below and remove
    # superfluous whitespace around the text. The text is stored as
    # is, character entities are not decoded here.
    # NOTE(review): An earlier revision ran the text through mapEscapes
    # but never used the result. That dead call was removed; confirm
    # that storing the undecoded text is the intended contract.
    set tag              [string tolower $tag]
    set textBehindTheTag [string trim $textBehindTheTag]

    if {"$slash" == "/"} {
        # Closing tag. The standard operation is to pop the tag from
        # the stack of open tags. This is not done for the tags whose
        # nodes were popped right after they were pushed (see below).

        switch -exact -- $tag {
            base - option - meta - li - p {
                # Ignore, nothing to do.
            }
            default {
                # A closing tag which does not match the top of the
                # stack came into existence in one of two ways:
                #
                # a) The tag is closed but was never opened.
                # b) A tag requiring an end tag was opened, the end tag
                #    was omitted, and we are now at the end of a tag
                #    opened before that one.
                #
                # Overlapping tags, like <a><b></a></b>, are handled
                # too, as case (b) followed by case (a).
                #
                # Algorithm:
                # 1) Search the whole stack for the matching open tag.
                #    If there is one assume (b) and pop everything up
                #    to and including it. This subsumes the normal
                #    case of the match being on top of the stack.
                # 2) If there is none assume (a) and ignore the tag.

                # Note: The first item is the top of the stack, the
                # last item is its bottom.
                set nodes [::htmlparse::tags peek [::htmlparse::tags size]]

                set level 1
                set found 0
                foreach n $nodes {
                    # The node 'root' is the bottom sentinel and must
                    # stay on the stack, even for a stray closing tag
                    # which happens to be named 'root'. Popping it
                    # would leave the stack empty and break the next
                    # 'peek'.
                    if {[string equal $n root]} {
                        break
                    }
                    if {0 == [string compare $tag [$tree get $n type]]} {
                        # Found an earlier open tag -> (b).
                        set found 1
                        break
                    }
                    incr level
                }
                if {$found} {
                    ::htmlparse::tags pop $level
                }
            }
        }

        # Text behind a closing tag X belongs to the parent of X, i.e.
        # to the context which is open again now.
        if {$textBehindTheTag != {}} {
            set pcd [$tree insert [::htmlparse::tags peek] end]
            $tree set $pcd type PCDATA
            $tree set $pcd data $textBehindTheTag
        }
    } else {
        # Opening tag. The standard operation is to push the new node
        # onto the stack and thus to open a nested context. The tags
        # listed in the last switch are popped again immediately, they
        # either have an optional closing tag or none at all.
        #
        # The text coming with the tag is added behind the tag if it is
        # one of input, area, img, br. Else it is added as a node below
        # the tag, as it is part of the region between the opening and
        # the closing tag. Empty text is ignored in all cases.

        set node [$tree insert [::htmlparse::tags peek] end]
        $tree set $node type $tag
        $tree set $node data $param

        if {$textBehindTheTag != {}} {
            switch -exact -- $tag {
                input - area - img - br {
                    set pcd [$tree insert [::htmlparse::tags peek] end]
                }
                default {
                    set pcd [$tree insert $node end]
                }
            }
            $tree set $pcd type PCDATA
            $tree set $pcd data $textBehindTheTag
        }

        ::htmlparse::tags push $node

        switch -exact -- $tag {
            hr - base - meta - li - br - option - input - area - img - p - h1 - h2 - h3 - h4 - h5 - h6 {
                ::htmlparse::tags pop
            }
            default {}
        }
    }
    return
}
| # htmlparse::removeVisualFluff -- | |
| # | |
| # This command walks a tree as generated by '::htmlparse::2tree' | |
| # and removes all the nodes which represent visual tags and not | |
| # structural ones. The purpose of the command is to make the | |
| # tree easier to navigate without getting bogged down in visual | |
| # information not relevant to the search. | |
| # | |
| # Arguments: | |
| # tree The name of the tree to cut down. | |
| # | |
| # Side Effects: | |
| # Modifies the specified tree. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::removeVisualFluff {tree} {
    # Visit all nodes of the tree, starting at the node 'root', in
    # post-order, and hand each one to the internal helper.
    $tree walk root -order post current {
        ::htmlparse::RemoveVisualFluff $tree $current
    }
    return
}
| # htmlparse::removeFormDefs -- | |
| # | |
| # Like '::htmlparse::removeVisualFluff' this command is here to | |
| # cut down on the size of the tree as generated by | |
| # '::htmlparse::2tree'. It removes all nodes representing forms | |
| # and form elements. | |
| # | |
| # Arguments: | |
| # tree The name of the tree to cut down. | |
| # | |
| # Side Effects: | |
| # Modifies the specified tree. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::removeFormDefs {tree} {
    # Visit all nodes of the tree, starting at the node 'root', in
    # post-order, and hand each one to the internal helper.
    $tree walk root -order post current {
        ::htmlparse::RemoveFormDefs $tree $current
    }
    return
}
| # htmlparse::RemoveVisualFluff -- | |
| # | |
| # Internal helper command to | |
| # '::htmlparse::removeVisualFluff'. Does the actual work. | |
| # | |
| # Arguments: | |
| # tree The name of the tree currently processed | |
| # node The name of the node to look at. | |
| # | |
| # Side Effects: | |
| # Modifies the specified tree. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::RemoveVisualFluff {tree node} {
    # Decides by the type of the node what happens to it. All types
    # not listed below are left alone.
    set type [$tree get $node type]

    if {[lsearch -exact {hmstart html font center div sup b i} $type] >= 0} {
        # Removes the node, but does not affect the nodes below it.
        # These are made into children of the parent of this node,
        # in its place.
        $tree cut $node
    } elseif {[lsearch -exact {script option select meta map img} $type] >= 0} {
        # Removes this node and everything below it.
        $tree delete $node
    }
}
| # htmlparse::RemoveFormDefs -- | |
| # | |
| # Internal helper command to | |
| # '::htmlparse::removeFormDefs'. Does the actual work. | |
| # | |
| # Arguments: | |
| # tree The name of the tree currently processed | |
| # node The name of the node to look at. | |
| # | |
| # Side Effects: | |
| # Modifies the specified tree. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::RemoveFormDefs {tree node} {
    # A node of type 'form' is deleted, together with everything
    # below it. Nodes of all other types are left alone.
    if {[string compare [$tree get $node type] form] == 0} {
        $tree delete $node
    }
}
| # htmlparse::Reorder -- | |
| # Internal helper command to '::htmlparse::2tree'. Moves the | |
| # nodes between p/p, li/li and h<i> sequences below the | |
| # paragraphs and items. IOW, corrects misconstructions for | |
| # the optional node types. | |
| # | |
| # Arguments: | |
| # tree The name of the tree currently processed | |
| # node The name of the node to look at. | |
| # | |
| # Side Effects: | |
| # Modifies the specified tree. | |
| # | |
| # Results: | |
| # None. | |
proc ::htmlparse::Reorder {tree node} {
    # The node types whose right siblings are pulled below them. A
    # sibling of one of these types ends the run.
    set optional {h1 h2 h3 h4 h5 h6 p li}

    if {[lsearch -exact $optional [$tree get $node type]] < 0} {
        # Ignore tag
        return
    }

    # Move the right siblings below this node, one after the other,
    # until the level ends or the next node of a similar type is
    # reached. After a move the next sibling has taken the place of
    # the moved one, so 'next' is asked again in each round.
    for {set sibling [$tree next $node]} {$sibling != {}} {set sibling [$tree next $node]} {
        if {[lsearch -exact $optional [$tree get $sibling type]] != -1} {
            break
        }
        $tree move $node end $sibling
    }
    return
}
| # ### ######### ########################### | |
| package provide htmlparse 1.2.2 |