-
Notifications
You must be signed in to change notification settings - Fork 24
/
index.html
1298 lines (1089 loc) · 123 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<title>Internationalization Best Practices for Spec Developers</title>
<meta charset="utf-8"/>
<link href="https://www.w3.org/StyleSheets/TR/2016/W3C-ED" rel="stylesheet">
<script src="make_checklist.js" type="application/ecmascript"></script>
<!--script src="devt/dumpdata.js" type="application/ecmascript"></script-->
<!--
=== NOTA BENE ===
For the scripts below, if your spec resides on dev.w3 you can check them
out in the same tree and use relative links so that they'll work offline.
-->
<script src="https://www.w3.org/Tools/respec/respec-w3c-common" class="remove"></script>
<script class="remove">
// Settings object for the ReSpec script loaded above. It is declared with
// `var` at script top level so that it is reachable as a global.
var respecConfig = {
  // Document status code (WD, LCWD, NOTE, ...). Use "ED" when unsure.
  specStatus: "ED",
  //publishDate: "2015-10-20",

  // Details of the previously published version of this document.
  previousPublishDate: "2015-10-20",
  previousMaturity: "FPWD",

  noRecTrack: true,
  shortName: "international-specs",
  copyrightStart: "2014",
  edDraftURI: "https://w3c.github.io/bp-i18n-specdev/",

  // List of editors; any number may be given.
  // "name" is the only mandatory field in each entry.
  editors: [
    {
      name: "Richard Ishida",
      url: "",
      company: "W3C",
      companyURL: "",
      w3cid: 39125
    }
  ],

  // Working group responsible for the document.
  wg: "Internationalization Working Group",
  wgURI: "https://www.w3.org/International/core/Overview",
  wgPublicList: "www-international",

  // Where issues are filed and where open issues are listed.
  bugTracker: {
    new: "https://github.com/w3c/bp-i18n-specdev/issues",
    open: "https://github.com/w3c/bp-i18n-specdev/issues"
  },

  // Additional links: here, a pointer to the GitHub repository.
  otherLinks: [
    {
      key: "Github",
      data: [
        {
          value: "repository",
          href: "https://github.com/w3c/bp-i18n-specdev"
        }
      ]
    }
  ],

  // Patent status URI for this WG (relevant to Rec-track documents).
  // !!!! IMPORTANT !!!!
  // Never copy this value from an arbitrary document unless you are sure it
  // is the right one; when in doubt, check with the Team Contact.
  wgPatentURI: "https://www.w3.org/2004/01/pp-impl/32113/status",

  maxTocLevel: 2,
};
</script>
<link rel="stylesheet" href="local.css" type="text/css" />
</head>
<body>
<div id="sotd">
<p>This document provides advice to specification developers about how to incorporate requirements for international use. What is currently available here is expected to be useful immediately, but is still an early draft and the document is in flux, and will grow over time as knowledge applied in reviews and discussions can be crystallized into guidelines.</p>
<div class="note">
<p data-lang="en" style="font-weight: bold; font-size: 120%">Sending comments on this document</p>
<p data-lang="en">If you wish to make comments regarding this document, please raise them as <a href="https://github.com/w3c/bp-i18n-specdev/issues" style="font-size: 120%;">github issues</a>. Only send comments by email if you are unable to raise issues on github (see links below).</p>
<p data-lang="en">To make it easier to track comments, please raise separate issues or emails for each comment, and point to the section you are commenting on using a URL for the dated version of the document. All comments are welcome.</p>
</div>
</div>
<div id="abstract">
<p>This document provides a checklist of internationalization-related considerations when developing a specification. Most checklist items point to detailed supporting information in other documents. Where such information does not yet exist, it can be given a temporary home in this document. The dynamic page <a href="https://www.w3.org/International/techniques/developing-specs-dynamic">Internationalization Techniques: Developing specifications</a> is automatically generated from this document. <strong>The current version is still an early draft, and it is expected that the information will change regularly as new content is added and existing content is modified in the light of experience and discussion.</strong></p>
</div>
<section id="intro">
<h2>Introduction</h2>
<p>Developers of specifications need advice to ensure that what they produce will work for communities around the globe.</p>
<p>The Internationalization (i18n) WG tries to assist working groups by reviewing specifications and engaging in discussion. Often, however, such interventions come later in the process than would be ideal, or mean that the i18n WG has to repeat the same information for each working group it interacts with.</p>
<p>It would be better if specification developers could access a checklist of best practices, which points to explanations, examples and rationales where developers need it. Developers would then be able to build this knowledge into their work from the earliest stages, and could thereby reduce rework needed when the i18n WG reviews their specification.</p>
<p>This document contains the beginnings of a checklist, and points to locations where you can find explanations, examples and rationales for recommendations made. If there is no such other place, that extra information will be added to this document. It is still early days for this document, and it may also be used to develop ideas and organize them.</p>
<p>The guidelines in this document are not intended to be hard and fast requirements. This document will achieve a significant part of its purpose if, where you don't understand the guidelines or disagree with them, you contact the Internationalization WG to discuss what should be done.</p>
<p>You may prefer to use <a href="https://www.w3.org/International/techniques/developing-specs?collapse">Internationalization Techniques: Developing specifications</a> most of the time, since it uses JavaScript to help you more quickly see what's available and drill down to the information you need. (Where needed, it links to this or other documents.) There is also a <a href="https://www.w3.org/International/techniques/developing-specs">non-dynamic version</a> of the document available.</p>
<p>If your spec is github based, you can now create a snapshot of the checklist items in markdown. If you add that to a github issue, you can check off items that are ok, and add comments while doing a self-review. <a href="https://w3c.github.io/bp-i18n-specdev/#ghChecklist">Generate the code here</a>.</p>
</section>
<!-- end of characters section -->
<section id="resource" class="topic">
<h2>Language</h2>
<ul class="summary">
<li><a href="#sec_lang_decl">Language basics</a></li>
<li><a href="#sec_lang_values">Defining language values</a></li>
<li><a href="#sec_lang_mixed">Declaring language at the resource level</a></li>
<li><a href="#lang_block">Establishing the language of a content block</a></li>
<li><a href="#lang_inline">Establishing the language of inline runs</a></li>
</ul>
<section id="sec_lang_decl" class="subtopic">
<h3>Language basics</h3>
<p class="advisement" id="lang_basics_1"><a class="self" href="#lang_basics_1">​</a>It should be possible to associate a language with any piece of natural language text that will be read by a user. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_decl">more</a></p>
<p class="advisement" id="lang_basics_inline"><a class="self" href="#lang_basics_inline">​</a>Where possible, there should be a way to label natural language changes in inline text. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_decl">more</a></p>
<p>Text is rendered or processed differently according to the language it is in. For example, screen readers need to be prompted when a language changes, and spell checkers should be language-sensitive. When rendering text, a knowledge of language is needed in order to apply correct fonts, hyphenation, line-breaking, upper/lower case changes, and other features.</p>
<p>For example, ideographic characters such as 雪, 刃, 直, 令, 垔 have slight but important differences when used with Japanese vs Chinese fonts, and it's important not to apply a Chinese font to the Japanese text, and vice versa when it is presented to a user.</p>
<p class="advisement" id="lang_basics_meta"><a class="self" href="#lang_basics_meta">​</a>Consider whether it is useful to express the <a href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_meta">intended linguistic audience</a> of a resource, in addition to specifying the language used for <a href="https://w3c.github.io/bp-i18n-specdev/#sec_text_processing_lang">text processing</a>. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_decl">more</a></p>
<p>Language information for a given resource can be used with two main objectives in mind: for text-processing, or as a statement of the intended use of the resource. We will explain the difference below.</p>
<section id="sec_text_processing_lang">
<h4>Text-processing language information</h4>
<p class="advisement" id="tp_lang_values"><a class="self" href="#tp_lang_values">​</a>A language declaration that indicates the <a href="https://w3c.github.io/bp-i18n-specdev/#sec_text_processing_lang">text processing language</a> for a range of text must associate a single language value with a specific range of text. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_text_processing_lang">more</a></p>
<p>When specifying the <dfn>text-processing language</dfn> you are declaring the language in which <strong>a specific range of text is actually written</strong>, so that user agents or applications that manipulate the text, such as voice browsers, spell checkers, style processors, hyphenators, etc., can apply the appropriate rules to the text in question. So we are, by necessity, talking about associating a <em>single</em> language with a <em>specific</em> range of text.</p>
<p>It is normal to express a text-processing language as the default language, for processing the resource as a whole, but it may also be necessary to indicate where the language changes within the resource.</p>
<p class="advisement" id="lang_attribute_xml"><a class="self" href="#lang_attribute_xml">​</a>Use the HTML <code class="kw" translate="no">lang</code> and XML <code class="kw" translate="no">xml:lang</code> language attributes where appropriate to identify the <a href="https://w3c.github.io/bp-i18n-specdev/#sec_text_processing_lang">text processing language</a>, rather than creating a new attribute or mechanism. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#lang_attribute_xml">more</a></p>
<p>To identify the text-processing language for a range of text, HTML provides the <code class="kw" translate="no">lang</code> attribute, while XML provides <code class="kw" translate="no">xml:lang</code> which can be used in all XML formats. It's useful to continue using those attributes for relevant markup formats, since authors recognize them, as do HTML and XML processors.</p>
</section>
<section id="sec_lang_meta">
<h4>Language metadata about the resource as a whole</h4>
<p>It may also be useful to describe the language of a resource <strong> as a whole</strong>. This type of language declaration typically indicates the <strong>intended use of the resource</strong>. For example, such metadata may be used for searching, serving the right language version, classification, etc. </p>
<p>This type of language declaration differs from that of the text-processing declaration in that (a) the value for such declarations may be more than one language subtag, and (b) the language value declared doesn't indicate which bits of a multilingual resource are in which language.</p>
<p class="advisement" id="metadata_lang_values"><a class="self" href="#metadata_lang_values">​</a>It should be possible to associate a metadata-type language declaration (which indicates the intended use of the resource rather than the language of a specific range of text) with multiple language values. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_meta">more</a></p>
<p>The language(s) describing the intended use of a resource do not necessarily include every language used in a document. For example, many documents on the Web contain embedded fragments of content in different languages, whereas the page is clearly aimed at speakers of one particular language. For example, a German city-guide for Beijing may contain useful phrases in Chinese, but it is aimed at a German-speaking audience, not a Chinese one.</p>
<p>On the other hand, it is also possible to imagine a situation where a document contains the same or parallel content in more than one language. For example, a web page may welcome Canadian readers with French content in the left column, and the same content in English in the right-hand column. Here the document is equally targeted at speakers of both languages, so there are two audience languages. Another use case is a blog or a news page aimed at a multilingual community, where some articles on a page are in one language and some in another. In this case, it may make sense to list more than one language tag as the value of the language declaration.</p>
<p class="advisement" id="metadata_not_lang"><a class="self" href="#metadata_not_lang">​</a>Attributes that express the language of external resources should not use the HTML <code class="kw" translate="no">lang</code> and XML <code class="kw" translate="no">xml:lang</code> language attributes, but should use a different attribute when they represent metadata (which indicates the intended use of the resource rather than the language of a specific range of text). <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_meta">more</a></p>
<p>Using a different attribute to indicate the language of an external resource allows the attribute to specify more than one language. It also works better if the resource pointed to is not in a single language. </p>
<p>This distinction can be seen in HTML in the separation of the <code class="kw" translate="no">lang</code> and <code class="kw" translate="no">hreflang</code> attributes. The former indicates the language of the text within the HTML page; the latter is metadata indicating the expected language of a page that is linked to.</p>
<p>For a longer discussion of this see <a href="https://www.w3.org/International/questions/qa-when-xmllang">xml:lang in XML document schemas</a>. This article talks specifically about <code class="kw" translate="no">xml:lang</code>, but the concepts are applicable to other situations.</p>
</section>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/questions/qa-when-xmllang">xml:lang in XML document schemas</a></p>
<p class="desc">When should I use <code class="kw" translate="no">xml:lang</code> and when should I define my own element or attribute for passing language values in an XML document schema (DTD)?</p>
</li>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_decl">Language basics</a></p>
<p class="desc">In <a href="https://w3c.github.io/bp-i18n-specdev/">Internationalization Best Practices for Spec Developers</a>.</p>
</li>
</ul>
</section>
<section class="background">
<h5>Background</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/questions/qa-lang-why">Why use the language attribute?</a></p>
<p class="desc">Describes why it is useful to use the <code class="kw" translate="no">lang</code> or <code class="kw" translate="no">xml:lang</code> attribute to label language in web pages.</p>
</li>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/i18n-discuss/notes/annotation-language-use-cases">Use cases for language information in web annotations</a></p>
<p class="desc">Description of use cases for annotations that illustrate the differences between text-processing and metadata types of language declaration.</p>
</li>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/questions/qa-http-and-lang">HTTP headers, meta elements and language information</a></p>
<p class="desc">How the distinction between text-processing language and language metadata plays out in HTML5.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="sec_lang_values" class="subtopic">
<h3>Defining language values</h3>
<p class="advisement" id="lang_use_bcp47"><a class="self" href="#lang_use_bcp47">​</a>Values for language declarations must use BCP 47. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_values">more</a></p>
<p>BCP 47 defines a method to combine subtags in order to create a much more powerful notation for language tags than that provided by the old ISO lists, but it is also backwards compatible with the ISO lists.</p>
<p>For an overview of the key features of BCP 47, see <a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a>.</p>
<p class="advisement" id="lang_bcp_not_rfc"><a class="self" href="#lang_bcp_not_rfc">​</a>Refer to BCP 47, not to RFC 5646. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_values">more</a></p>
<p>The link to and name of BCP 47 were created specifically so that there is an unchanging reference to the definition of Tags for the Identification of Languages. RFCs 3066, 4646 and 5646 are versions of BCP 47.</p>
<p class="advisement" id="lang_values_valid"><a class="self" href="#lang_values_valid">​</a>Be specific about what level of conformance you expect for language tags. The word "valid" has special meaning in BCP 47. Generally "well-formed" is a better choice.</p>
<p>A well-formed BCP 47 language tag has hyphen-separated subtags with specific lengths and in a particular order. Valid BCP 47 language tags are well-formed but also use only subtags that are listed in the IANA Subtag Registry. Note that the IANA Subtag Registry is frequently updated with new subtags.</p>
<p class="advisement" id="lang_matching_bcp"><a class="self" href="#lang_matching_bcp">​</a>Reference BCP 47 for language tag matching.</p>
<p>BCP 47 contains one RFC dedicated to the syntax and subtags of language tags, and another dedicated to how to match two or more subtags. (This topic needs more detail, and may merit being a separate section.)</p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_values">Defining language values</a></p>
<p class="desc">In <a href="https://w3c.github.io/bp-i18n-specdev/">Internationalization Best Practices for Spec Developers</a>.</p>
</li>
<li class="nonw3">
<p class="link"><a href="https://www.rfc-editor.org/rfc/bcp/bcp47.txt">BCP 47</a></p>
<p class="desc">The IETF specification that indicates how to create language subtags and how to match them.</p>
</li>
</ul>
</section>
<section class="background">
<h5>Background</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a></p>
<p class="desc">An overview of how to create language tags using BCP 47.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="sec_lang_mixed" class="subtopic">
<h3>Declaring language at the resource level</h3>
<p>Here we are talking about an independent unit of data that contains structured text. Examples may include a whole HTML page, an XML document, a JSON file, a WebVTT script, an annotation, etc.</p>
<p class="advisement" id="lang_whole_res"><a class="self" href="#lang_whole_res">​</a>The specification should indicate how to define the default text-processing language for the resource as a whole. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#lang_whole_res">more</a></p>
<p>It often saves trouble to identify the language, or at least the default language, of the resource as a whole in one place. For example, in an HTML file, this is done by setting the <code class="kw" translate="no">lang</code> attribute on the <code class="kw" translate="no">html</code> element.</p>
<p class="advisement" id="lang_inherit"><a class="self" href="#lang_inherit">​</a>Content within the resource should inherit the text-processing language declared at the resource level, unless it is specifically overridden.</p>
<p class="advisement" id="lang_tp_meta"><a class="self" href="#lang_tp_meta">​</a>Consider whether it is necessary to have separate declarations to indicate the text-processing language versus metadata about the expected use of the resource. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#lang_tp_meta">more</a></p>
<p>In many cases a resource contains text in only one language, and in many more cases the language declared as the default language for text-processing is the same as the language that describes the metadata about the resource as a whole. In such cases it makes sense to have a single declaration.</p>
<p>It becomes problematic, however, to use a single declaration when it refers to more than one language unless there is a way to determine which one language should be used as the text-processing default.</p>
<p class="advisement" id="lang_mixing"><a class="self" href="#lang_mixing">​</a>If there is only one language declaration for a resource, and it has more than one language tag as a value, it must be possible to identify the default text-processing language for the resource. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#lang_tp_meta">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/bp-i18n-specdev/#sec_lang_mixed">Declaring language at the resource level</a></p>
<p class="desc">In <a href="https://w3c.github.io/bp-i18n-specdev/">Internationalization Best Practices for Spec Developers</a>.</p>
</li>
</ul>
</section>
<section class="background">
<h5>Background</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/i18n-discuss/notes/annotation-language-use-cases">Use cases for language information in web annotations</a></p>
<p class="desc">Description of use cases for annotations that illustrate the differences between text-processing and metadata types of language declaration.</p>
</li>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/questions/qa-http-and-lang">HTTP headers, meta elements and language information</a></p>
<p class="desc">How the distinction between text-processing language and language metadata plays out in HTML5.</p>
</li>
</ul>
</section>
</section>
<section class="xref"><h4>See also</h4>
<p><a href="#sec_lang_values">Defining language values</a>.</p>
</section>
</section>
<section id="lang_block" class="subtopic">
<h3>Establishing the language of a content block</h3>
<p>The words <dfn>block</dfn> and/or <dfn>chunk</dfn> are used here to refer to a structural component within the resource as a whole that groups content together and separates it from adjacent content. Boundaries between one block and another are equivalent to paragraph or section boundaries in text, or discrete data items inside a file. </p>
<p>For example, this could refer to a block or paragraph in XML or HTML, an object declaration in JSON, a cue in WebVTT, a line in a CSV file, etc. Contrast this with <dfn>inline</dfn> content, which describes a range within a paragraph, sentence, etc.</p>
<p>The interpretation of which structures defined in a spec are relevant to these requirements may require a little consideration, and will depend on the format of the data involved.</p>
<p class="advisement" id="lang_block_inherit"><a class="self" href="#lang_block_inherit">​</a>By default, blocks of content should inherit any text-processing language set for the resource as a whole. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#lang_block_inherit">more</a></p>
<p>See <a href="#sec_lang_decl"></a> for guidance related to the default text-processing language information.</p>
<p class="advisement" id="lang_block_change"><a class="self" href="#lang_block_change">​</a>It should be possible to indicate a change in language for blocks of content where the language changes. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#lang_block_change">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/bp-i18n-specdev/#lang_block">Establishing the language of a content block</a></p>
<p class="desc">In <a href="https://w3c.github.io/bp-i18n-specdev/">Internationalization Best Practices for Spec Developers</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref"><h4>See also</h4>
<p><a href="#sec_lang_values">Defining language values</a>.</p>
</section>
</section>
<section id="lang_inline" class="subtopic">
<h3>Establishing the language of inline runs</h3>
<p>Here we refer to information that needs to be provided for a range of characters in the middle of a paragraph or string.</p>
<p class="advisement" id="lang_inline_spans"><a class="self" href="#lang_inline_spans">​</a>It should be possible to indicate language for spans of inline text where the language changes. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#lang_inline_spans">more</a></p>
<p>Where a switch in language can affect operations on the content, such as spell-checking, rendering, styling, voice production, translation, information retrieval, and so forth, it is necessary to indicate the range of text affected and identify the language of that content.</p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/bp-i18n-specdev/#lang_inline">Establishing the language of inline runs</a></p>
<p class="desc">In <a href="https://w3c.github.io/bp-i18n-specdev/">Internationalization Best Practices for Spec Developers</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref"><h4>See also</h4>
<p><a href="#sec_lang_values">Defining language values</a>.</p>
</section>
</section>
</section>
<section id="text_direction" class="topic">
<h2>Text direction</h2>
<ul class="summary">
<li><a href="#sec_dir_basic">Basic requirements</a></li>
<li><a href="#sec_dir_background">Background information</a></li>
<li><a href="#sec_bidi_markup">Handling direction in markup</a></li>
</ul>
<p>It is important to establish direction for text written or mixed with right-to-left scripts. Characters in these scripts are stored in memory in the order they are typed and pronounced – called the logical order. The Unicode Bidirectional Algorithm (UBA) provides a lot of support for automatically rendering a sequence of characters stored in logical order so that they are visually ordered as expected. Unfortunately, the UBA alone is not sufficient to correctly render bidirectional text, and additional information has to be provided about the default directional context to apply for a given sequence of characters.</p>
<section id="sec_dir_basic" class="subtopic">
<h3>Basic requirements</h3>
<p>The basic requirements are as follows.</p>
<p class="advisement" id="dir_paragraphs"><a class="self" href="#dir_paragraphs">​</a>It must be possible to indicate base direction for each individual paragraph-level item of natural language text that will be read by someone. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#dir_paragraphs">more</a></p>
<p class="advisement" id="dir_inline"><a class="self" href="#dir_inline">​</a>It must be possible to indicate base direction changes for embedded runs of inline bidirectional text for all natural language text that will be read by someone. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#dir_inline">more</a></p>
<p class="advisement" id="dir_reasonable"><a class="self" href="#dir_reasonable">​</a>Annotating right-to-left text must require the minimum amount of effort for people who work natively with right-to-left scripts. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#dir_reasonable">more</a></p>
<p>Requiring a speaker of Arabic, Divehi, Hebrew, Persian, Urdu, etc. to add markup or control characters to every paragraph or small data item they write is far too much to be manageable. Typically, the format should establish a default direction and require the user to intervene only when exceptions have to be dealt with.</p>
</section>
<section id="sec_dir_background" class="subtopic">
<h3>Background information</h3>
<p>In this section we try to set out some key concepts associated with text direction, so that it will be easier to understand the recommendations that follow.</p>
<section id="sec_dir_defs">
<h4>Important definitions</h4>
<p>In order to correctly display text written in a 'right-to-left' script or left-to-right text containing bidirectional elements, it is important to establish the <a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics#context" class="termref">base direction</a> that will be used to dictate the order in which elements of the text will be displayed.</p>
<p>If you are not familiar with what the Unicode Bidirectional Algorithm (UBA) does and doesn't do, and why the base direction is so important, read <a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode Bidirectional Algorithm basics</a>.</p>
<aside class="example">
<p>For example, the following annotation will not display correctly unless the application doing the display knows that the base direction needs to be right-to-left.</p>
<pre>{
"@context": "http://www.w3.org/ns/anno.jsonld",
"id": "http://example.org/anno5",
"type":"Annotation",
"body": {
"type" : "TextualBody",
"text" : "פעילות הבינאום, W3C",
"format" : "text/html",
"language" : "he"
},
"target": "http://example.org/photo1"
}
</pre>
<p>You would expect the phrase in the <code class="kw" translate="no">text</code> property value to be displayed as</p>
<p><span dir="rtl">פעילות הבינאום, W3C</span></p>
<p>however, if there is no indication that the base direction should be right-to-left the following incorrect display will be produced:</p>
<p>פעילות הבינאום, W3C</p>
</aside>
<p>In this section, the word <dfn>paragraph</dfn> indicates a run of text followed by a hard line-break in plain text, but may signify different things in other situations. In CSV it equates to 'cell', so a single line of comma-separated items is actually a set of comma-separated paragraphs. In HTML it equates to the lowest level of block element, which is often a <code class="kw" translate="no">p</code> element, but may be things such as <code class="kw" translate="no">div</code>, <code class="kw" translate="no">li</code>, etc., if they only contain text and/or inline elements. In JSON, it often equates to a quoted string value, but if a string value uses markup then paragraphs are associated with block elements, and if the string value is multiple lines of plain text then each line is a paragraph.</p>
<p>The term <dfn>metadata</dfn> is used here to mean information which could be an annotation or property associated with the data, or could be markup in scenarios that allow that, or could be a higher-level protocol, etc.</p>
</section>
<section id="setting_bd">
<h4>Ways base direction can be set for paragraphs</h4>
<p>There are a number of possible ways of setting the base direction.</p>
<ol>
<li>The base direction of a paragraph may be set by an application or a user applying metadata to the paragraph. Typical values for base direction may include <code class="kw" translate="no">ltr</code>, <code class="kw" translate="no">rtl</code> or <code class="kw" translate="no">auto</code>.
<ul>
<li>The metadata may specifically indicate that heuristics should be used. Then you would expect to consider the actual characters used in order to determine the base direction. (This is what happens if you set <code>dir=auto</code> on an HTML element.)</li>
<li>The application may expect metadata, but there may be no such information provided. In this case you would usually expect there to be a default direction specified, and the base direction for a paragraph would be set to that default. The default is usually LTR. (This is what happens if you have no <code class="kw" translate="no">dir</code> attributes in your HTML file.)</li>
<li>Where a format contains many paragraphs or chunks of information, and the language of text in all those chunks is the same, it is sometimes useful to allow a default base direction to be set for and inherited by all. This is what happens when you set the <code class="kw" translate="no">dir</code> attribute on the <code class="kw" translate="no">html</code> tag in HTML. Another example would be a subtitling file containing many cues, all written in Arabic; it would be best to allow the author to say at the start of the file that the default is RTL for all cue text. There should always be a way to override the direction information for a specific paragraph where needed.</li>
</ul></li>
<li>If the application expects no metadata to be available it should use heuristics to determine the base direction for each paragraph/cell. A typical solution, and one described by UAX 9 <cite>Unicode Bidirectional Algorithm</cite>, is to look for the first-strong character in the paragraph/cell. (This is likely to apply if you are looking at plain text that is not expected to be associated with metadata. It only happens with HTML if the direction is set to <code class="kw" translate="no">auto</code>, since HTML specifies a default direction.)
<ul style="margin-left:0; margin-right:1em;">
<li>Not all paragraphs using the first-strong method will have the correct base direction applied. In some cases, an Arabic or Hebrew, etc, paragraph may start with strong LTR characters. There must be a way to deal with this.</li>
<li>Where a syntactic unit contains multiple lines of plain text (for example, a multiline cue text in a subtitling file), the first-strong heuristic needs to be applied to each line separately.</li>
<li>There may be special rules that involve ignoring some sequence of characters or type of markup at the start of the paragraph before identifying the first strong character.</li>
<li>In some cases there are no strong characters in a paragraph, and the base direction can be critically important for the data to be understood correctly, eg. telephone numbers or MAC addresses. There needs to be a way to resort to an appropriate default for these cases.</li>
</ul></li>
<li>Whether or not any metadata is specified, if a paragraph contains a string that starts with one of the Unicode bidi control characters RLI, LRI, FSI, LRE, RLE, LRO, or RLO and ends with PDF/PDI, these characters will determine the base direction for the contained string. These characters, when placed in the content, explicitly override any previously set direction by creating an inline range and assigning a base direction to it.
<ul style="margin-left:0; margin-right:1em;"><li>The effect of such characters does not extend past paragraph boundaries, but the range ought to be explicitly ended using the PDF/PDI control character, especially if a paragraph end is not easily detectable by the application.</li>
<li>Because isolation is needed for bidirectional text to work properly, the Unicode Standard says that the isolating control codes RLI, LRI and FSI should be used rather than LRE or RLE. Unfortunately, those characters are still not widely supported.</li>
<li>For structural components in markup, above the paragraph level, it is not possible to use the Unicode bidi control characters to define direction for paragraphs, since these are inline controls only, and the effect is terminated by a paragraph end.</li>
</ul>
</li>
</ol>
<p>When capturing text input by a user it is usually necessary to understand the context in which the user was inputting the data to determine the base direction of the input. In HTML, for example, this may be set by the direction inherited from the <code class="kw" translate="no">html</code> tag, or by the user pressing keys to set the base direction for a form field. It is then necessary to find some way of storing the information about base direction or associating it with the string when rendered. Typically, in this situation, any direction changes internal to the string being input are handled by the user and will be captured as part of the string.</p>
</section>
<section id="inline_changes">
<h4>Inline changes to base direction</h4>
<p>Embedded ranges of text <em>within</em> a single paragraph may need to have a different base direction. For example, </p>
<p>"The title was '!NOITASILANOITANRETNI'."</p>
<p>where the span within the single quotes is in Hebrew/Arabic/Divehi, etc., and needs to have a RTL base direction, instead of the LTR base direction of the surrounding paragraph, in order to place the exclamation mark correctly. </p>
<p>If markup is available to the content author, it is likely to be easier and safer to use markup to indicate such inline ranges (see below). In HTML you would usually use an inline element with a <code class="kw" translate="no">dir</code> attribute to establish the base direction for such runs of text. If you can't mark up the text, such as in HTML's <code class="kw" translate="no">title</code> element, or any environment that handles only plain text content, you have to resort to Unicode's paired control characters to establish the base direction for such an internal range.</p>
<p>Furthermore, inline ranges where the base direction is changed should be isolated from surrounding text, so that the UBA doesn't produce incorrect results due to interference across boundaries. <a href="https://www.w3.org/International/articles/inline-bidi-markup/#uc5">See an example</a> of how this can produce incorrect ordering of things such as text followed by numbers in HTML, or <a href="https://www.w3.org/International/articles/inline-bidi-markup/#usecase3">another example</a> of how it can affect lists.</p>
<p>This means that if a content author is using Unicode control codes they should use RLI/LRI...PDI rather than RLE/LRE...PDF. These isolating codes are fairly new, and applications may not yet support them.</p>
</section>
<section id="control_problems">
<h4>Problems with control characters</h4>
<p>Reasons to avoid relying on control characters to set direction include the following:</p>
<ol>
<li>They are invisible in most editors and are therefore difficult to work with, and can easily lead to orphans and overlapping ranges. They can be particularly difficult to manage when editing bidirectional inline text because it's hard to position the cursor in the correct place. If you ask someone who writes in a right-to-left script, you are likely to find that they dislike using control codes.</li>
<li>Users often don't have the necessary characters available on their keyboard, or have difficulty inputting them.</li>
<li>It is sometimes necessary to choose which to use based on context or the type of the data, and this means that a content author typically needs to select the control codes – specifying control codes in this way for all paragraphs is time-consuming and error-prone.</li>
<li>Processors that extract parts of the data, add to it, or reuse in combination with other text may incorrectly handle the control codes.</li>
<li>Search and comparison algorithms should ignore these characters, but typically don't.</li>
</ol>
<p>The last two items above may also hold for markup, but implementers often support included markup better than included control codes.</p>
<p>Don't expect users to add control codes at the start and end of every paragraph. That's far too much work.</p>
</section>
<section id="rlmlrm">
<h4>RLM and LRM</h4>
<p>A word about the Unicode characters <span class="uname">U+200F RIGHT-TO-LEFT MARK</span> (RLM) and <span class="uname">U+200E LEFT-TO-RIGHT MARK</span> (LRM) is warranted at this point.</p>
<p>The first point to be clear about is that neither RLM nor LRM establishes the base direction for a range of text. They are simply invisible characters with strong directional properties.</p>
<p>This means that you cannot use RLM, for example, to make the text W3C appear to the left of the Hebrew text in the following example.</p>
<p>The title is "<span dir="rtl">פעילות הבינאום, W3C</span>".</p>
<p>For this you can only use metadata or the paired control characters.</p>
<p>Of course, if you are detecting base direction using first-strong heuristics then RLM and LRM can be useful for setting the base direction where the text in question begins with something that would otherwise give the wrong result, eg. </p>
<p>"<span dir="rtl">نشاط التدويل</span>" is how you say "i18n Activity" in Arabic.</p>
<p>Here an LRM could be placed at the start of the text, before the strong RTL Arabic characters, to prevent the algorithm from assuming that the text should be right-to-left. (Remember that if metadata is used to set the base direction, that character is ignored, unless the metadata specifically says that first-strong heuristics should be used.)</p>
</section>
<section id="bd_language">
<h4>Base direction and language</h4>
<p class="advisement" id="bidi_lang"><a class="self" href="#bidi_lang">​</a>Do not assume that direction can be determined from language information. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#bidi_lang">more</a></p>
<p>The following are all reasons you cannot use language tags to provide information about base direction:</p>
<ol>
<li>you can't produce the <code class="kw" translate="no">auto</code> value with language tags.</li>
<li>some languages are written with both RTL and LTR scripts.</li>
<li>the only reliable part of the language tag that would indicate the base direction is the script tag, but BCP47 recommends that you suppress the use of the script tag for languages that don't usually need it, such as Hebrew (<code>suppressscript: Hebr</code>). Languages, such as Persian, that are usually written in a RTL script may be written in transcribed form, and it's not possible to guarantee that the necessary script tag would be present to carry the directional information. In summary, you won't be able to rely on people supplying script tags as part of the language information in order to influence direction.</li>
<li>the incidence of use of language tags and base direction markers often don't coincide.</li>
<li>they are not semantically equivalent.</li>
</ol>
</section>
</section>
<section id="sec_bidi_markup" class="subtopic">
<h3>Handling direction in markup</h3>
<section id="sec_default_base">
<h4>Setting the default base direction</h4>
<p class="advisement" id="bidi_whole_res"><a class="self" href="#bidi_whole_res">​</a>The spec should indicate how to define a default base direction for the resource as a whole, ie. set the overall base direction. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#bidi_whole_res">more</a></p>
<p class="advisement" id="bidi_res_default"><a class="self" href="#bidi_res_default">​</a>The default base direction, in the absence of other information, should be LTR. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#bidi_res_default">more</a></p>
</section>
<section id="sec_bidi_values">
<h4>Base direction values</h4>
<p class="advisement" id="bidi_values"><a class="self" href="#bidi_values">​</a>Values for the default base direction should include left-to-right, right-to-left, and auto. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#bidi_values">more</a></p>
<p>The <code class="kw" translate="no">auto</code> value allows automatic detection of the base direction for a piece of text. For example, the <code class="kw" translate="no">auto</code> value of <code class="kw" translate="no">dir</code> in HTML looks for the first strong directional character in the text, but ignores certain items of markup also, to guess the base direction of the text. Note that automatic detection algorithms are far from perfect. First-strong detection is unable to correctly identify text that is really right-to-left, but that begins with a strong LTR character. Algorithms that attempt to judge the base direction based on contents of the text are also problematic. The best scenario is one where the base direction is known and declared.</p>
</section>
<section id="bidi_block">
<h4>Establishing the base direction for paragraphs</h4>
<p class="advisement" id="bidi_block_change"><a class="self" href="#bidi_block_change">​</a>The content author must be able to indicate parts of the text where the base direction changes. At the block level, this should be achieved using attributes or metadata, and should not rely on Unicode control characters. </p>
<p>Relying on Unicode control characters to establish direction for every block is not feasible because line breaks terminate the effect of such control characters. It also makes the data much less stable, and unnecessarily difficult to manage if control characters have to appear at every point where they would be needed.</p>
<p class="advisement" id="bidi_block_auto"><a class="self" href="#bidi_block_auto">​</a>It must be possible to also set the direction for content fragments to <code class="kw" translate="no">auto</code>. This means that the base direction will be determined by examining the content itself.</p>
<p>A typical approach here would be to set the direction based on the first strong directional character outside of any markup, but this is not the only possible method. The algorithm used to determine directionality when direction is set to auto should match that expected by the receiver.</p>
<p>The first-strong algorithm looks for the first character in the paragraph with a strong directional property according to the Unicode definitions. It then sets the base direction of the paragraph according to the direction of that character.</p>
<p>Note that the first-strong algorithm may incorrectly guess the direction of the paragraph when the first character is not typical of the rest of the paragraph, such as when a RTL paragraph or line starts with a LTR brand name or technical term.</p>
<p>For additional information about algorithms for detecting direction, see <a href="https://www.w3.org/TR/html-bidi/#auto-direction-algorithms">Estimation algorithms</a> in the document where this was discussed with reference to HTML.</p>
<p class="advisement" id="bidi_block_para"><a class="self" href="#bidi_block_para">​</a>If the overall base direction is set to <code class="kw" translate="no">auto</code> for plain text, the direction of content paragraphs should be determined on a paragraph by paragraph basis. </p>
<p class="advisement" id="bidi_block_befaft"><a class="self" href="#bidi_block_befaft">​</a>To indicate the sides of a block of text relative to the start and end of its contained lines, you should use 'before' and 'after' (maybe block-start/block-end – the terminology is changing), rather than 'top' and 'bottom'. </p>
<p class="advisement" id="bidi_inline_start_end"><a class="self" href="#bidi_inline_start_end">​</a>To indicate the start/end of a line you should use 'start' and 'end' rather than 'left' and 'right'. </p>
<p class="advisement" id="bidi_dedicated_attr"><a class="self" href="#bidi_dedicated_attr">​</a>Provide dedicated attributes for control of base direction and bidirectional overrides; do not rely on the user applying style properties to arbitrary markup to achieve bidi control.</p>
<p>For example, HTML has a <code class="kw" translate="no">dir</code> attribute that is capable of managing base direction without assistance from CSS styling. XML formats should define dedicated markup to represent directional information, even if they need CSS to achieve the required display, since the text may be used in other ways.</p>
<p>Style sheets such as CSS may not always be used with the data, or carried with the data when it is syndicated, etc. Directional information is fundamentally important to correct display of the data, and should be associated more closely and more permanently with the markup or data.</p>
</section>
<section id="bidi_inline">
<h4>Setting base direction for inline text</h4>
<p class="advisement" id="bidi_inline_change"><a class="self" href="#bidi_inline_change">​</a>It must be possible to indicate spans of inline text where the base direction changes. If markup is available, this is the preferred method. Otherwise your specification must require that Unicode control characters are recognized by the receiving application, and correctly implemented.</p>
<p class="advisement" id="bidi_inline_auto"><a class="self" href="#bidi_inline_auto">​</a>It must be possible to also set the direction for a span to auto. This means that the base direction will be determined by examining the content itself. A typical approach here would be to set the direction based on the first strong directional character outside of any markup. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#bidi_inline_auto">more</a></p>
<p>The first-strong algorithm looks for the first character in the paragraph with a strong directional property according to the Unicode definitions. It then sets the base direction of the paragraph according to the direction of that character.</p>
<p>Note that the first-strong algorithm may incorrectly guess the direction of the paragraph when the first character is not typical of the rest of the paragraph, such as when a RTL paragraph or line starts with a LTR brand name or technical term.</p>
<p>For additional information about algorithms for detecting direction, see <a href="https://www.w3.org/TR/html-bidi/#auto-direction-algorithms">Estimation algorithms</a> in the document where this was discussed with reference to HTML.</p>
<p class="advisement" id="bidi_inline_rli"><a class="self" href="#bidi_inline_rli">​</a>If users use Unicode bidirectional control characters, the RLI/LRI/FSI with PDI characters must be supported by the application and recommended (rather than RLE/LRE with PDF) by the spec.</p>
<p class="advisement" id="bidi_inline_rlm"><a class="self" href="#bidi_inline_rlm">​</a>Use of RLM/LRM should be appropriate, and expectations of what those controls can and cannot do should be clear in the spec. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#bidi_inline_rlm">more</a></p>
<p>The Unicode bidirectional control characters <span class="uname">U+200F RIGHT-TO-LEFT MARK</span> and <span class="uname">U+200E LEFT-TO-RIGHT MARK</span> are not sufficient on their own to manage bidirectional text. They cannot produce a different base direction for embedded text. For that you need to be able to indicate the start and end of the range of the embedded text. This is best done by markup, if available, or failing that using the other Unicode bidirectional controls mentioned just above.</p><p class="advisement" id="bidi_inline_dedicated_attr"><a class="self" href="#bidi_inline_dedicated_attr">​</a>Provide dedicated attributes for control of base direction and bidirectional overrides; do not rely on the user applying style properties to arbitrary markup to achieve bidi control.</p>
<p class="advisement" id="bidi_inline_all_elems"><a class="self" href="#bidi_inline_all_elems">​</a>Allow bidi attributes on all inline elements in markup that contain text.</p>
<p class="advisement" id="bidi_inline_embed"><a class="self" href="#bidi_inline_embed">​</a>Provide attributes that allow the user to (a) create an embedded base direction or (b) override the bidirectional algorithm altogether; the attribute should allow the user to set the direction to LTR or RTL in either of these two scenarios.</p>
</section>
<!--section class="links">
<h5>Links</h5>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://w3c.github.io/bp-i18n-specdev/#sec_bidi_resource">Setting the default base direction</a></p>
<p class="desc">In <a href="https://w3c.github.io/bp-i18n-specdev/">Internationalization Best Practices for Spec Developers</a>.</p>
</li>
<li class="nonw3">
<p class="link"><a href="http://unicode.org/reports/tr9/">Unicode Bidirectional Algorithm</a></p>
<p class="desc">In Unicode® Standard Annex #9. Specifies the detail of how the bidirectional algorithm works.</p>
</li>
</ul>
</section>
<section class="background">
<h5>Background</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode Bidirectional Algorithm basics</a></p>
<p class="desc">Article describing the basics about how the Unicode Bidirectional Algorithm works.</p>
</li>
</ul>
</section>
</section-->
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/html-bidi/#auto-direction-algorithms">Estimation algorithms</a></p>
<p class="desc">In <a href="https://www.w3.org/TR/html-bidi/">Additional Requirements for Bidi in HTML &amp; CSS</a>.</p>
</li>
</ul>
</section>
</section>
</section>
</section>
<section id="characters" class="topic">
<h2>Characters</h2>
<ul class="summary">
<li><a href="#char_def">Choosing a definition of 'character' </a></li>
<li><a href="#char_referencemodel">Defining a Reference Processing Model </a></li>
<li><a href="#char_ranges">Including and excluding character ranges</a></li>
<li><a href="#char_pua">Using the Private Use Area </a></li>
<li><a href="#char_choosing">Choosing character encodings </a></li>
<li><a href="#char_identifying">Identifying character encodings </a></li>
<li><a href="#char_escapes">Designing character escapes </a></li>
<li><a href="#char_storing">Storing text </a></li>
<li><a href="#char_sort">Specifying sort and search functionality </a></li>
<li><a href="#char_string">Defining 'string' </a></li>
<li><a href="#char_indexing">Indexing strings </a></li>
<li><a href="#char_unicoderef">Referencing the Unicode Standard </a></li>
</ul>
<p>See the <a href="https://www.w3.org/TR/charmod/">Character Model for the World Wide Web: Fundamentals</a> for basic guidelines related to the use of characters and encodings.</p>
<p>See the <a href="https://encoding.spec.whatwg.org/">Encoding</a> specification for further guidelines related to use of character encodings.</p>
<p>Another Character Model document is currently in development, entitled <a href="https://w3c.github.io/charmod-norm/">String Matching and Searching</a>. It looks at issues that arise when you try to compare two strings, be it identifiers or authored content.</p>
<section id="char_def" class="subtopic">
<h3>Choosing a definition of 'character'</h3>
<p class="advisement" id="char_sounds"><a class="self" href="#char_sounds">​</a>Specifications, software and content MUST NOT require or depend on a one-to-one correspondence between characters and the sounds of a language. <a href="https://www.w3.org/TR/charmod/#C001">more</a></p>
<p class="advisement" id="char_display"><a class="self" href="#char_display">​</a>Specifications, software and content MUST NOT require or depend on a one-to-one mapping between characters and units of displayed text. <a href="https://www.w3.org/TR/charmod/#C002">more</a></p>
<p class="advisement" id="char_logical"><a class="self" href="#char_logical">​</a>Protocols, data formats and APIs MUST store, interchange or process text data in logical order. <a href="https://www.w3.org/TR/charmod/#C003">more</a></p>
<p class="advisement" id="char_logical_storage"><a class="self" href="#char_logical_storage">​</a>Independent of whether some implementation uses logical selection or visual selection, characters selected MUST be kept in logical order in storage. <a href="https://www.w3.org/TR/charmod/#C075">more</a></p>
<p class="advisement" id="char_logical_discontiguous"><a class="self" href="#char_logical_discontiguous">​</a>Specifications of protocols and APIs that involve selection of ranges SHOULD provide for discontiguous logical selections, at least to the extent necessary to support implementation of visual selection on screen on top of those protocols and APIs. <a href="https://www.w3.org/TR/charmod/#C004">more</a></p>
<p class="advisement" id="char_keystroke"><a class="self" href="#char_keystroke">​</a>Specifications and software MUST NOT require nor depend on a single keystroke resulting in a single character, nor that a single character be input with a single keystroke (even with modifiers), nor that keyboards are the same all over the world. <a href="https://www.w3.org/TR/charmod/#C005">more</a></p>
<p class="advisement" id="char_physical_storage"><a class="self" href="#char_physical_storage">​</a>Specifications, software and content MUST NOT require or depend on a one-to-one relationship between characters and units of physical storage. <a href="https://www.w3.org/TR/charmod/#C009">more</a></p>
<p class="advisement" id="char_define"><a class="self" href="#char_define">​</a>When specifications use the term 'character' the specifications MUST define which meaning they intend. <a href="https://www.w3.org/TR/charmod/#C010">more</a></p>
<p class="advisement" id="char_specific"><a class="self" href="#char_specific">​</a>Specifications SHOULD use specific terms, when available, instead of the general term 'character'. <a href="https://www.w3.org/TR/charmod/#C067">more</a></p>
<section class="links"><h4>Links</h4>
<section class="howto"><h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Perceptions">Perceptions of Characters </a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/#sec-Characters">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref"><h4>See also</h4>
<p><a href="#char_string">Defining 'string'</a>.</p>
</section>
</section>
<section id="char_referencemodel" class="subtopic">
<h3>Defining a Reference Processing Model</h3>
<p class="advisement" id="char_single_enc"><a class="self" href="#char_single_enc">​</a>Textual data objects defined by protocol or format specifications MUST be in a single character encoding. <a href="https://www.w3.org/TR/charmod/#C013">more</a></p>
<p class="advisement" id="char_rpm"><a class="self" href="#char_rpm">​</a>All specifications that involve processing of text MUST specify the processing of text according to the Reference Processing Model described by the rest of the recommendations in this list. <a href="https://www.w3.org/TR/charmod/#C014">more</a></p>
<p class="advisement" id="char_unicode_chars"><a class="self" href="#char_unicode_chars">​</a>Specifications MUST define text in terms of Unicode characters, not bytes or glyphs. <a href="https://www.w3.org/TR/charmod/#C014">more</a></p>
<p class="advisement" id="char_transcode"><a class="self" href="#char_transcode">​</a>For their textual data objects specifications MAY allow use of any character encoding which can be transcoded to a Unicode encoding form. <a href="https://www.w3.org/TR/charmod/#C014">more</a></p>
<p class="advisement" id="char_as_unicode"><a class="self" href="#char_as_unicode">​</a>Specifications MAY choose to disallow or deprecate some character encodings and to make others mandatory. Independent of the actual character encoding, the specified behavior MUST be the same as if the processing happened as follows: (a) The character encoding of any textual data object received by the application implementing the specification MUST be determined and the data object MUST be interpreted as a sequence of Unicode characters - this MUST be equivalent to transcoding the data object to some Unicode encoding form, adjusting any character encoding label if necessary, and receiving it in that Unicode encoding form, (b) All processing MUST take place on this sequence of Unicode characters, (c) If text is output by the application, the sequence of Unicode characters MUST be encoded using a character encoding chosen among those allowed by the specification. <a href="https://www.w3.org/TR/charmod/#C014">more</a></p>
<p class="advisement" id="char_different_enc"><a class="self" href="#char_different_enc">​</a>If a specification is such that multiple textual data objects are involved (such as an XML document referring to external parsed entities), it MAY choose to allow these data objects to be in different character encodings. In all cases, the Reference Processing Model MUST be applied to all textual data objects. <a href="https://www.w3.org/TR/charmod/#C014">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Characters">Digital Encoding of Characters</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/#sec-Characters">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref">
<h4>See also</h4>
<p><a href="#char_ranges">Including and excluding character ranges</a>.</p>
</section>
</section>
<section id="char_ranges" class="subtopic">
<h3>Including and excluding character ranges </h3>
<p class="advisement" id="char_exclude"><a class="self" href="#char_exclude">​</a>Specifications SHOULD NOT arbitrarily exclude code points from the full range of Unicode code points from U+0000 to U+10FFFF inclusive. <a href="https://www.w3.org/TR/charmod/#C070">more</a></p>
<p class="advisement" id="char_10ffff"><a class="self" href="#char_10ffff">​</a>Specifications MUST NOT allow code points above U+10FFFF. <a href="https://www.w3.org/TR/charmod/#C077">more</a></p>
<p class="advisement" id="char_internal_use"><a class="self" href="#char_internal_use">​</a>Specifications SHOULD NOT allow the use of code points reserved by Unicode for internal use. <a href="https://www.w3.org/TR/charmod/#C079">more</a></p>
<p class="advisement" id="char_surrogate"><a class="self" href="#char_surrogate">​</a>Specifications MUST NOT allow the use of surrogate code points. <a href="https://www.w3.org/TR/charmod/#C078">more</a></p>
<p class="advisement" id="char_compatibility"><a class="self" href="#char_compatibility">​</a>Specifications SHOULD exclude compatibility characters in the syntactic elements (markup, delimiters, identifiers) of the formats they define. <a href="https://www.w3.org/TR/charmod/#C050">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Characters">Digital Encoding of Characters</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/#sec-Characters">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref">
<h4>See also</h4>
<p><a href="#char_pua">Using the Private Use Area</a>.</p>
</section>
</section>
<section id="char_pua" class="subtopic">
<h3>Using the Private Use Area</h3>
<p class="advisement" id="char_not_pua"><a class="self" href="#char_not_pua">​</a>Specifications MUST NOT require the use of private use area characters with particular assignments. <a href="https://www.w3.org/TR/charmod/#C038">more</a></p>
<p class="advisement" id="char_pua_mechanisms"><a class="self" href="#char_pua_mechanisms">​</a>Specifications MUST NOT require the use of mechanisms for defining agreements of private use code points. <a href="https://www.w3.org/TR/charmod/#C039">more</a></p>
<p class="advisement" id="char_pua_allow"><a class="self" href="#char_pua_allow">​</a>Specifications and implementations SHOULD NOT disallow the use of private use code points by private agreement. <a href="https://www.w3.org/TR/charmod/#C040">more</a></p>
<p class="advisement" id="char_symbols"><a class="self" href="#char_symbols">​</a>Specifications MAY define markup to allow the transmission of symbols not in Unicode or to identify specific variants of Unicode characters. <a href="https://www.w3.org/TR/charmod/#C041">more</a></p>
<p class="advisement" id="char_pictures"><a class="self" href="#char_pictures">​</a>Specifications SHOULD allow the inclusion of or reference to pictures and graphics where appropriate, to eliminate the need to (mis)use character-oriented mechanisms for pictures or graphics. <a href="https://www.w3.org/TR/charmod/#C068">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-PrivateUse">Private use code points</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/#sec-Characters">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref">
<h4>See also</h4>
<p><a href="#char_ranges">Including and excluding character ranges</a>.</p>
</section>
</section>
<section id="char_choosing" class="subtopic">
<h3>Choosing character encodings</h3>
<p class="advisement" id="char_identification"><a class="self" href="#char_identification">​</a>Specifications MUST either specify a unique character encoding, or provide character encoding identification mechanisms such that the encoding of text can be reliably identified. <a href="https://www.w3.org/TR/charmod/#C015">more</a></p>
<p class="advisement" id="char_unique_for_new"><a class="self" href="#char_unique_for_new">​</a>When designing a new protocol, format or API, specifications SHOULD require a unique character encoding. <a href="https://www.w3.org/TR/charmod/#C016">more</a></p>
<p class="advisement" id="char_enc_rules"><a class="self" href="#char_enc_rules">​</a>When basing a protocol, format, or API on a protocol, format, or API that already has rules for character encoding, specifications SHOULD use rather than change these rules. <a href="https://www.w3.org/TR/charmod/#C017">more</a></p>
<p class="advisement" id="char_use_utf8"><a class="self" href="#char_use_utf8">​</a>When a unique character encoding is required, the character encoding MUST be UTF-8, UTF-16 or UTF-32. <a href="https://www.w3.org/TR/charmod/#C018">more</a></p>
<p>This guideline needs further consideration: UTF-16 and UTF-32 are not recommended these days. UTF-8 is the recommended encoding.</p>
<p class="advisement" id="char_charset"><a class="self" href="#char_charset">​</a>Specifications SHOULD avoid using the terms 'character set' and 'charset' to refer to a character encoding, except when the latter is used to refer to the MIME charset parameter or its IANA-registered values. The term 'character encoding', or in specific cases the terms 'character encoding form' or 'character encoding scheme', are RECOMMENDED. <a href="https://www.w3.org/TR/charmod/#C020">more</a></p>
<p class="advisement" id="char_iana"><a class="self" href="#char_iana">​</a>If the unique encoding approach is not taken, specifications SHOULD require the use of the IANA charset registry names, and in particular the names identified in the registry as 'MIME preferred names', to designate character encodings in protocols, data formats and APIs. <a href="https://www.w3.org/TR/charmod/#C021">more</a></p>
<p>This guideline needs further consideration: the character encodings recommended for Web specifications are listed in the Encoding specification.</p>
<p class="advisement" id="char_non_iana"><a class="self" href="#char_non_iana">​</a>Character encodings that are not in the IANA registry SHOULD NOT be used, except by private agreement. <a href="https://www.w3.org/TR/charmod/#C022">more</a></p>
<p class="advisement" id="char_x"><a class="self" href="#char_x">​</a>If an unregistered character encoding is used, the convention of using 'x-' at the beginning of the name MUST be followed. <a href="https://www.w3.org/TR/charmod/#C023">more</a></p>
<p class="advisement" id="char_not_unique"><a class="self" href="#char_not_unique">​</a>If the unique encoding approach is not chosen, specifications MUST designate at least one of the UTF-8 and UTF-16 encoding forms of Unicode as admissible character encodings and SHOULD choose at least one of UTF-8 or UTF-16 as required encoding forms (encoding forms that MUST be supported by implementations of the specification). <a href="https://www.w3.org/TR/charmod/#C026">more</a></p>
<p class="advisement" id="char_default"><a class="self" href="#char_default">​</a>Specifications that require a default encoding MUST define either UTF-8 or UTF-16 as the default, or both if they define suitable means of distinguishing them. <a href="https://www.w3.org/TR/charmod/#C027">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Encodings">Choice and identification of character encodings</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/#sec-Characters">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
<section class="background">
<h5>Background reading</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/questions/qa-doc-charset">Document character set</a></p>
<p class="desc">What is the 'Document Character Set' for XML and HTML, and how does it relate to the encodings I use for my documents? </p>
</li>
</ul>
</section>
</section>
</section>
<section id="char_identifying" class="subtopic">
<h3>Identifying character encodings</h3>
<p class="advisement" id="char_heuristics"><a class="self" href="#char_heuristics">​</a>Specifications MUST NOT propose the use of heuristics to determine the encoding of data. <a href="https://www.w3.org/TR/charmod/#C028">more</a></p>
<p class="advisement" id="char_conflict"><a class="self" href="#char_conflict">​</a>Specifications MUST define conflict-resolution mechanisms (e.g. priorities) for cases where there is multiple or conflicting information about character encoding. <a href="https://www.w3.org/TR/charmod/#C035">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Encodings">Choice and identification of character encodings</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/#sec-Characters">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="char_escapes" class="subtopic">
<h3>Designing character escapes</h3>
<p class="advisement" id="char_escaping"><a class="self" href="#char_escaping">​</a>Specifications should provide a mechanism for escaping characters, particularly those which are invisible or ambiguous. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#char_escaping">more</a></p>
<p>It is generally recommended that character escapes be provided so that sequences that are difficult to enter or edit can be introduced using a plain text editor. Escape sequences are particularly useful for invisible or ambiguous Unicode characters, including zero-width spaces, soft hyphens, various bidi controls, Mongolian vowel separators, etc.</p>
<p>For advice on use of escapes in markup, but which is mostly generalisable to other formats, see <a href="https://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>.</p>
<p class="advisement" id="char_esc_new"><a class="self" href="#char_esc_new">​</a>Specifications SHOULD NOT invent a new escaping mechanism if an appropriate one already exists. <a href="https://www.w3.org/TR/charmod/#C042">more</a></p>
<p class="advisement" id="char_esc_alternates"><a class="self" href="#char_esc_alternates">​</a>The number of different ways to escape a character SHOULD be minimized (ideally to one). <a href="https://www.w3.org/TR/charmod/#C043">more</a></p>
<p class="advisement" id="char_esc_end"><a class="self" href="#char_esc_end">​</a>Escape syntax SHOULD require either explicit end delimiters or a fixed number of characters in each character escape. Escape syntaxes where the end is determined by any character outside the set of characters admissible in the character escape itself SHOULD be avoided. <a href="https://www.w3.org/TR/charmod/#C044">more</a></p>
<p class="advisement" id="char_esc_hex"><a class="self" href="#char_esc_hex">​</a>Whenever specifications define character escapes that allow the representation of characters using a number, the number MUST represent the Unicode code point of the character and SHOULD be in hexadecimal notation. <a href="https://www.w3.org/TR/charmod/#C045">more</a></p>
<p class="advisement" id="char_esc_acceptable"><a class="self" href="#char_esc_acceptable">​</a>Escaped characters SHOULD be acceptable wherever their unescaped forms are; this does not preclude that syntax-significant characters, when escaped, lose their significance in the syntax. In particular, if a character is acceptable in identifiers and comments, then its escaped form should also be acceptable. <a href="https://www.w3.org/TR/charmod/#C046">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Escaping">Character escaping</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/#sec-Characters">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="char_storing" class="subtopic">
<h3>Storing text</h3>
<p class="advisement" id="char_storing_logical"><a class="self" href="#char_storing_logical">​</a>Protocols, data formats and APIs MUST store, interchange or process text data in logical order. <a href="https://www.w3.org/TR/charmod/#C003">more</a></p>
<p class="advisement" id="char_storing_discontiguous"><a class="self" href="#char_storing_discontiguous">​</a>Specifications of protocols and APIs that involve selection of ranges SHOULD provide for discontiguous logical selections, at least to the extent necessary to support implementation of visual selection on screen on top of those protocols and APIs. <a href="https://www.w3.org/TR/charmod/#C004">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-LogicalOrder">Visual rendering and logical order</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="char_sort" class="subtopic">
<h3>Specifying sort and search functionality</h3>
<p class="advisement" id="char_sort_units"><a class="self" href="#char_sort_units">​</a>Software that sorts or searches text for users SHOULD do so on the basis of appropriate collation units and ordering rules for the relevant language and/or application. <a href="https://www.w3.org/TR/charmod/#C006">more</a></p>
<p class="advisement" id="char_sort_user"><a class="self" href="#char_sort_user">​</a>Where searching or sorting is done dynamically, particularly in a multilingual environment, the 'relevant language' SHOULD be determined to be that of the current user, and may thus differ from user to user. <a href="https://www.w3.org/TR/charmod/#C007">more</a></p>
<p class="advisement" id="char_sort_alternatives"><a class="self" href="#char_sort_alternatives">​</a>Software that allows users to sort or search text SHOULD allow the user to select alternative rules for collation units and ordering. <a href="https://www.w3.org/TR/charmod/#C066">more</a></p>
<p class="advisement" id="char_sort_anything"><a class="self" href="#char_sort_anything">​</a>Specifications and implementations of sorting and searching algorithms SHOULD accommodate text that contains any character in Unicode. <a href="https://www.w3.org/TR/charmod/#C008">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-CollationUnits">Units of collation</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="char_n11n" class="subtopic">
<h3>Converting to a Common Unicode Form</h3>
<p class="advisement" id="char_n11n_nfc"><a class="self" href="#char_n11n_nfc">​</a>Specifications of text-based formats and protocols MAY specify that all or part of the textual content of that format or protocol is normalized using Unicode Normalization Form C (NFC). <a href="https://www.w3.org/TR/charmod-norm/#h-convertingtocommonunicodeform">more</a></p>
<p class="advisement" id="char_n11n_security"><a class="self" href="#char_n11n_security">​</a>Specifications that do not normalize MUST document or provide a health-warning if canonically equivalent but disjoint Unicode character sequences represent a security issue. <a href="https://www.w3.org/TR/charmod-norm/#h-non-normalizing">more</a></p>
<p class="advisement" id="char_n11n_assumptions"><a class="self" href="#char_n11n_assumptions">​</a>Specifications and implementations MUST NOT assume that content is in any particular normalization form. <a href="https://www.w3.org/TR/charmod-norm/#h-non-normalizing">more</a></p>
<p class="advisement" id="char_n11n_comparison"><a class="self" href="#char_n11n_comparison">​</a>Specifications MUST specify that string matching takes the form of "code point-by-code point" comparison of the Unicode character sequence, or, if a specific Unicode character encoding is specified, code unit-by-code unit comparison of the sequences. <a href="https://www.w3.org/TR/charmod-norm/#h-non-normalizing">more</a></p>
<p class="advisement" id="char_n11n_regex"><a class="self" href="#char_n11n_regex">​</a>Specifications that define a regular expression syntax MUST provide at least Basic Unicode Level 1 support per <cite>Unicode Technical Standard #18: Unicode Regular Expressions</cite> and SHOULD provide Extended or Tailored (Levels 2 and 3) support. <a href="https://www.w3.org/TR/charmod-norm/#h-non-normalizing">more</a></p>
<p class="advisement" id="char_n11n_comparison2"><a class="self" href="#char_n11n_comparison2">​</a>Specifications of text-based formats and protocols that, as part of their syntax definition, require that the text be in normalized form MUST define string matching in terms of normalized string comparison and MUST define the normalized form to be NFC. <a href="https://www.w3.org/TR/charmod-norm/#h-normalizing-spec">more</a></p>
<p class="advisement" id="char_n11n_suspect"><a class="self" href="#char_n11n_suspect">​</a>A normalizing text-processing component which receives suspect text MUST NOT perform any normalization-sensitive operations unless it has first either confirmed through inspection that the text is in normalized form or it has re-normalized the text itself. Private agreements MAY, however, be created within private systems which are not subject to these rules, but any externally observable results MUST be the same as if the rules had been obeyed. <a href="https://www.w3.org/TR/charmod-norm/#h-normalizing-spec">more</a></p>
<p class="advisement" id="char_n11n_constructs"><a class="self" href="#char_n11n_constructs">​</a>Specifications of text-based languages and protocols SHOULD define precisely the construct boundaries necessary to obtain a complete definition of full-normalization. These definitions SHOULD include at least the boundaries between markup and character data as well as entity boundaries (if the language has any include mechanism), SHOULD include any other boundary that may create denormalization when instances of the language are processed, but SHOULD NOT include character escapes designed to express arbitrary characters. <a href="https://www.w3.org/TR/charmod-norm/#h-normalizing-spec">more</a></p>
<p class="advisement" id="char_n11n_implementation"><a class="self" href="#char_n11n_implementation">​</a>Where operations can produce denormalized output from normalized text input, specifications of API components (functions/methods) that implement these operations MUST define whether normalization is the responsibility of the caller or the callee. Specifications MAY state that performing normalization is optional for some API components; in this case the default SHOULD be that normalization is performed, and an explicit option SHOULD be used to switch normalization off. Specifications SHOULD NOT make the implementation of normalization optional. <a href="https://www.w3.org/TR/charmod-norm/#h-normalizing-spec">more</a></p>
<p class="advisement" id="char_n11n_mechanism"><a class="self" href="#char_n11n_mechanism">​</a>Specifications that define a mechanism (for example an API or a defining language) for producing textual data objects SHOULD require that the final output of this mechanism be normalized. <a href="https://www.w3.org/TR/charmod-norm/#h-normalizing-spec">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod-norm/#convertingToCommonUnicodeForm">Converting to a Common Unicode Form</a></p>
<p class="desc">In W3C Working Draft, <a href="https://www.w3.org/TR/charmod-norm/">Character Model for the World Wide Web: String Matching and Searching</a>.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="char_case" class="subtopic">
<h3>Handling Case Folding</h3>
<p class="advisement" id="char_case_sensitive"><a class="self" href="#char_case_sensitive">​</a>Case sensitive matching is RECOMMENDED as the default for new protocols and formats. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_unicodecf"><a class="self" href="#char_case_unicodecf">​</a>Because the "simple" case-fold mapping removes information that can be important to forming an identity match, the "Common plus Full" (or "Unicode C+F") case fold mapping is RECOMMENDED for Unicode case-insensitive matching. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_asciici"><a class="self" href="#char_case_asciici">​</a>ASCII case-insensitive matching MUST only be applied to vocabularies that are restricted to ASCII. Unicode case-insensitivity MUST be used for all other vocabularies. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_asciicinot"><a class="self" href="#char_case_asciicinot">​</a>If the vocabulary is not restricted to ASCII or permits user-defined values that use a broader range of Unicode, ASCII case-insensitive matching MUST NOT be required. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_vocabularies"><a class="self" href="#char_case_vocabularies">​</a>The Unicode C+F case-fold form is RECOMMENDED as the case-insensitive matching for vocabularies. The Unicode C+S form MUST NOT be used for string identity matching on the Web. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_options"><a class="self" href="#char_case_options">​</a>Specifications and implementations that define string matching as part of the definition of a format, protocol, or formal language (which might include operations such as parsing, matching, tokenizing, etc.) MUST define the criteria and matching forms used. These MUST be one of: (a) Case-sensitive (b) Unicode case-insensitive using Unicode case-folding C+F (c) ASCII case-insensitive. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_noci"><a class="self" href="#char_case_noci">​</a>Specifications SHOULD NOT specify case-insensitive comparison of strings. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_unicodecf2"><a class="self" href="#char_case_unicodecf2">​</a>Specifications that specify case-insensitive comparison for non-ASCII vocabularies SHOULD specify Unicode case-folding C+F. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_asciionly"><a class="self" href="#char_case_asciionly">​</a>Specifications MAY specify ASCII case-insensitive comparison for portions of a format or protocol that are restricted to an ASCII-only vocabulary. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<p class="advisement" id="char_case_nonascii"><a class="self" href="#char_case_nonascii">​</a>Specifications and implementations MUST NOT specify ASCII-only case-insensitive matching for values or constructs that permit non-ASCII characters. <a href="https://www.w3.org/TR/charmod-norm/#h-handlingcasefolding">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod-norm/#handlingCaseFolding">Handling Case Folding</a></p>
<p class="desc">In W3C Working Draft, <a href="https://www.w3.org/TR/charmod-norm/">Character Model for the World Wide Web: String Matching and Searching</a>.</p>
</li>
</ul>
</section>
</section>
</section>
<section id="char_string" class="subtopic">
<h3>Defining 'string'</h3>
<p class="advisement" id="char_string_byte"><a class="self" href="#char_string_byte">​</a>Specifications SHOULD NOT define a string as a 'byte string'. <a href="https://www.w3.org/TR/charmod/#C011">more</a></p>
<p class="advisement" id="char_string_char"><a class="self" href="#char_string_char">​</a>The 'character string' definition SHOULD be used by most specifications. <a href="https://www.w3.org/TR/charmod/#C012">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Strings">String concepts</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref">
<h4>See also</h4>
<p><a href="#char_indexing">Indexing strings</a> and <a href="#char_def">Choosing a definition of 'character'</a>.</p>
</section>
</section>
<section id="char_indexing" class="subtopic">
<h3>Indexing strings</h3>
<p class="advisement" id="char_index_char"><a class="self" href="#char_index_char">​</a>The character string is RECOMMENDED as a basis for string indexing. <a href="https://www.w3.org/TR/charmod/#C051">more</a></p>
<p class="advisement" id="char_index_codeunit"><a class="self" href="#char_index_codeunit">​</a>A code unit string MAY be used as a basis for string indexing if this results in a significant improvement in the efficiency of internal operations when compared to the use of character string. <a href="https://www.w3.org/TR/charmod/#C052">more</a></p>
<p class="advisement" id="char_index_grapheme"><a class="self" href="#char_index_grapheme">​</a>Grapheme clusters MAY be used as a basis for string indexing in applications where user interaction is the primary concern. <a href="https://www.w3.org/TR/charmod/#C071">more</a></p>
<p class="advisement" id="char_index_grapheme_plus"><a class="self" href="#char_index_grapheme_plus">​</a>Specifications that define indexing in terms of grapheme clusters MUST either: (a) define grapheme clusters in terms of default grapheme clusters as defined in Unicode Standard Annex #29, Text Boundaries [UAX #29], or (b) define specifically how tailoring is applied to the indexing operation. <a href="https://www.w3.org/TR/charmod/#C074">more</a></p>
<p>Need to check the above recommendation, since extended grapheme clusters are now recommended.</p>
<p class="advisement" id="char_index_byte"><a class="self" href="#char_index_byte">​</a>The use of byte strings for indexing is NOT RECOMMENDED. <a href="https://www.w3.org/TR/charmod/#C072">more</a></p>
<p class="advisement" id="char_index_substrings"><a class="self" href="#char_index_substrings">​</a>Specifications that need a way to identify substrings or point within a string SHOULD provide ways other than string indexing to perform this operation. <a href="https://www.w3.org/TR/charmod/#C053">more</a></p>
<p class="advisement" id="char_index_counting"><a class="self" href="#char_index_counting">​</a>Specifications SHOULD understand and process single characters as substrings, and treat indices as boundary positions between counting units, regardless of the choice of counting units. <a href="https://www.w3.org/TR/charmod/#C055">more</a></p>
<p class="advisement" id="char_index_api"><a class="self" href="#char_index_api">​</a>Specifications of APIs SHOULD NOT specify single characters or single 'units of encoding' as argument or return types. <a href="https://www.w3.org/TR/charmod/#C056">more</a></p>
<p class="advisement" id="char_index_0"><a class="self" href="#char_index_0">​</a>When the positions between the units are counted for string indexing, starting with an index of 0 for the position at the start of the string is the RECOMMENDED solution, with the last index then being equal to the number of counting units in the string. <a href="https://www.w3.org/TR/charmod/#C057">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-stringIndexing">String indexing</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
<section class="xref">
<h4>See also</h4>
<p><a href="#char_string">Defining 'string'</a>.</p>
</section>
</section>
<section id="char_ref_Unicode_char" class="subtopic">
<h3>Referring to Unicode characters</h3>
<p class="advisement" id="char_ref_Uchar"><a class="self" href="#char_ref_Uchar">​</a>Use U+XXXX syntax to represent Unicode code points in the specification. <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#char_ref_Uchar">more</a></p>
<p>The U+XXXX format is well understood when referring to Unicode code points in a specification. These are space-separated when appearing in a sequence. No additional decoration is needed. Note that a code point may contain four, five, or six hexadecimal digits. When fewer than four digits are needed, the code point number is zero-filled, e.g. U+0020.</p>
</section>
<section id="char_unicoderef" class="subtopic">
<h3>Referencing the Unicode Standard</h3>
<p class="advisement" id="char_unicoderef_do"><a class="self" href="#char_unicoderef_do">​</a>Since specifications in general need both a definition for their characters and the semantics associated with these characters, specifications SHOULD include a reference to the Unicode Standard, whether or not they include a reference to ISO/IEC 10646. <a href="https://www.w3.org/TR/charmod/#C062">more</a></p>
<p class="advisement" id="char_unicoderef_generic"><a class="self" href="#char_unicoderef_generic">​</a>A generic reference to the Unicode Standard MUST be made if it is desired that characters allocated after a specification is published are usable with that specification. A specific reference to the Unicode Standard MAY be included to ensure that functionality depending on a particular version is available and will not change over time. <a href="https://www.w3.org/TR/charmod/#C063">more</a></p>
<p class="advisement" id="char_unicoderef_latest"><a class="self" href="#char_unicoderef_latest">​</a>All generic references to the Unicode Standard MUST refer to the latest version of the Unicode Standard available at the date of publication of the containing specification. <a href="https://www.w3.org/TR/charmod/#C064">more</a></p>
<p class="advisement" id="char_unicoderef_10646"><a class="self" href="#char_unicoderef_10646">​</a>All generic references to ISO/IEC 10646 MUST refer to the latest version of ISO/IEC 10646 available at the date of publication of the containing specification. <a href="https://www.w3.org/TR/charmod/#C065">more</a></p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/TR/charmod/#sec-RefUnicode">Referencing the Unicode Standard and ISO/IEC 10646</a></p>
<p class="desc">In W3C Recommendation, <a href="https://www.w3.org/TR/charmod/">Character Model for the World Wide Web</a>.</p>
</li>
</ul>
</section>
</section>
</section>
</section>
<section id="sec_resid_non_ascii" class="topic">
<h2>Resource identifiers</h2>
<section id="sec_resid_basic" class="subtopic">
<h3>Basics</h3>
<p class="advisement" id="resid_use_iris"><a class="self" href="#resid_use_iris">​</a>Resource identifiers must permit the use of characters outside those of plain ASCII. <a href="https://github.com/w3c/web-annotation/issues/241">discussion</a></p>
<p class="advisement" id="resid_iri_conversion"><a class="self" href="#resid_iri_conversion">​</a> Specifications MUST define when the conversion from IRI references to URI references (or subsets thereof) takes place, in accordance with Internationalized Resource Identifiers (IRIs). <a class="local" href="https://w3c.github.io/bp-i18n-specdev/#resid_iri_conversion">more</a></p>
<p>Many current specifications already contain provisions in accordance with Internationalized Resource Identifiers (IRIs). For XML 1.0, see Section 4.2.2, External Entities. XML Schema Part 2: Datatypes provides the anyURI datatype (see Section 3.2.17). The XML Linking Language (XLink) provides the href attribute (see Section 5.4, Locator Attribute).</p>
<p>Document formats should allow IRIs to be used; handlers for protocols that do not currently support IRIs can convert the IRI to a URI when the IRI is dereferenced.</p>
<section class="links">
<h4>Links</h4>
<section class="howto">
<h5>How to's</h5>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/iri-edit/draft-duerst-iri-08.txt">Internationalized Resource Identifiers (IRIs)</a></p>
</li>
</ul>
</section>
</section>
</section>
</section>
<section id="markup" class="topic">
<h2>Markup &amp; syntax</h2>