-
Notifications
You must be signed in to change notification settings - Fork 10.4k
/
Copy pathLexer.h
542 lines (453 loc) · 19.7 KB
/
Lexer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
//===--- Lexer.h - Swift Language Lexer -------------------------*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// This file defines the Lexer interface.
//
//===----------------------------------------------------------------------===//
#ifndef SWIFT_LEXER_H
#define SWIFT_LEXER_H
#include "swift/AST/DiagnosticEngine.h"
#include "swift/Basic/SourceLoc.h"
#include "swift/Basic/SourceManager.h"
#include "swift/Parse/LexerState.h"
#include "swift/Parse/Token.h"
#include "swift/Syntax/References.h"
#include "swift/Syntax/Trivia.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/SaveAndRestore.h"
namespace swift {
/// Given a pointer to the starting byte of a UTF8 character, validate it and
/// advance the lexer past it. This returns the encoded character or ~0U if
/// the encoding is invalid.
///
/// \param Ptr cursor positioned at the first byte of the character. It is
///        taken by reference so that it can be advanced.
/// \param End NOTE(review): presumably one past the last byte that may be
///        read; the definition is not in this header -- confirm there.
uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr, const char *End);
// Forward declarations of types named by the lexer interface below.
class DiagnosticEngine;
class InFlightDiagnostic;
class LangOptions;
template<typename ...T> struct Diag;
/// Selects how the lexer treats comments. A Lexer stores the chosen mode in
/// its RetainComments member.
enum class CommentRetentionMode {
/// NOTE(review): presumably comments are simply skipped -- confirm in the
/// lexer implementation.
None,
/// NOTE(review): presumably a comment is associated with the token that
/// follows it (Lexer::getStateForBeginningOfToken rewinds to
/// Token::getCommentStart() when it is valid) -- confirm in the lexer
/// implementation.
AttachToNextToken,
/// Lexer::isKeepingComments() returns true only in this mode.
ReturnAsTokens,
};
/// Selects whether the lexer preserves trivia. A Lexer only stores leading
/// trivia into a saved state (Lexer::getStateForBeginningOfToken) and only
/// reinstates it (Lexer::restoreState) when constructed with WithTrivia.
enum class TriviaRetentionMode {
WithoutTrivia,
WithTrivia,
};
/// Kinds of conflict marker which the lexer might encounter.
///
/// NOTE(review): this enum is not referenced anywhere else in this header;
/// presumably it is consumed by the implementation behind
/// Lexer::tryLexConflictMarker() -- confirm there.
enum class ConflictMarkerKind {
/// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
/// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
Normal,
/// A Perforce-style conflict marker, initiated by 4 ">"s,
/// separated by 4 "="s, and terminated by 4 "<"s.
Perforce
};
class Lexer {
/// Language options under which this lexer operates.
const LangOptions &LangOpts;
/// Source manager used to translate source locations into buffer offsets
/// (see getBufferPtrForSourceLoc()).
const SourceManager &SourceMgr;
/// Diagnostic engine. restoreState() temporarily swaps this for null while
/// it re-lexes, unless diagnostics are explicitly enabled for that call.
DiagnosticEngine *Diags;
/// ID of the buffer associated with this lexer; see getBufferID().
const unsigned BufferID;
/// Saved-position type used by the state save/restore APIs.
using State = LexerState;
/// Pointer to the first character of the buffer, even in a lexer that
/// scans a subrange of the buffer.
const char *BufferStart;
/// Pointer to one past the end character of the buffer, even in a lexer
/// that scans a subrange of the buffer. Because the buffer is always
/// NUL-terminated, this points to the NUL terminator.
const char *BufferEnd;
/// Pointer to the artificial EOF that is located before BufferEnd. Useful
/// for lexing subranges of a buffer.
const char *ArtificialEOF = nullptr;
/// If non-null, points to the '\0' character in the buffer where we should
/// produce a code completion token.
const char *CodeCompletionPtr = nullptr;
/// Points to BufferStart or past the end of UTF-8 BOM sequence if it exists.
const char *ContentStart;
/// Pointer to the next not consumed character.
const char *CurPtr;
/// @{
/// Members that are *not* permanent lexer state. The values only make sense
/// during the lexImpl() invocation. These variables are declared as members
/// rather than locals so that we don't have to thread them through to all
/// lexing helpers.

/// Points to the point in the source buffer where we started scanning for
/// the current token. Thus, the range [LastCommentBlockStart, CurPtr)
/// covers all comments and whitespace that we skipped, and the token itself.
const char *LastCommentBlockStart = nullptr;
/// True if we have seen a comment while scanning for the current token.
bool SeenComment = false;
/// @}
/// The look-ahead token: lex() hands it out and peekNextToken() exposes it.
Token NextToken;
/// \brief This is true if we're lexing a .sil file instead of a .swift
/// file. This enables the 'sil' keyword.
const bool InSILMode;
/// How comments are treated; see isKeepingComments().
const CommentRetentionMode RetainComments;
/// Whether trivia is preserved; consulted when lexer state is saved and
/// restored.
const TriviaRetentionMode TriviaRetention;
/// InSILBody - This is true when we're lexing the body of a SIL declaration
/// in a SIL file. This enables some context-sensitive lexing.
bool InSILBody = false;
/// The current leading trivia for the next token.
///
/// This is only preserved if this Lexer was constructed with
/// `TriviaRetentionMode::WithTrivia`.
syntax::Trivia LeadingTrivia;
/// The current trailing trivia for the next token.
///
/// This is only preserved if this Lexer was constructed with
/// `TriviaRetentionMode::WithTrivia`.
syntax::Trivia TrailingTrivia;
// A Lexer can be neither copy-constructed nor copy-assigned.
Lexer(const Lexer&) = delete;
void operator=(const Lexer&) = delete;
/// The principal constructor used by public constructors below.
/// Don't use this constructor for other purposes, it does not initialize
/// everything.
///
/// Mind the parameter order: here \p Diags comes before \p BufferID, whereas
/// the public constructors take \p BufferID before \p Diags.
Lexer(const LangOptions &Options,
const SourceManager &SourceMgr, DiagnosticEngine *Diags,
unsigned BufferID, bool InSILMode,
CommentRetentionMode RetainComments,
TriviaRetentionMode TriviaRetention);
/// @{
/// Helper routines used in \c Lexer constructors.

/// Called by the whole-buffer constructor once the principal constructor has
/// run.
void primeLexer();
/// Called by the constructors that scan a subrange, with the states that
/// delimit that subrange.
void initSubLexer(Lexer &Parent, State BeginState, State EndState);
/// @}
public:
/// \brief Create a normal lexer that scans the whole source buffer.
///
/// Delegates to the private principal constructor and then calls
/// primeLexer().
///
/// \param Options - the language options under which to lex. By
/// design, language options only affect whether a token is valid
/// and/or the exact token kind produced (e.g. keyword or
/// identifier), but not things like how many characters are
/// consumed. If that changes, APIs like getLocForEndOfToken will
/// need to take a LangOptions explicitly.
/// \param InSILMode - whether we're parsing a SIL source file.
/// Unlike language options, this does affect primitive lexing, which
/// means that APIs like getLocForEndOfToken really ought to take
/// this flag; it's just that we don't care that much about fidelity
/// when parsing SIL files.
/// \param RetainComments - how comments are treated; defaults to
/// CommentRetentionMode::None.
/// \param TriviaRetention - whether trivia is preserved; defaults to
/// TriviaRetentionMode::WithoutTrivia.
Lexer(const LangOptions &Options,
const SourceManager &SourceMgr, unsigned BufferID,
DiagnosticEngine *Diags, bool InSILMode,
CommentRetentionMode RetainComments = CommentRetentionMode::None,
TriviaRetentionMode TriviaRetention = TriviaRetentionMode::WithoutTrivia)
: Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments,
TriviaRetention) {
primeLexer();
}
/// \brief Create a lexer that scans a subrange of the source buffer.
Lexer(const LangOptions &Options,
const SourceManager &SourceMgr, unsigned BufferID,
DiagnosticEngine *Diags, bool InSILMode,
CommentRetentionMode RetainComments,
TriviaRetentionMode TriviaRetention,
unsigned Offset, unsigned EndOffset)
: Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments,
TriviaRetention) {
assert(Offset <= EndOffset && "invalid range");
initSubLexer(
*this,
State(getLocForStartOfBuffer().getAdvancedLoc(Offset)),
State(getLocForStartOfBuffer().getAdvancedLoc(EndOffset)));
}
/// \brief Create a sub-lexer that lexes from the same buffer, but scans
/// a subrange of the buffer.
///
/// The sub-lexer takes over the parent's language options, source manager,
/// diagnostic engine, buffer ID, SIL mode and comment/trivia retention
/// modes, and then calls initSubLexer() with the given states.
///
/// \param Parent the parent lexer that scans the whole buffer
/// \param BeginState start of the subrange
/// \param EndState end of the subrange
Lexer(Lexer &Parent, State BeginState, State EndState)
: Lexer(Parent.LangOpts, Parent.SourceMgr, Parent.Diags, Parent.BufferID,
Parent.InSILMode, Parent.RetainComments,
Parent.TriviaRetention) {
initSubLexer(Parent, BeginState, EndState);
}
/// \brief Returns true if this lexer will produce a code completion token,
/// i.e. if CodeCompletionPtr has been set to a non-null position.
bool isCodeCompletion() const {
return CodeCompletionPtr != nullptr;
}
/// Hand out the current look-ahead token together with the lexer's current
/// leading and trailing trivia, then lex the token that follows -- unless
/// the token handed out is EOF, in which case the lexer does not advance.
///
/// \param Result receives a copy of the look-ahead token.
/// \param LeadingTriviaResult receives a copy of the current leading trivia.
/// \param TrailingTriviaResult receives a copy of the current trailing
///        trivia.
///
/// The lexer's trivia is only preserved when it was constructed with
/// \c TriviaRetentionMode::WithTrivia.
void lex(Token &Result, syntax::Trivia &LeadingTriviaResult,
         syntax::Trivia &TrailingTriviaResult) {
  Result = NextToken;
  LeadingTriviaResult = {LeadingTrivia};
  TrailingTriviaResult = {TrailingTrivia};

  // Only look further ahead while the end of input has not been reached.
  const bool HasMoreTokens = Result.isNot(tok::eof);
  if (HasMoreTokens) {
    lexImpl();
  }
}
/// Lex a token and throw away the trivia that accompanies it.
void lex(Token &Result) {
  // Scratch outputs, named so that they do not shadow the trivia members.
  syntax::Trivia DiscardedLeading, DiscardedTrailing;
  lex(Result, DiscardedLeading, DiscardedTrailing);
}
/// Returns true if this lexer's comment retention mode is
/// CommentRetentionMode::ReturnAsTokens.
bool isKeepingComments() const {
return RetainComments == CommentRetentionMode::ReturnAsTokens;
}
/// Returns the ID of the buffer associated with this lexer.
unsigned getBufferID() const { return BufferID; }
/// peekNextToken - Return the next token to be returned by lex() without
/// actually lexing it. The reference refers to the lexer's look-ahead
/// member, so its contents change on the next call that advances the lexer.
const Token &peekNextToken() const { return NextToken; }
/// \brief Returns the lexer state for the beginning of the given token
/// location. After restoring the state, lexer will return this token and
/// continue from there.
///
/// \param Loc location of the beginning of a token.
State getStateForBeginningOfTokenLoc(SourceLoc Loc) const;
/// \brief Returns the lexer state for the beginning of the given token.
/// After restoring the state, lexer will return this token and continue from
/// there.
///
/// \param Tok the token to rewind to.
/// \param LeadingTrivia leading trivia to record in the returned state. It
///        is only recorded when this lexer was constructed with
///        \c TriviaRetentionMode::WithTrivia.
State getStateForBeginningOfToken(const Token &Tok,
                                  const syntax::Trivia &LeadingTrivia = {}) const {
  // A token that carries a comment is rewound to the start of that comment
  // rather than to the token itself, so that re-lexing from the saved state
  // attaches the comment to the token again.
  const SourceLoc CommentStart = Tok.getCommentStart();
  const SourceLoc RewindLoc =
      CommentStart.isInvalid() ? Tok.getLoc() : CommentStart;

  State Saved = getStateForBeginningOfTokenLoc(RewindLoc);
  if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
    Saved.LeadingTrivia = LeadingTrivia;
  }
  return Saved;
}
/// Returns the lexer state positioned at the location that
/// getLocForEndOfToken() reports for \p Loc, i.e. just past the end of the
/// token that begins at \p Loc.
State getStateForEndOfTokenLoc(SourceLoc Loc) const {
return State(getLocForEndOfToken(SourceMgr, Loc));
}
/// Returns true if the buffer that contains the location of \p State is the
/// buffer associated with this lexer.
bool isStateForCurrentBuffer(LexerState State) const {
return SourceMgr.findBufferContainingLoc(State.Loc) == getBufferID();
}
/// \brief Restore the lexer state to a given one, that can be located either
/// before or after the current position.
///
/// \param S the state to restore; it must be valid (asserted).
/// \param enableDiagnostics when false (the default), the diagnostic engine
///        is replaced by null for the duration of the re-lex so that
///        diagnostics are not emitted a second time.
void restoreState(State S, bool enableDiagnostics = false) {
  assert(S.isValid());
  CurPtr = getBufferPtrForSourceLoc(S.Loc);
  // Don't reemit diagnostics while readvancing the lexer.
  llvm::SaveAndRestore<DiagnosticEngine*>
      D(Diags, enableDiagnostics ? Diags : nullptr);
  lexImpl();
  // Restore Trivia. This happens after lexImpl() so that the saved leading
  // trivia, when the state carries one, is what the next lex() hands out.
  if (TriviaRetention == TriviaRetentionMode::WithTrivia)
    if (auto &LTrivia = S.LeadingTrivia)
      LeadingTrivia = std::move(*LTrivia);
}
/// \brief Restore the lexer state to a given state that is located before
/// current position.
///
/// Asserts that \p S does not lie after the current position and then
/// defers to restoreState() with its default arguments, so diagnostics stay
/// disabled while re-lexing.
void backtrackToState(State S) {
assert(getBufferPtrForSourceLoc(S.Loc) <= CurPtr &&
"can't backtrack forward");
restoreState(S);
}
/// \brief Retrieve the Token referred to by \c Loc.
///
/// \param SM The source manager in which the given source location
/// resides.
///
/// \param Loc The source location of the beginning of a token.
///
/// NOTE(review): what is returned when \p Loc does not point at the start of
/// a token is not specified here -- confirm against the definition.
static Token getTokenAtLocation(const SourceManager &SM, SourceLoc Loc);
/// \brief Retrieve the source location that points just past the
/// end of the token referred to by \c Loc.
///
/// Used in this header by getStateForEndOfTokenLoc() and
/// getCharSourceRangeFromSourceRange().
///
/// \param SM The source manager in which the given source location
/// resides.
///
/// \param Loc The source location of the beginning of a token.
static SourceLoc getLocForEndOfToken(const SourceManager &SM, SourceLoc Loc);
/// \brief Convert a SourceRange to the equivalent CharSourceRange.
///
/// The resulting range starts at \c SR.Start and ends at the location that
/// getLocForEndOfToken() reports for \c SR.End.
///
/// \param SM The source manager in which the given source range
/// resides.
///
/// \param SR The source range
static CharSourceRange
getCharSourceRangeFromSourceRange(const SourceManager &SM,
                                  const SourceRange &SR) {
  // SR.End is handed to getLocForEndOfToken(), which treats it as the
  // beginning of a token and yields the location just past that token.
  const SourceLoc CharRangeEnd = getLocForEndOfToken(SM, SR.End);
  return CharSourceRange(SM, SR.Start, CharRangeEnd);
}
/// Return the start location of the token that the offset in the given buffer
/// points to.
///
/// Note that this is more expensive than \c getLocForEndOfToken because it
/// finds and re-lexes from the beginning of the line.
///
/// Due to the parser splitting tokens the adjustment may be incorrect, e.g:
/// \code
/// func +<T>(a : T, b : T)
/// \endcode
/// The start of the '<' token is '<', but the lexer will produce "+<" before
/// the parser splits it up.
///
/// If the offset points to whitespace the returned source location will point
/// to the whitespace offset.
static SourceLoc getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
unsigned Offset);
/// Overload that takes a source location in place of a buffer ID and an
/// offset.
static SourceLoc getLocForStartOfToken(SourceManager &SM, SourceLoc Loc);
/// Retrieve the start location of the line containing the given location.
static SourceLoc getLocForStartOfLine(SourceManager &SM, SourceLoc Loc);
/// Retrieve the source location for the end of the line containing the
/// given token, which is the location of the start of the next line.
static SourceLoc getLocForEndOfLine(SourceManager &SM, SourceLoc Loc);
/// Retrieve the string used to indent the line that contains the given
/// source location.
static StringRef getIndentationForLine(SourceManager &SM, SourceLoc Loc);
/// \brief Determines if the given string is a valid non-operator
/// identifier, without escaping characters.
static bool isIdentifier(StringRef identifier);
/// \brief Determine the token kind of the string, given that it is a valid
/// non-operator identifier. Return tok::identifier if the string is not a
/// reserved word.
///
/// \param InSILMode whether to classify as for a .sil file; per the
/// documentation of the InSILMode member, SIL mode enables the 'sil' keyword.
static tok kindOfIdentifier(StringRef Str, bool InSILMode);
/// \brief Determines if the given string is a valid operator identifier,
/// without escaping characters.
static bool isOperator(StringRef string);
/// Returns the source location of BufferStart, the first character of the
/// buffer -- also for a lexer that scans only a subrange of the buffer.
SourceLoc getLocForStartOfBuffer() const {
return SourceLoc(llvm::SMLoc::getFromPointer(BufferStart));
}
/// StringSegment - A segment of a (potentially interpolated) string.
struct StringSegment {
  /// Whether the segment is literal text or an interpolated expression.
  enum : char { Literal, Expr } Kind;
  // Loc+Length for the segment inside the string literal, without quotes.
  SourceLoc Loc;
  /// Length is the size of the segment as counted from Loc. IndentToStrip is
  /// forwarded to getEncodedStringSegment().
  /// NOTE(review): IndentToStrip is presumably the amount of leading
  /// indentation to drop from each line of a multi-line literal -- confirm
  /// against the implementation.
  unsigned Length, IndentToStrip;
  /// Position flags, forwarded to getEncodedStringSegment().
  bool IsFirstSegment, IsLastSegment;

  /// Build a literal-text segment; every field is set from the arguments.
  static StringSegment getLiteral(SourceLoc Loc, unsigned Length,
                                  bool IsFirstSegment, bool IsLastSegment,
                                  unsigned IndentToStrip) {
    StringSegment Result;
    Result.Kind = Literal;
    Result.Loc = Loc;
    Result.Length = Length;
    Result.IsFirstSegment = IsFirstSegment;
    Result.IsLastSegment = IsLastSegment;
    Result.IndentToStrip = IndentToStrip;
    return Result;
  }

  /// Build an interpolated-expression segment. The position flags are
  /// cleared and no indentation is stripped.
  static StringSegment getExpr(SourceLoc Loc, unsigned Length) {
    StringSegment Result;
    Result.Kind = Expr;
    Result.Loc = Loc;
    Result.Length = Length;
    Result.IsFirstSegment = false;
    Result.IsLastSegment = false;
    Result.IndentToStrip = 0;
    return Result;
  }

  /// Returns the location \c Length past \c Loc, i.e. just past the segment.
  /// Const-qualified because it only reads the segment, so it can be called
  /// on const segments as well.
  SourceLoc getEndLoc() const {
    return Loc.getAdvancedLoc(Length);
  }
};
/// \brief Compute the bytes that the actual string literal should codegen to.
/// If a copy needs to be made, it will be allocated out of the provided
/// Buffer.
///
/// \param Str the text of the segment.
/// \param Buffer storage that backs the result whenever a copy has to be
/// made.
/// \param IsFirstSegment, IsLastSegment, IndentToStrip the corresponding
/// StringSegment fields; the member overload below forwards them unchanged.
static StringRef getEncodedStringSegment(StringRef Str,
SmallVectorImpl<char> &Buffer,
bool IsFirstSegment = false,
bool IsLastSegment = false,
unsigned IndentToStrip = 0);
/// Convenience overload for a segment of this lexer's buffer: looks up the
/// segment's text in the buffer and defers to the static overload, passing
/// the segment's position flags and indentation along unchanged.
StringRef getEncodedStringSegment(StringSegment Segment,
                                  SmallVectorImpl<char> &Buffer) const {
  // View the segment's raw text where it sits in the source buffer.
  const char *SegmentStart = getBufferPtrForSourceLoc(Segment.Loc);
  StringRef RawText(SegmentStart, Segment.Length);
  return getEncodedStringSegment(RawText, Buffer, Segment.IsFirstSegment,
                                 Segment.IsLastSegment, Segment.IndentToStrip);
}
/// \brief Given a string literal token, separate it into string/expr segments
/// of a potentially interpolated string.
///
/// \param Str the string literal token to split.
/// \param Segments receives the resulting segments.
/// \param Diags diagnostic engine to use; the member overload below passes
/// the lexer's own.
static void getStringLiteralSegments(
const Token &Str,
SmallVectorImpl<StringSegment> &Segments,
DiagnosticEngine *Diags);
/// Convenience overload that splits \p Str into \p Segments using this
/// lexer's own diagnostic engine.
void getStringLiteralSegments(const Token &Str,
                              SmallVectorImpl<StringSegment> &Segments) {
  getStringLiteralSegments(Str, Segments, Diags);
}
/// Wrap a raw character pointer in a SourceLoc.
/// NOTE(review): presumably \p Loc has to point into a buffer known to the
/// source manager for the result to be meaningful -- confirm with callers.
static SourceLoc getSourceLoc(const char *Loc) {
return SourceLoc(llvm::SMLoc::getFromPointer(Loc));
}
/// Get the token that starts at the given location.
///
/// NOTE(review): unlike the static getTokenAtLocation(), this is a non-const
/// member, so it presumably works on this lexer's own buffer and settings --
/// confirm against the definition.
Token getTokenAt(SourceLoc Loc);
/// SILBodyRAII - Scope guard used while a SIL body is parsed: construction
/// switches the lexer's InSILBody flag on (enabling SIL-specific lexing) and
/// destruction switches it off again. Guards must not be nested on the same
/// lexer; both transitions are asserted.
struct SILBodyRAII {
  /// The lexer whose InSILBody flag this guard controls.
  Lexer &L;

  SILBodyRAII(Lexer &TheLexer) : L(TheLexer) {
    assert(!L.InSILBody && "Already in a sil body?");
    L.InSILBody = true;
  }

  ~SILBodyRAII() {
    assert(L.InSILBody && "Left sil body already?");
    L.InSILBody = false;
  }

  // A guard is tied to one scope, so it cannot be copied.
  SILBodyRAII(const SILBodyRAII&) = delete;
  void operator=(const SILBodyRAII&) = delete;
};
private:
/// For a source location in the current buffer, returns the corresponding
/// pointer. The pointer is BufferStart plus the offset that the source
/// manager reports for \p Loc within this lexer's buffer.
const char *getBufferPtrForSourceLoc(SourceLoc Loc) const {
return BufferStart + SourceMgr.getLocOffsetInBuffer(Loc, BufferID);
}
/// Returns the text that begins at \p Start and is at most \p Length long;
/// the result is cut short where it would otherwise run past BufferEnd.
///
/// \param Start must lie within [BufferStart, BufferEnd] (asserted).
/// \param Length requested length.
StringRef getSubstring(const char *Start, unsigned Length) const {
  assert(Start >= BufferStart && Start <= BufferEnd);
  // Never hand out more than what is left between Start and BufferEnd.
  const unsigned Available = BufferEnd - Start;
  const unsigned ClampedLength = Length > Available ? Available : Length;
  return StringRef(Start, ClampedLength);
}
/// Core lexing routine, defined out of line. lex() calls it after handing
/// out a non-EOF token and restoreState() calls it after repositioning
/// CurPtr.
/// NOTE(review): presumably it refreshes NextToken and the trivia members --
/// confirm in the implementation.
void lexImpl();
/// NOTE(review): presumably reports \p Diag at the buffer position \p Loc;
/// defined out of line -- confirm in the implementation.
InFlightDiagnostic diagnose(const char *Loc, Diagnostic Diag);
/// Convenience overload: builds a Diagnostic from \p DiagID and the
/// forwarded \p Args and passes it to the overload above.
template<typename ...DiagArgTypes, typename ...ArgTypes>
InFlightDiagnostic diagnose(const char *Loc, Diag<DiagArgTypes...> DiagID,
ArgTypes &&...Args) {
return diagnose(Loc, Diagnostic(DiagID, std::forward<ArgTypes>(Args)...));
}
// Token-forming and scanning helpers. All of them are only declared here;
// their definitions are not part of this header.
void formToken(tok Kind, const char *TokStart, bool MultilineString = false);
void formEscapedIdentifierToken(const char *TokStart);
/// Advance to the end of the line.
/// If EatNewline is true, CurPtr will be at end of newline character.
/// Otherwise, CurPtr will be at newline character.
void skipToEndOfLine(bool EatNewline);
/// Skip to the end of the line of a // comment.
void skipSlashSlashComment(bool EatNewline);
/// Skip a #! hashbang line.
void skipHashbang(bool EatNewline);
void skipSlashStarComment();
void lexHash();
void lexIdentifier();
void lexDollarIdent();
void lexOperatorIdentifier();
void lexHexNumber();
void lexNumber();
void lexTrivia(syntax::Trivia &T, bool IsForTrailingTrivia);
// NOTE(review): the second parameter is a Lexer * although it is named
// Diags; the name is misleading.
static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags);
unsigned lexCharacter(const char *&CurPtr,
char StopQuote, bool EmitDiagnostics,
bool MultilineString = false);
void lexStringLiteral();
void lexEscapedIdentifier();
void tryLexEditorPlaceholder();
const char *findEndOfCurlyQuoteStringLiteral(const char*);
/// Try to lex conflict markers by checking for the presence of the start and
/// end of the marker in diff3 or Perforce style respectively.
bool tryLexConflictMarker(bool EatNewline);
};
/// Binary-search an ordered sequence of tokens by source location.
///
/// \param Array the tokens, ordered by location.
/// \param Loc the location to search for.
/// \returns an iterator to the first token in \p Array that is not before
///          \p Loc, or the end iterator if every token is before it.
template<typename ArrayTy, typename Iterator = typename ArrayTy::iterator>
Iterator token_lower_bound(ArrayTy &Array, SourceLoc Loc) {
  // Tokens are ordered by the opaque pointer value of their location.
  auto StartsBefore = [](const Token &Tok, SourceLoc Target) {
    return Tok.getLoc().getOpaquePointerValue() <
           Target.getOpaquePointerValue();
  };
  return std::lower_bound(Array.begin(), Array.end(), Loc, StartsBefore);
}
/// Given an ordered token array \p AllTokens , get the slice of the array
/// where front() locates at \p StartLoc and back() locates at \p EndLoc .
///
/// NOTE(review): what happens when no token sits exactly at \p StartLoc or
/// \p EndLoc is not specified here -- confirm against the definition.
ArrayRef<Token> slice_token_array(ArrayRef<Token> AllTokens, SourceLoc StartLoc,
SourceLoc EndLoc);
} // end namespace swift
#endif