In [None]:
class LaxicalAnalyzer:
    """Hand-written lexical analyzer for a small subset of C.

    The analyzer strips ``//`` and ``/* */`` comments, splits the remaining
    text into ``(token_type, value)`` tuples and records every distinct
    identifier, in order of first appearance, in ``symbol_table``.

    Only integer constants and double-quoted string literals are recognized
    as literals; any character that fits no category (for example the single
    quote of a character literal) is reported as an error token.
    """

    def __init__(self):
        # Reserved words of C; a word found here is a keyword, not an identifier.
        self.keywords = [
            'auto', 'break', 'case', 'char', 'const', 'continue', 'default',
            'do', 'double', 'else', 'enum', 'extern', 'float', 'for', 'goto',
            'if', 'int', 'long', 'register', 'return', 'short', 'signed',
            'sizeof', 'static', 'struct', 'switch', 'typedef', 'union',
            'unsigned', 'void', 'volatile', 'while',
        ]
        # Single-character punctuation tokens.
        self.punctuations = [';', ',', '(', ')', '{', '}', '[', ']', '#']
        # Characters that may form an operator; adjacent ones are merged.
        self.operators = ['+', '-', '*', '/', '%', '=', '<', '>', '!', '&', '|', '^']
        # Distinct identifiers seen so far (persists across tokenize() calls).
        self.symbol_table = []

    def is_keyword(self, word):
        """Return True if *word* is a reserved keyword."""
        return word in self.keywords

    def is_punctuation(self, char):
        """Return True if *char* is a punctuation character."""
        return char in self.punctuations

    def is_operator(self, char):
        """Return True if *char* is an operator character."""
        return char in self.operators

    def is_digit(self, char):
        """Return True if *char* is an ASCII decimal digit."""
        return '0' <= char <= '9'

    def is_identifier_start(self, char):
        """Return True if *char* may begin an identifier (letter or ``_``)."""
        return char.isalpha() or char == '_'

    def is_identifier_part(self, char):
        """Return True if *char* may continue an identifier."""
        return char.isalnum() or char == '_'

    def _next_is_identifier_start(self, code, i):
        """Return True if a character exists after index *i* and starts an identifier.

        Centralizes the look-ahead so that a digit sitting at the very end
        of the input is never indexed past.
        """
        return i + 1 < len(code) and self.is_identifier_start(code[i + 1])

    def remove_comments(self, code):
        """Return *code* with ``//`` line comments and ``/* */`` block comments removed.

        The newline that ends a line comment is kept. A comment that is still
        open at end of input (no newline / no ``*/``) is dropped up to the end.
        String literals are not treated specially.
        """
        kept = []
        n = len(code)
        i = 0
        while i < n:
            pair = code[i:i + 2]
            if pair == '//':
                # Skip to the newline (exclusive), or to end of input if none.
                newline = code.find('\n', i)
                i = n if newline == -1 else newline
            elif pair == '/*':
                # Skip past the closing '*/', or to end of input if unterminated.
                close = code.find('*/', i + 2)
                i = n if close == -1 else close + 2
            else:
                kept.append(code[i])
                i += 1
        return ''.join(kept)

    def tokenize(self, code):
        """Split *code* into a list of ``(token_type, value)`` tuples.

        Token types are ``'keyword'``, ``'identifier'``, ``'constant'``
        (unsigned integer), ``'punctuation'``, ``'operator'`` (a maximal run
        of operator characters) and ``'string literal'`` (quotes included).
        Problems are reported in-band as tuples whose first element starts
        with ``'error!'``. New identifiers are appended to ``symbol_table``.
        """
        tokens = []
        n = len(code)
        i = 0
        while i < n:
            ch = code[i]

            # Whitespace only separates tokens.
            if ch.isspace():
                i += 1
                continue

            # Identifier or keyword.
            if self.is_identifier_start(ch):
                start = i
                i += 1
                while i < n and self.is_identifier_part(code[i]):
                    i += 1
                word = code[start:i]
                if self.is_keyword(word):
                    tokens.append(('keyword', word))
                else:
                    tokens.append(('identifier', word))
                    if word not in self.symbol_table:
                        self.symbol_table.append(word)
                continue

            # Integer constant. A digit immediately followed by an
            # identifier-start character (e.g. "7H") is deliberately not a
            # constant; it falls through to the error branch below.
            if self.is_digit(ch) and not self._next_is_identifier_start(code, i):
                start = i
                i += 1
                while (i < n and self.is_digit(code[i])
                       and not self._next_is_identifier_start(code, i)):
                    i += 1
                tokens.append(('constant', code[start:i]))
                continue

            if self.is_punctuation(ch):
                tokens.append(('punctuation', ch))
                i += 1
                continue

            # Operator: merge adjacent operator characters ("==", "+=", ...).
            if self.is_operator(ch):
                start = i
                i += 1
                while i < n and self.is_operator(code[i]):
                    i += 1
                tokens.append(('operator', code[start:i]))
                continue

            # String literal (no escape-sequence handling).
            if ch == '"':
                start = i
                i += 1
                while i < n and code[i] != '"':
                    i += 1
                if i < n:
                    tokens.append(('string literal', code[start:i + 1]))  # include closing quote
                    i += 1
                else:
                    # Keep the (type, value) shape so callers can always unpack.
                    tokens.append(("error! Undeterminated string literal", code[start:]))
                continue

            # Anything else is not part of the recognized language subset.
            tokens.append(("error! Unrecognize charecter", ch))
            i += 1
        return tokens

    def analyze(self, code):
        """Strip comments from *code*, tokenize it, and return ``(tokens, symbol_table)``."""
        clean_code = self.remove_comments(code)
        tokens = self.tokenize(clean_code)
        return tokens, self.symbol_table

In [None]:
# Main Function
def main():
    """Run the lexical analyzer on a sample C program and print the results.

    Prints every token as ``type: value`` followed by the symbol table.
    """
    # Active sample: contains a malformed number (7H), a character literal,
    # and both comment styles.
    c_program = """
    int main()
    {
      int a = 5 , 7H;
      // assign value
      char b = 'x';
      /* return
      value */
      return a + b;
    }
    """

    # Alternative sample: salary calculation with scanf/printf strings.
    # c_program = """
    # /* salary calculation*/
    # void main( )
    # {
    #   long int bs , da , hra , gs;
    #   //take basic salary as input
    #   scanf("%ld",&bs);
    #   //calculate allowances
    #   da=bs*.40;
    #   hra=bs*.20;
    #   gs=bs+da+hra;
    #   // display salary slip
    #   printf("\n\nbs : %ld",bs);
    #   printf("\nda : %ld",da);
    #   printf("\nhra : %ld",hra);
    #   printf("\ngs : %ld",gs);
    # }
    # """

    # Alternative sample: function prototype, call and definition.
    # c_program = """
    # //function prototype
    # void add ( int , int );
    # void main( )
    # {
    #   int a , b;
    #   a = 10;
    #   b = 20;
    #   // function call
    #   add ( a , b );
    # }
    # void add ( int x , int y )
    # {
    #   return x + y;
    # }
    # """

    # Alternative sample: user-defined struct with member access.
    # c_program = """
    # // user defined data type
    # struct student
    # {
    #   int id;
    #   float cgpa;
    # }
    # void main( )
    # {
    #   student s;
    #   s.id = 10;
    #   s.cgpa = 8.7;
    # }
    # """

    analyzer = LaxicalAnalyzer()

    # Strip comments first, then hand the stripped text to the analyzer.
    stripped_program = analyzer.remove_comments(c_program)
    tokens, symbol_table = analyzer.analyze(stripped_program)

    print("\nTokens:")
    for kind, text in tokens:
        print(f"{kind}: {text}")

    print("\nSymbol Table:")
    print(symbol_table)

# Run the demo only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()



Tokens:
keyword: int
identifier: main
punctuation: (
punctuation: )
punctuation: {
keyword: int
identifier: a
operator: =
constant: 5
punctuation: ,
error! Unrecognize charecter: 7
identifier: H
punctuation: ;
keyword: char
identifier: b
operator: =
error! Unrecognize charecter: '
identifier: x
error! Unrecognize charecter: '
punctuation: ;
keyword: return
identifier: a
operator: +
identifier: b
punctuation: ;
punctuation: }

Symbol Table:
['main', 'a', 'H', 'b', 'x']
