Connecting to Google Drive

In [43]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# PATH_TO_DATA (defined in the next cell) points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Defining base dir and sub dirs of the data

In [44]:
# Root folder of the dataset on the mounted Drive.
PATH_TO_DATA = "/content/drive/My Drive/NLP/"

# One sub-folder per problem category. Every value keeps its trailing "/"
# because the notebook builds paths by plain string concatenation.
ALGEBRA_DIR = "algebra/"
PREALGEBRA_DIR = "prealgebra/"
INTERMEDIATE_ALGEBRA_DIR = "intermediate_algebra/"

In [1]:
# NOTE(review): antlr4 is not imported anywhere in the visible notebook - confirm this install is still needed.
!pip install antlr4-python3-runtime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Importing required libraries

In [45]:
#For navigation in the folders
import os

#For dataframe processing
import pandas as pd

#For regex validation
import re

#For nlp
import spacy

Reading the json files and appending to input dataframe

In [46]:
#Reading the math problems json files into dataframe
def read_data(path_to_files, input_df=None):
  """Read every regular file in a folder with pd.read_json and stack the results.

  Args:
    path_to_files: Folder to scan. Only files directly inside it are read;
      sub-folders are skipped.
    input_df: Optional dataframe to extend. When omitted, a new dataframe is
      started.

  Returns:
    A dataframe holding the rows of `input_df` (if given) followed by the
    content of each file, re-indexed from 0. `input_df` itself is not
    modified. If the folder contains no files, `input_df` is returned as is
    (or an empty dataframe when it was omitted).
  """
  # Build full paths instead of os.chdir()-ing into the folder, so the
  # notebook's working directory is left untouched. Sorting makes the row
  # order reproducible, since os.listdir returns names in arbitrary order.
  file_paths = [
      os.path.join(path_to_files, name)
      for name in sorted(os.listdir(path_to_files))
  ]

  # Each file is parsed with orient='index' and transposed, exactly as before.
  frames = [
      pd.read_json(path, orient='index').T
      for path in file_paths
      if os.path.isfile(path)
  ]

  if not frames:
    return pd.DataFrame() if input_df is None else input_df
  if input_df is not None:
    frames.insert(0, input_df)

  # One concat over all frames instead of re-copying the dataframe per file.
  return pd.concat(frames, ignore_index=True)

In [114]:
# Start from the algebra problems, then append the two remaining categories
# in the same order as before (prealgebra, intermediate algebra).
math_problems = read_data(PATH_TO_DATA + ALGEBRA_DIR)
for later_dir in (PREALGEBRA_DIR, INTERMEDIATE_ALGEBRA_DIR):
  math_problems = read_data(PATH_TO_DATA + later_dir, math_problems)

In [None]:
# Alternative entry point: load the problems from a CSV instead of the raw JSON
# files. NOTE: running this cell replaces any `math_problems` built by the cell above.
# PATH_TO_DATA already ends in "/", so os.path.join avoids the doubled separator
# that PATH_TO_DATA + "/data_min.csv" produced.
math_problems = pd.read_csv(os.path.join(PATH_TO_DATA, "data_min.csv"))

Function to save dataframe

In [48]:
def save_dataframe_csv(path_to_save, filename, dataframe, include_index=False):
  """Write a dataframe to a CSV file inside a folder and print where it went.

  Args:
    path_to_save: Destination folder, with or without a trailing separator.
    filename: Name of the CSV file to create inside `path_to_save`.
    dataframe: The pandas dataframe to export.
    include_index: Whether the row index is written as the first CSV column.
  """
  # os.path.join adds the separator only when it is missing, so a folder given
  # without a trailing "/" no longer produces a mangled path such as "dirfile.csv".
  destination = os.path.join(path_to_save, filename)
  dataframe.to_csv(destination, index=include_index)
  print("Dataframe saved successfully at {}".format(destination))

In [None]:
# (rows, columns) of the loaded dataframe.
math_problems.shape

(4236, 4)

In [None]:
# Preview the first rows to check the columns that were read.
math_problems.head()

Unnamed: 0,problem,level,type,solution
0,"Brand X soda advertises, ``We will give you 20...",Level 5,Prealgebra,"Let $v$ be the volume of soda in Brand Y, and ..."
1,"During the first year, ABC's stock price start...",Level 2,Prealgebra,"After the first year, its price has doubled to..."
2,Find the greatest common divisor of 75 and 360.,Level 3,Prealgebra,$75 = 3^1 \cdot 5^2$ and $360 = 2^3 \cdot 3^2 ...
3,A school is arranging chairs in rows for an as...,Level 3,Prealgebra,The original number of chairs is divisible by ...
4,"The average value of all the pennies, nickels,...",Level 5,Prealgebra,If $n$ is the number of coins in Paula's purse...


Dropping the solution column (the type column is kept)

In [115]:
# Only 'solution' is removed; 'type' stays in the frame.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for the missing column.
math_problems = math_problems.drop(columns=['solution'], errors='ignore')
math_problems.head()

Unnamed: 0,problem,level,type
0,"The graphs of four functions, labelled (2) thr...",Level 5,Algebra
1,If each of the variables represents a differen...,Level 3,Algebra
2,What is the value of $a$ if the lines $2y - 2a...,Level 4,Algebra
3,Define $\#N$ by the formula $\#N = .5(N) + 1$....,Level 3,Algebra
4,What is the midpoint of the segment with endpo...,Level 2,Algebra


## Extracting math features

Detecting number of equations in the problem columns

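In our data, equations are enclosed in one of the following delimiter pairs:
  1. \$...\$
  2. \$\$...\$\$
  3. \\\(...\\\)
  4. \\\[...\\\]

So we count the equations by matching these patterns. Matches shorter than 3 characters are not counted, but every match is removed from the problem text.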

In [117]:
def get_equations_count(expression, min_length=3):
  """Extract the LaTeX equations embedded in a problem statement.

  Four delimiter pairs are recognised: $$...$$, $...$, \\(...\\) and \\[...\\].

  Args:
    expression: The problem text.
    min_length: Minimum number of characters an equation body needs in order
      to be collected and counted. Shorter matches are still removed from
      the text.

  Returns:
    A tuple (equations, count, stripped_text): the collected equation bodies
    without their delimiters, how many were collected, and the text with
    every delimited span removed.
  """
  # Tried in this order so that "$$...$$" is consumed before the single-"$" form.
  # The bracket forms use a lazy "+?" so that each block ends at its own closing
  # delimiter; a greedy match would merge several blocks into one "equation"
  # and delete the prose between them.
  regexes = [r"\$\$([^$]+)\$\$", r"\$([^$]+)\$", r"\\\(([^$]+?)\\\)", r"\\\[([^$]+?)\\\]"]
  eqs = []

  for regex in regexes:
    # Keep only the equations that are long enough to count
    eqs.extend(eqn for eqn in re.findall(regex, expression) if len(eqn) >= min_length)

    # Strip every match from the text, including the ones too short to count
    expression = re.sub(regex, "", expression)
  return eqs, len(eqs), expression

In [118]:
#Function to eliminate duplicate variables that are formed by combination of existing variables
def eliminate_concatenated_variables(strings):
    """Drop every name that is two other names from the collection glued together.

    For example, with "x" and "y" present, "xy" is removed. A name is only
    removed when it splits into exactly two parts that are both present.

    Args:
        strings: Iterable of candidate variable names. It is not modified.

    Returns:
        A new set containing the names that cannot be split into two names
        from the input.
    """
    candidates = set(strings)

    # Every name is tested against the full input set, not a set that shrinks
    # while it is being scanned. Removing in place made the outcome depend on
    # set iteration order: with {"a", "b", "ab", "abab"}, "abab" survived
    # whenever "ab" happened to be removed first.
    return {
        name
        for name in candidates
        if not any(
            name[:cut] in candidates and name[cut:] in candidates
            for cut in range(1, len(name))
        )
    }

In [119]:
def get_variable_count(equations):
  """Return the largest number of distinct variable tokens found in a single equation.

  A token is an optional backslash followed by letters, optionally followed
  by "_<digits>" and/or "^{<digits>}". Because of the optional backslash,
  LaTeX command names are matched as tokens too. Tokens that are two other
  tokens concatenated are discarded by eliminate_concatenated_variables
  before counting.

  Args:
    equations: Iterable of equation strings.

  Returns:
    The maximum per-equation token count, or 0 when `equations` is empty.
  """
  # Pattern used to pull variable tokens out of an equation
  variable_pattern = r"\\?[a-zA-Z]+(?:_[0-9]+)?(?:\^{[0-9]+})?"

  # One count per equation: distinct tokens left after removing concatenations
  counts_per_equation = (
      len(eliminate_concatenated_variables(set(re.findall(variable_pattern, equation))))
      for equation in equations
  )
  return max(counts_per_equation, default=0)

In [120]:
def detect_exp_or_mod(equations):
  """Tell whether any equation uses exponentiation, modulo or an absolute value.

  Args:
    equations: Iterable of equation strings (LaTeX source).

  Returns:
    True if at least one equation contains a "^", a modulo operator
    ("mod" as a word, or LaTeX \\mod, \\bmod, \\pmod), or a "|...|" pair;
    False otherwise, including for an empty iterable.
  """
  # Detect "^" (exponentiation)
  exponentiation_regex = r'\^'

  # Detect modulo. A bare \bmod\b word-boundary test cannot see LaTeX's
  # \pmod and \bmod, because the letter before "mod" is a word character,
  # so the backslash forms are matched explicitly.
  modulo_regex = r'\\[bp]?mod\b|\bmod\b'

  # Detect "|a+b|"
  absolute_value_regex = r'\|.*?\|'

  patterns = (exponentiation_regex, modulo_regex, absolute_value_regex)
  return any(
      re.search(pattern, equation)
      for equation in equations
      for pattern in patterns
  )

In [121]:
def has_log(equations):
  """Report whether any equation contains the LaTeX command "\\log".

  Args:
    equations: Iterable of equation strings.

  Returns:
    True as soon as one equation contains a backslash followed by "log";
    False otherwise, including for an empty iterable.
  """
  for equation in equations:
    # Only the backslash form counts; a plain "log" is not detected
    if re.search(r'\\log', equation) is not None:
      return True
  return False

In [149]:
def has_fractions(equations):
  """Report whether any equation contains a LaTeX fraction command.

  Args:
    equations: Iterable of equation strings (LaTeX source).

  Returns:
    True if at least one equation uses \\frac, \\dfrac, \\tfrac or \\cfrac;
    False otherwise, including for an empty iterable.
  """
  # The previous pattern "[frac]" was a character class: it matched any single
  # f, r, a or c, so almost every equation containing letters was flagged.
  # Match the command itself, and refuse a trailing letter so that longer
  # command names beginning with "frac" are not mistaken for it.
  fraction_pattern = r"\\[cdt]?frac(?![a-zA-Z])"

  return any(re.search(fraction_pattern, equation) for equation in equations)

In [123]:
def has_equality_or_inequality(equations):
  """Report whether any equation contains an equality or inequality sign.

  Args:
    equations: Iterable of equation strings (LaTeX source).

  Returns:
    True if at least one equation contains "=", ">", "<", or one of the LaTeX
    relations \\leq, \\geq, \\neq, \\le, \\ge, \\ne, \\lt, \\gt; False
    otherwise, including for an empty iterable.
  """
  # The original alternatives are kept unchanged. The short LaTeX forms are
  # added with a "no letter follows" guard so that \le does not fire on \left,
  # \ne on \newline or \neg, \ge on \gets, and so on.
  equality_regex = r'(=|>|<|\\leq|\\geq|\\neq|\\(?:le|ge|ne|lt|gt)(?![a-zA-Z]))'
  return any(re.search(equality_regex, equation) for equation in equations)

In [124]:
def extract_math_features(problem):
  """Compute the equation-based features of one problem statement.

  Args:
    problem: The raw problem text, including its LaTeX equations.

  Returns:
    A pandas Series with, in this order: 'modified_problem' (the text with
    the equations removed), 'no_of_equations', 'no_of_variables',
    'has_exp_or_mod', 'has_logarithm', 'has_fraction' and 'has_eq_or_neq'.
  """
  eqns, equations_count, modified_problem = get_equations_count(problem)

  # Keys are inserted in the order the columns should appear
  features = {'modified_problem': modified_problem, 'no_of_equations': equations_count}
  features['no_of_variables'] = get_variable_count(eqns)
  features['has_exp_or_mod'] = detect_exp_or_mod(eqns)
  features['has_logarithm'] = has_log(eqns)
  features['has_fraction'] = has_fractions(eqns)
  features['has_eq_or_neq'] = has_equality_or_inequality(eqns)

  return pd.Series(features)

In [155]:
# Compute the equation-based features once per problem, then attach them as columns.
# Feature columns left over from an earlier run of this cell are dropped first,
# so re-running the cell refreshes them instead of appending duplicate columns.
math_features = math_problems['problem'].apply(extract_math_features)
math_problems = pd.concat(
    [math_problems.drop(columns=math_features.columns, errors='ignore'), math_features],
    axis=1,
)

## Extracting text attributes

Loading the spaCy English model for further processing

In [157]:
import spacy

# Load the spaCy "en_core_web_sm" pipeline once. Despite its name, `corpus` is the
# callable pipeline object: corpus(text) returns the parsed document.
corpus = spacy.load("en_core_web_sm")
def get_text_attributes(text):
  """Compute simple length statistics for a piece of text.

  A "word" here is any token of the module-level spaCy pipeline `corpus`
  that is not whitespace.

  Args:
    text: The text to analyse.

  Returns:
    A pandas Series with, in this order: 'sentence_count', 'word_count',
    'words_per_sentence' (rounded), 'average_word_length' (rounded),
    'large_words' (number of distinct words longer than 6 characters) and
    'has_repeated_large_words' (True if any such word occurs more than once).
    Both averages are 0 when the text has no sentences or no words.
  """
  document = corpus(text)

  #Sentences count
  sentence_count = len(list(document.sents))

  word_count = 0
  char_count = 0
  # A set gives constant-time membership tests; its size is the number of
  # distinct large words.
  large_words = set()
  has_repeated_large_words = False

  for token in document:
    if token.is_space:
      continue

    #Word count and total characters in the text
    word_count += 1
    char_count += len(token.text)

    #Words with length greater than 6, tracking whether any of them repeats
    if len(token.text) > 6:
      if token.text in large_words:
        has_repeated_large_words = True
      else:
        large_words.add(token.text)

  # Text can be empty or whitespace-only, e.g. once equations have been
  # stripped from it. Report 0 for the averages instead of raising
  # ZeroDivisionError.
  #Average words per sentence
  words_per_sentence = round(word_count / sentence_count) if sentence_count else 0

  #Average length of words
  average_word_length = round(char_count / word_count) if word_count else 0

  return pd.Series({'sentence_count': sentence_count, 'word_count': word_count, 'words_per_sentence': words_per_sentence, 'average_word_length': average_word_length, 'large_words': len(large_words), 'has_repeated_large_words': has_repeated_large_words})

In [None]:
# Spot check on a single problem text (presumably one whose equations were already removed, given the gaps in the sentence).
get_text_attributes("A math teacher requires Noelle to do one homework assignment for each of the first five homework points she wants to earn; for each of the next five homework points, she needs to do two homework assignments; and so on, so that to earn the  homework point, she has to do  (rounded up) homework assignments. For example, when she has 11 points, it will take  homework assignments to earn her  point. What is the smallest number of homework assignments necessary to earn a total of 25 homework points?")

sentence_count                 3
word_count                   100
words_per_sentence            33
average_word_length            4
large_words                    9
has_repeated_large_words    True
dtype: object

In [158]:
# Text statistics are computed on the equation-free text ('modified_problem').
# Columns left over from an earlier run of this cell are dropped first, so
# re-running the cell refreshes them instead of appending duplicate columns.
text_attributes = math_problems['modified_problem'].apply(get_text_attributes)
math_problems = pd.concat(
    [math_problems.drop(columns=text_attributes.columns, errors='ignore'), text_attributes],
    axis=1,
)

In [None]:
# Preview the frame with the text-attribute columns attached.
math_problems.head()

Unnamed: 0,problem,level,type,no_of_equations,modified_problem,sentence_count,word_count,words_per_sentence,average_word_length,large_words,has_repeated_large_words
0,Let $p$ and $q$ be the two distinct solutions ...,Level 4,Algebra,3,Let and be the two distinct solutions to the...,1,18,18,4,3,False
1,"The graphs of four functions, labelled (2) thr...",Level 5,Algebra,1,"The graphs of four functions, labelled (2) thr...",4,209,52,5,40,True
2,If each of the variables represents a differen...,Level 3,Algebra,1,If each of the variables represents a differen...,3,68,23,5,13,False
3,What is the value of $a$ if the lines $2y - 2a...,Level 4,Algebra,2,What is the value of if the lines and are p...,1,12,12,3,1,False
4,Define $\#N$ by the formula $\#N = .5(N) + 1$....,Level 3,Algebra,3,Define by the formula . Calculate .,2,7,4,4,2,False


## Named Entity Recognition

In [None]:
#List of labels for named entity recognition using spacy for en_core_web_sm
#named_entity_recognition (below) creates one count per label listed here
corpus.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [159]:
def named_entity_recognition(text):
  """Count the named entities of each type found in a text.

  Args:
    text: The text to analyse with the module-level spaCy pipeline `corpus`.

  Returns:
    A pandas Series indexed by every label of the pipeline's "ner"
    component, holding the number of entities with that label (0 when none
    were found).
  """
  #Every known label starts at zero so that all rows share the same columns
  ner = dict.fromkeys(corpus.get_pipe("ner").labels, 0)

  for entity in corpus(text).ents:
    ner[entity.label_] = ner[entity.label_] + 1

  return pd.Series(ner)

In [160]:
# Entity counts are computed on the equation-free text ('modified_problem').
# Columns left over from an earlier run of this cell are dropped first, so
# re-running the cell refreshes them instead of appending duplicate columns.
ner_counts = math_problems['modified_problem'].apply(named_entity_recognition)
math_problems = pd.concat(
    [math_problems.drop(columns=ner_counts.columns, errors='ignore'), ner_counts],
    axis=1,
)

## POS Tagging

Parts of Speech tagging

In spaCy, the Part-of-Speech (POS) tags supported by the default English language model (en_core_web_sm) include the following:

ADJ: Adjective

ADP: Adposition

ADV: Adverb

AUX: Auxiliary verb

CCONJ: Coordinating conjunction

DET: Determiner

INTJ: Interjection

NOUN: Noun

NUM: Numeral

PART: Particle

PRON: Pronoun

PROPN: Proper noun

PUNCT: Punctuation

SCONJ: Subordinating conjunction

SYM: Symbol

VERB: Verb

X: Other

In [161]:
# Part-of-speech tags that get a count column, in column order.
# NOTE(review): 'CONJ' is not in the tag list documented in the markdown above -
# confirm the model can emit it; otherwise that column is always 0.
pos_labels = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ',
    'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
    'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
]

In [162]:
def pos_count_tagging(text):
  """Count how many tokens of a text carry each part-of-speech tag.

  Args:
    text: The text to tag with the module-level spaCy pipeline `corpus`.

  Returns:
    A pandas Series indexed by the tags in `pos_labels`, holding the number
    of non-whitespace tokens with that tag.

  Raises:
    KeyError: If a non-whitespace token has a tag that is not in `pos_labels`.
  """
  # Every tag starts at zero so that all rows share the same columns
  tag_counts = dict.fromkeys(pos_labels, 0)

  # Whitespace tokens are left out of the counts
  content_tokens = (token for token in corpus(text) if not token.is_space)
  for token in content_tokens:
    tag_counts[token.pos_] += 1
  return pd.Series(tag_counts)

In [163]:
# POS counts are computed on the equation-free text ('modified_problem').
# Columns left over from an earlier run of this cell are dropped first, so
# re-running the cell refreshes them instead of appending duplicate columns.
pos_counts = math_problems['modified_problem'].apply(pos_count_tagging)
math_problems = pd.concat(
    [math_problems.drop(columns=pos_counts.columns, errors='ignore'), pos_counts],
    axis=1,
)

In [164]:
# Final column list: original fields, math features, text attributes, NER counts and POS counts.
math_problems.columns

Index(['problem', 'level', 'type', 'modified_problem', 'no_of_equations',
       'no_of_variables', 'has_exp_or_mod', 'has_logarithm', 'has_fraction',
       'has_eq_or_neq', 'sentence_count', 'word_count', 'words_per_sentence',
       'average_word_length', 'large_words', 'has_repeated_large_words',
       'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC',
       'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT',
       'QUANTITY', 'TIME', 'WORK_OF_ART', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
       'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',
       'SCONJ', 'SYM', 'VERB', 'X'],
      dtype='object')

Saving the final dataframe as a csv file

In [None]:
# Write the dataframe with all extracted features to all_data.csv under PATH_TO_DATA (row index omitted by default).
save_dataframe_csv(PATH_TO_DATA, 'all_data.csv', math_problems)