Connecting to drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Defining base dir and sub dirs of the data

In [80]:
PATH_TO_DATA = r"/Users/sai/My Drive/Learning/Research/NLP/"
ALGEBRA_DIR= "algebra/"
PREALGEBRA_DIR = "prealgebra/"
INTERMEDIATE_ALGEBRA_DIR = "intermediate_algebra/"

Importing required libraries

In [75]:
#For navigation in the folders
import os

#For dataframe processing
import pandas as pd

#For regex validation
import re

#For nlp
import spacy

Reading the json files and appending to input dataframe

In [76]:
#Reading the math problems json files into dataframe
def read_data(path_to_files, input_df=pd.DataFrame()):
  #Files in the current directory
  os.chdir(path_to_files)
  files = [f for f in os.listdir('.') if os.path.isfile(f)]
  for file_name in files:
    data = pd.read_json(file_name, orient='index').T
    input_df = pd.concat([input_df,data], ignore_index = True)
  return input_df

In [81]:
math_problems = read_data(PATH_TO_DATA + ALGEBRA_DIR)
math_problems = read_data(PATH_TO_DATA + PREALGEBRA_DIR, math_problems)
math_problems = read_data(PATH_TO_DATA + INTERMEDIATE_ALGEBRA_DIR, math_problems)

Function to save dataframe

In [82]:
def save_dataframe_csv(path_to_save, filename, dataframe, include_index=False):
  dataframe.to_csv(path_to_save+filename, index=include_index)
  print("Dataframe saved successfully at {}".format(path_to_save+filename))

In [83]:
def read_dataframe_csv(path_to_file):
    return pd.read_csv(path_to_file)

In [84]:
math_problems.shape

(4236, 4)

In [None]:
math_problems.head()

Unnamed: 0,problem,level,type,solution
0,"Brand X soda advertises, ``We will give you 20...",Level 5,Prealgebra,"Let $v$ be the volume of soda in Brand Y, and ..."
1,"During the first year, ABC's stock price start...",Level 2,Prealgebra,"After the first year, its price has doubled to..."
2,Find the greatest common divisor of 75 and 360.,Level 3,Prealgebra,$75 = 3^1 \cdot 5^2$ and $360 = 2^3 \cdot 3^2 ...
3,A school is arranging chairs in rows for an as...,Level 3,Prealgebra,The original number of chairs is divisible by ...
4,"The average value of all the pennies, nickels,...",Level 5,Prealgebra,If $n$ is the number of coins in Paula's purse...


Dropping the solution column

In [85]:
math_problems.drop(columns=['solution'], inplace=True)
math_problems.head()

Unnamed: 0,problem,level,type
0,Kevin Kangaroo begins hopping on a number line...,Level 5,Algebra
1,The ratio of the areas of two squares is $\fra...,Level 4,Algebra
2,"If $\sqrt{2\sqrt{t-2}} = \sqrt[4]{7 - t}$, the...",Level 4,Algebra
3,Let $t(x) = \sqrt{3x+1}$ and $f(x)=5-t(x)$. Wh...,Level 4,Algebra
4,James has a total of 66 dollars in his piggy b...,Level 2,Algebra


## Extracting math features

Detecting number of equations in the problem columns

In our data, the equations are enclosed inside either of
  1. \$...$
  2. \$\$...\$\$
  3. \\\[...\\\]

So we need to count the number of equations by matching these patterns

In [86]:
def get_equations_count(expression):
  regexes = [r"\$\$([^$]+)\$\$", r"\$([^$]+)\$", r"\\\(([^$]+)\\\)", r"\\\[([^$]+)\\\]"]
  no_of_equations = 0
  eqs = []
  
  #For each regex, we are finding the number of equations
  for regex in regexes:
    matches = re.findall(regex, expression)
    for eqn in matches:

      #Considering only those equations that have a min length of 3
      if len(eqn) >= 3:
        eqs.append(eqn)
        no_of_equations += 1
    
    # Replacing the equations in the original text
    expression = re.sub(regex, "", expression)
  return eqs, no_of_equations, expression

In [87]:
#Function to eliminate duplicate variables that are formed by combination of existing variables
def eliminate_concatenated_variables(strings):
    strings_set = set(strings)
    is_concatenated = True

    while is_concatenated:
        is_concatenated = False
        for string in strings_set.copy():
            for i in range(1, len(string)):
                prefix = string[:i]
                suffix = string[i:]
                if prefix in strings_set and suffix in strings_set:
                    strings_set.remove(string)
                    is_concatenated = True
                    break

    return strings_set

In [88]:
def get_variable_count(equations):
  # Define regular expression pattern to extract variables
  variable_pattern = r"\\?[a-zA-Z]+(?:_[0-9]+)?(?:\^{[0-9]+})?"
  max_no_of_variables = 0

  for equation in equations:
    # Find all variables in the equation
    variables = re.findall(variable_pattern, equation)

    # Count the number of unique variables
    unique_variables = set(variables)
    unique_variables = eliminate_concatenated_variables(unique_variables)
    num_unique_variables = len(unique_variables)
    max_no_of_variables = num_unique_variables if num_unique_variables > max_no_of_variables else max_no_of_variables

  return max_no_of_variables

In [89]:
def detect_exp(equations):
  exponentiation_regex = r'\^|exp|sqrt'
  has_exponentiation = any(re.search(exponentiation_regex, equation) for equation in equations)
  no_of_exponents = sum([len(re.findall(exponentiation_regex, equation)) for equation in equations])

  return has_exponentiation, no_of_exponents

In [90]:
def detect_mod(equations):
    # Detect "mod" (modulo)
    modulo_regex = r'\bmod\b'
    has_modulo = any(re.search(modulo_regex, equation) for equation in equations)
    no_of_mods = sum([len(re.findall(modulo_regex, equation)) for equation in equations])

    # Detect "|a+b|"
    absolute_value_regex = r'\|.*?\|'
    has_absolute_value =any(re.search(absolute_value_regex, equation) for equation in equations)
    no_of_mods += sum([len(re.findall(absolute_value_regex, equation)) for equation in equations])

    return (has_modulo or has_absolute_value), no_of_mods

In [91]:
def has_log(equations):
  # Detect "log" function
  log_regex = r'\\log'
  has_log = any(re.search(log_regex, equation) for equation in equations)
  no_of_logs = sum([len(re.findall(log_regex, equation)) for equation in equations])
  return has_log, no_of_logs

In [92]:
def has_fractions(equations):
  fraction_pattern = r"frac"
  has_fraction = any(re.search(fraction_pattern, equation) for equation in equations)
  no_of_fractions = sum([len(re.findall(fraction_pattern, equation)) for equation in equations])
  
  return has_fraction, no_of_fractions

In [93]:
def has_equality(equations):
  # Detect equality
  equality_regex = r'\b=\b|\b==\b'
  has_eq = any(re.search(equality_regex, equation) for equation in equations)
  no_of_eqs = sum([len(re.findall(equality_regex, equation)) for equation in equations])

  return has_eq, no_of_eqs

In [94]:
def has_inequality(equations):
    # Detect equality or inequality
    neq_regex = r'(>|<|\\leq|\\geq|\\neq|!=)'
    has_neq =  any(re.search(neq_regex, repr(equation)) for equation in equations)
    no_of_neqs = sum([len(re.findall(neq_regex, repr(equation))) for equation in equations])

    return has_neq, no_of_neqs

In [95]:
def max_degree_of_equations(equations):
  degree_regex = r"([-+])?\s*(\d+)?\s*([a-zA-Z]+)\s*\^?\s*(\d+)?"
  max_degree = 0
  for equation in equations:
    terms = re.findall(degree_regex, equation)
    # Initialize degree as zero
    degree = 0
    # Iterate over each term and update the degree if necessary
    for term in terms:
        sign = term[0]
        coefficient = term[1]
        variable = term[2]
        exponent = term[3] if term[3] else '1'

        if variable:
            if exponent.isdigit():
                current_degree = int(exponent)
                degree = max(degree, current_degree)
    max_degree = max(degree, max_degree)
  return max_degree

In [96]:
def extract_math_features(problem):
  eqns, equations_count, modified_problem = get_equations_count(problem)
  variable_count = get_variable_count(eqns)
  has_exp, no_of_exponents = detect_exp(eqns)
  has_mod, no_of_mods = detect_mod(eqns)
  has_logarithm, no_of_logs = has_log(eqns)
  has_fracs, no_of_fractions = has_fractions(eqns)
  has_eq, no_of_eqs = has_equality(eqns)
  has_neq, no_of_neqs = has_inequality(eqns)
  max_degree = max_degree_of_equations(eqns)

  return pd.Series({'modified_problem': modified_problem, 
                    'no_of_equations': equations_count, 
                    'no_of_variables': variable_count, 
                    'has_exp': has_exp, 
                    'has_mod': has_mod,
                    'has_logarithm': has_logarithm, 
                    'has_fraction': has_fracs, 
                    'has_eq':has_eq,
                    'has_neq': has_neq,
                    'exp_count': no_of_exponents,
                    'mod_count': no_of_mods,
                    'log_count': no_of_logs,
                    'fracs_count': no_of_fractions,
                    'eqlts_count': no_of_eqs,
                    'neqlts_count': no_of_neqs,
                    'max_degree_of_equations': max_degree
                  })

In [97]:
math_problems = pd.concat([math_problems, math_problems['problem'].apply(extract_math_features)], axis=1)

In [98]:
math_problems.head()

Unnamed: 0,problem,level,type,modified_problem,no_of_equations,no_of_variables,has_exp,has_mod,has_logarithm,has_fraction,has_eq,has_neq,exp_count,mod_count,log_count,fracs_count,eqlts_count,neqlts_count,max_degree_of_equations
0,Kevin Kangaroo begins hopping on a number line...,Level 5,Algebra,Kevin Kangaroo begins hopping on a number line...,2,1,False,False,False,True,False,False,0,0,0,2,0,0,1
1,The ratio of the areas of two squares is $\fra...,Level 4,Algebra,The ratio of the areas of two squares is . Aft...,3,5,True,False,False,True,False,False,1,0,0,2,0,0,1
2,"If $\sqrt{2\sqrt{t-2}} = \sqrt[4]{7 - t}$, the...",Level 4,Algebra,"If , then find .",1,2,True,False,False,False,False,False,3,0,0,0,0,0,1
3,Let $t(x) = \sqrt{3x+1}$ and $f(x)=5-t(x)$. Wh...,Level 4,Algebra,Let and . What is ?,3,3,True,False,False,False,False,False,1,0,0,0,0,0,1
4,James has a total of 66 dollars in his piggy b...,Level 2,Algebra,James has a total of 66 dollars in his piggy b...,0,0,False,False,False,False,False,False,0,0,0,0,0,0,0


## Extracting text attributes

Installing spacy for further processing

In [99]:
corpus = spacy.load("en_core_web_sm")
def get_text_attributes(text):

  document = corpus(text)

  sentences = list(document.sents)
  
  #Sentences count
  sentence_count = len(sentences)

  word_count = 0
  words_len_greater_than_six = 0
  char_count = 0
  large_words=[]
  has_repeated_large_words = False
  
  for token in document:
    if not token.is_space: 
        
        #Word count in the text
        word_count += 1
        char_count += len(token.text)
        
        #Number of words with length greater than 6
        if len(token.text) > 6:
          if token.text not in large_words:
            words_len_greater_than_six += 1
            large_words.append(token.text)
          else:
            has_repeated_large_words = True
  
  #Average words per sentence
  words_per_sentence = round(word_count / sentence_count)

  #Average length of words
  average_word_length = round(char_count / word_count)

  return pd.Series({'sentence_count': sentence_count, 'word_count': word_count, 'words_per_sentence': words_per_sentence, 'average_word_length': average_word_length, 'large_words': words_len_greater_than_six, 'has_repeated_large_words': has_repeated_large_words})

In [100]:
math_problems = pd.concat([math_problems, math_problems['modified_problem'].apply(get_text_attributes)], axis=1)

## Named Entity Recognition

In [101]:
#List of labels for named entity recognition using spacy for en_core_web_sm
corpus.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [102]:
def named_entity_recognition(text):
  #Initialising a dictionary with all named entities
  ner = {}
  for label in corpus.get_pipe("ner").labels:
    ner[label] = 0
  
  document = corpus(text)

  for entity in document.ents:
    ner[entity.label_] += 1
  
  return pd.Series(ner)

In [103]:
math_problems = pd.concat([math_problems, math_problems['modified_problem'].apply(named_entity_recognition)], axis=1)

## POS Tagging

Parts of Speech tagging

In spaCy, the Part-of-Speech (POS) tags supported by the default English language model (en_core_web_sm) include the following:

ADJ: Adjective

ADP: Adposition

ADV: Adverb

AUX: Auxiliary verb

CCONJ: Coordinating conjunction

DET: Determiner

INTJ: Interjection

NOUN: Noun

NUM: Numeral

PART: Particle

PRON: Pronoun

PROPN: Proper noun

PUNCT: Punctuation

SCONJ: Subordinating conjunction

SYM: Symbol

VERB: Verb

X: Other

In [104]:
pos_labels = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']

In [105]:
def pos_count_tagging(text):
  pos = {}
  for pos_tag in pos_labels:
    pos[pos_tag] = 0
  
  document = corpus(text)

  for token in document:
    if not token.is_space: 
      pos[token.pos_] += 1
  return pd.Series(pos)

In [106]:
math_problems = pd.concat([math_problems, math_problems['modified_problem'].apply(pos_count_tagging)], axis=1)

In [107]:
math_problems.columns

Index(['problem', 'level', 'type', 'modified_problem', 'no_of_equations',
       'no_of_variables', 'has_exp', 'has_mod', 'has_logarithm',
       'has_fraction', 'has_eq', 'has_neq', 'exp_count', 'mod_count',
       'log_count', 'fracs_count', 'eqlts_count', 'neqlts_count',
       'max_degree_of_equations', 'sentence_count', 'word_count',
       'words_per_sentence', 'average_word_length', 'large_words',
       'has_repeated_large_words', 'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE',
       'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT',
       'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART', 'ADJ', 'ADP',
       'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
       'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'],
      dtype='object')

Save data with all features

In [108]:
save_dataframe_csv(PATH_TO_DATA, 'all_data.csv', math_problems)

Dataframe saved successfully at /Users/sai/My Drive/Learning/Research/NLP/all_data.csv


Feature Reduction

In [28]:
math_problems = pd.read_csv(PATH_TO_DATA+"all_data.csv")

In [29]:
math_problems.columns

Index(['problem', 'level', 'type', 'modified_problem', 'no_of_equations',
       'no_of_variables', 'has_exp_or_mod', 'has_logarithm', 'has_fraction',
       'has_eq_or_neq', 'sentence_count', 'word_count', 'words_per_sentence',
       'average_word_length', 'large_words', 'has_repeated_large_words',
       'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC',
       'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT',
       'QUANTITY', 'TIME', 'WORK_OF_ART', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
       'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',
       'SCONJ', 'SYM', 'VERB', 'X'],
      dtype='object')

In [30]:
cols_to_drop = ['sentence_count', 'word_count', 'words_per_sentence',
       'average_word_length',
       'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC',
       'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT',
       'QUANTITY', 'TIME', 'WORK_OF_ART', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
       'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',
       'SCONJ', 'SYM', 'VERB', 'X']

In [17]:
# 1. ratio of numbers of words in each problem/total no. of sentences in each problems
# We can use the words_per_sentence feature

In [31]:
# 2. ratio of number of pronouns used in each problem/ total no. of words in each problems
math_problems['pron_words_ratio'] = round(math_problems['PRON'] / math_problems['word_count'], 3)

In [32]:
# 3. ratio of number of pronouns used in each problem/ total no. of sentences in each problems
math_problems['pron_sents_ratio'] = round(math_problems['PRON'] / math_problems['sentence_count'], 3)

In [33]:
# 4. ratio of number of adjectives/total. number of sentences in each problems
math_problems['adj_sents_ratio'] = round(math_problems['ADJ'] / math_problems['sentence_count'], 3)

In [34]:
# 5. ratio of number of adjectives/total. number of words in each problems
math_problems['adj_words_ratio'] = round(math_problems['ADJ'] / math_problems['word_count'], 3)

In [35]:
math_problems = pd.concat([math_problems, math_problems['problem'].apply(extract_degree_of_equations)], axis=1)

In [36]:
math_problems.drop(columns=cols_to_drop, inplace=True)

In [37]:
math_problems.columns

Index(['problem', 'level', 'type', 'modified_problem', 'no_of_equations',
       'no_of_variables', 'has_exp_or_mod', 'has_logarithm', 'has_fraction',
       'has_eq_or_neq', 'large_words', 'has_repeated_large_words',
       'pron_words_ratio', 'pron_sents_ratio', 'adj_sents_ratio',
       'adj_words_ratio', 'max_degree_of_equations'],
      dtype='object')

Saving the final dataframe as a csv file

In [41]:
save_dataframe_csv(PATH_TO_DATA, 'all_data_compressed.csv', math_problems)

Dataframe saved successfully at /content/drive/My Drive/NLP/all_data_compressed.csv
