Connecting to drive

In [12]:
#Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Navigating to project directory

In [13]:
import os
#Switch the working directory to the project folder on the mounted drive;
#relative paths in the cells below are resolved against it
os.chdir("/content/drive/My Drive/NLP/algebra/")

Importing required libraries

In [4]:
#For dataframe processing
import pandas as pd

#For regex validation
import re

#For nlp
import spacy

Reading the json files and appending to input dataframe

In [5]:
#JSON problem files in the current directory.
#Only *.json files are kept because this notebook also writes data.csv into the
#same directory, and that file must not be fed to pd.read_json on a re-run.
#os.listdir returns names in arbitrary order, so the list is sorted to make the
#row order of the resulting dataframe reproducible.
files = sorted(
    f for f in os.listdir('.')
    if os.path.isfile(f) and f.lower().endswith('.json')
)
len(files)

1736

In [6]:
#Reading the math problems json files into dataframe
#Every file is parsed on its own and the resulting frames are collected first,
#then concatenated once: calling pd.concat inside the loop re-copies all rows
#loaded so far on every iteration.
problem_frames = [pd.read_json(file_name, orient='index').T for file_name in files]

#pd.concat raises on an empty list, so fall back to an empty frame when no files were found
math_problems = pd.concat(problem_frames, ignore_index=True) if problem_frames else pd.DataFrame()

In [None]:
#Persist the combined problems as CSV (row index included).
#NOTE(review): the relative path puts data.csv in the same directory that is
#scanned for input files above — confirm the listing cell will not pick it up on a re-run.
math_problems.to_csv("data.csv")

In [None]:
#Preview the first rows of the combined frame
math_problems.head()

Unnamed: 0,problem,level,type,solution
0,A horse 24 feet from the center of a merry-go-...,Level 2,Algebra,The radius of the circular path of the horse c...
1,A math teacher requires Noelle to do one homew...,Level 5,Algebra,Noelle only has to do 1 homework assignment to...
2,"In a rectangular coordinate system, what is th...",Level 2,Algebra,We use the distance formula: \begin{align*}\n\...
3,"Below is a portion of the graph of a function,...",Level 5,Algebra,Note that the graph of $g(x)$ is identical to ...
4,"If $x^2+y^2=1$, what is the largest possible v...",Level 5,Algebra,"If $(x,y)$ lies on the circle, so does $(x,-y)..."


Dropping the solution and type columns

In [7]:
#Drop the columns that are not used as features.
#The result is assigned to a new frame and errors='ignore' is passed, so this
#cell can be re-run after the columns are already gone without raising KeyError.
math_problems = math_problems.drop(columns=['type', 'solution'], errors='ignore')
math_problems.head()

Unnamed: 0,problem,level
0,A horse 24 feet from the center of a merry-go-...,Level 2
1,A math teacher requires Noelle to do one homew...,Level 5
2,"In a rectangular coordinate system, what is th...",Level 2
3,"Below is a portion of the graph of a function,...",Level 5
4,"If $x^2+y^2=1$, what is the largest possible v...",Level 5


Detecting number of equations in the problem columns

In our data, the equations are enclosed in one of the following delimiter pairs:
  1. \$...\$
  2. \$\$...\$\$
  3. \\\[...\\\]
  4. \\\(...\\\)

So we count the number of equations by matching these patterns.

In [8]:
import re

def get_equations_count(expression):
  """Count the LaTeX equations in a problem statement and strip them out.

  Recognised delimiters: $$...$$, $...$, \\(...\\) and \\[...\\].

  Parameters
  ----------
  expression : str
      Problem text that may contain LaTeX equations.

  Returns
  -------
  pd.Series
      'no_of_equations'  - number of delimited equations found
      'modified_problem' - the text with every matched equation removed
  """
  #Order matters: $$...$$ has to be consumed before $...$, otherwise the single
  #dollar pattern would split a display equation at its inner dollar signs.
  #The \( \) and \[ \] patterns are non-greedy so that two equations in the same
  #text are counted separately instead of being merged into one match that also
  #swallows the prose between them.
  regexes = [r"\$\$([^$]+)\$\$", r"\$([^$]+)\$", r"\\\(([^$]+?)\\\)", r"\\\[([^$]+?)\\\]"]
  no_of_equations = 0
  for regex in regexes:
    #subn removes the matches and reports how many were removed in a single pass
    expression, removed = re.subn(regex, "", expression)
    no_of_equations += removed
  return pd.Series({'no_of_equations':no_of_equations, 'modified_problem':expression})

In [None]:
#Raw string literal: \[ and \] are LaTeX delimiters, not valid Python escape sequences
get_equations_count(r'Simplify the following expression in $x$: \[2x+8x^2+9-(4-2x-8x^2).\] Express your answer in the form $ax^2 +bx+c$, where $a$, $b$, and $c$ are numbers.')

[] 0
['x', 'ax^2 +bx+c', 'a', 'b', 'c'] 5
[] 5
['2x+8x^2+9-(4-2x-8x^2).'] 6


no_of_equations                                                     6
modified_problem    Simplify the following expression in :  Expres...
dtype: object

In [10]:
#Append the per-problem equation count and the equation-free text as new columns
math_problems = pd.concat(
    objs=[
        math_problems,
        math_problems['problem'].apply(get_equations_count),
    ],
    axis='columns',
)

In [11]:
#Preview the frame with the new no_of_equations and modified_problem columns
math_problems.head()

Unnamed: 0,problem,level,no_of_equations,modified_problem
0,A horse 24 feet from the center of a merry-go-...,Level 2,0,A horse 24 feet from the center of a merry-go-...
1,A math teacher requires Noelle to do one homew...,Level 5,4,A math teacher requires Noelle to do one homew...
2,"In a rectangular coordinate system, what is th...",Level 2,1,"In a rectangular coordinate system, what is th..."
3,"Below is a portion of the graph of a function,...",Level 5,6,"Below is a portion of the graph of a function,..."
4,"If $x^2+y^2=1$, what is the largest possible v...",Level 5,2,"If , what is the largest possible value of ?"


In [None]:
#Save without the row index: the file is read back with a plain pd.read_csv,
#which would otherwise surface the index as an extra unnamed column. This also
#matches the index=False used for the other processed CSV files in this notebook.
math_problems.to_csv("../data_processed.csv", index=False)

In [14]:
#Reload the processed problems from disk, replacing the in-memory frame
math_problems = pd.read_csv("../data_processed.csv")

Using spaCy for further processing

In [16]:
#Check the number of rows and columns of the reloaded frame
math_problems.shape

(1634, 4)

In [20]:
import spacy

#Load the en_core_web_sm spaCy pipeline once at module level; the feature
#functions defined below reuse this object instead of reloading the model
corpus = spacy.load("en_core_web_sm")
def get_text_attributes(text, nlp=None):
  """Compute simple size and readability statistics for a piece of text.

  Every token that is not whitespace is counted as a word, so any punctuation
  tokens emitted by the tokenizer are included in the word and character counts.

  Parameters
  ----------
  text : str
      Text to analyse. Non-string values (for example NaN produced when an empty
      cell is read back from CSV) and blank strings yield an all-zero result
      instead of raising.
  nlp : callable, optional
      Pipeline used to parse the text. It must return an object that exposes
      `sents` and iterates over tokens with `text` and `is_space` attributes.
      Defaults to the module-level `corpus`.

  Returns
  -------
  pd.Series
      sentence_count, word_count, words_per_sentence (rounded),
      average_word_length (rounded), large_words (number of distinct tokens
      longer than 6 characters) and has_repeated_large_words (True when any
      such token occurs more than once).
  """
  #Nothing to measure: avoids parsing a non-string and dividing by zero below
  if not isinstance(text, str) or not text.strip():
    return pd.Series({'sentence_count': 0, 'word_count': 0, 'words_per_sentence': 0, 'average_word_length': 0, 'large_words': 0, 'has_repeated_large_words': False})

  pipeline = corpus if nlp is None else nlp
  document = pipeline(text)

  #Sentences count
  sentence_count = len(list(document.sents))

  word_count = 0
  char_count = 0
  #A set keeps the "seen before?" check constant-time; matching is case-sensitive
  large_words = set()
  has_repeated_large_words = False

  for token in document:
    if token.is_space:
      continue

    #Word count and total characters in the text
    word_count += 1
    char_count += len(token.text)

    #Distinct words with length greater than 6, and whether any of them repeats
    if len(token.text) > 6:
      if token.text in large_words:
        has_repeated_large_words = True
      else:
        large_words.add(token.text)

  #Average words per sentence (guarded in case the pipeline reports no sentences)
  words_per_sentence = round(word_count / sentence_count) if sentence_count else 0

  #Average length of words (guarded in case every token was whitespace)
  average_word_length = round(char_count / word_count) if word_count else 0

  return pd.Series({'sentence_count': sentence_count, 'word_count': word_count, 'words_per_sentence': words_per_sentence, 'average_word_length': average_word_length, 'large_words': len(large_words), 'has_repeated_large_words': has_repeated_large_words})

In [21]:
#Spot-check the attribute extraction on one problem whose equations were already stripped
get_text_attributes("A math teacher requires Noelle to do one homework assignment for each of the first five homework points she wants to earn; for each of the next five homework points, she needs to do two homework assignments; and so on, so that to earn the  homework point, she has to do  (rounded up) homework assignments. For example, when she has 11 points, it will take  homework assignments to earn her  point. What is the smallest number of homework assignments necessary to earn a total of 25 homework points?")

sentence_count                 3
word_count                   100
words_per_sentence            33
average_word_length            4
large_words                    9
has_repeated_large_words    True
dtype: object

In [22]:
#Append the text statistics computed from the equation-free problem text as new columns
math_problems = pd.concat(
    objs=[
        math_problems,
        math_problems['modified_problem'].apply(get_text_attributes),
    ],
    axis='columns',
)

In [23]:
#Preview the frame with the text attribute columns added
math_problems.head()

Unnamed: 0,problem,level,no_of_equations,modified_problem,sentence_count,word_count,words_per_sentence,average_word_length,large_words,has_repeated_large_words
0,A horse 24 feet from the center of a merry-go-...,Level 2,0,A horse 24 feet from the center of a merry-go-...,2,41,20,4,2,True
1,A math teacher requires Noelle to do one homew...,Level 5,4,A math teacher requires Noelle to do one homew...,3,100,33,4,9,True
2,"In a rectangular coordinate system, what is th...",Level 2,1,"In a rectangular coordinate system, what is th...",1,22,22,4,3,False
3,"If $x^2+y^2=1$, what is the largest possible v...",Level 5,2,"If , what is the largest possible value of ?",1,10,10,4,2,False
4,"If Alex gives Bob a penny, Bob will have three...",Level 4,0,"If Alex gives Bob a penny, Bob will have three...",3,45,15,4,2,True


In [24]:
#Checkpoint the frame with the base text attributes (row index omitted)
math_problems.to_csv("../data_processed_with_base_attrs.csv", index=False)

In [26]:
#List of labels for named entity recognition using spacy for en_core_web_sm
#(each label becomes one count column in the NER features built below)
corpus.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [31]:
def named_entity_recognition(text, nlp=None):
  """Count the named entities found in a text, per entity label.

  Parameters
  ----------
  text : str
      Text to analyse. Non-string values (for example NaN produced when an empty
      cell is read back from CSV) yield zero for every label instead of raising.
  nlp : optional
      Pipeline used to parse the text. It must provide get_pipe("ner").labels
      and be callable on a string, returning an object with `ents`.
      Defaults to the module-level `corpus`.

  Returns
  -------
  pd.Series
      One entry per label of the pipeline's "ner" component, holding the number
      of entities with that label. Labels that do not occur are present with 0,
      so every call returns the same index.
  """
  pipeline = corpus if nlp is None else nlp

  #Initialising a dictionary with all named entities
  ner = dict.fromkeys(pipeline.get_pipe("ner").labels, 0)

  if isinstance(text, str):
    for entity in pipeline(text).ents:
      ner[entity.label_] += 1

  return pd.Series(ner)

In [32]:
#Spot-check the entity counts on one problem whose equations were already stripped
named_entity_recognition("A math teacher requires Noelle to do one homework assignment for each of the first five homework points she wants to earn; for each of the next five homework points, she needs to do two homework assignments; and so on, so that to earn the  homework point, she has to do  (rounded up) homework assignments. For example, when she has 11 points, it will take  homework assignments to earn her  point. What is the smallest number of homework assignments necessary to earn a total of 25 homework points?")

CARDINAL       6
DATE           0
EVENT          0
FAC            0
GPE            0
LANGUAGE       0
LAW            0
LOC            0
MONEY          0
NORP           0
ORDINAL        1
ORG            0
PERCENT        0
PERSON         1
PRODUCT        0
QUANTITY       0
TIME           0
WORK_OF_ART    0
dtype: int64

In [33]:
#Append one count column per NER label, computed from the equation-free problem text
math_problems = pd.concat(
    objs=[
        math_problems,
        math_problems['modified_problem'].apply(named_entity_recognition),
    ],
    axis='columns',
)

In [35]:
#Preview the frame with the NER count columns added
math_problems.head()

Unnamed: 0,problem,level,no_of_equations,modified_problem,sentence_count,word_count,words_per_sentence,average_word_length,large_words,has_repeated_large_words,...,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,A horse 24 feet from the center of a merry-go-...,Level 2,0,A horse 24 feet from the center of a merry-go-...,2,41,20,4,2,True,...,0,0,0,0,0,0,0,2,0,0
1,A math teacher requires Noelle to do one homew...,Level 5,4,A math teacher requires Noelle to do one homew...,3,100,33,4,9,True,...,0,0,1,0,0,1,0,0,0,0
2,"In a rectangular coordinate system, what is th...",Level 2,1,"In a rectangular coordinate system, what is th...",1,22,22,4,3,False,...,0,0,0,0,0,0,0,0,0,0
3,"If $x^2+y^2=1$, what is the largest possible v...",Level 5,2,"If , what is the largest possible value of ?",1,10,10,4,2,False,...,0,0,0,0,0,0,0,0,0,0
4,"If Alex gives Bob a penny, Bob will have three...",Level 4,0,"If Alex gives Bob a penny, Bob will have three...",3,45,15,4,2,True,...,0,0,0,0,0,7,0,0,0,0


In [36]:
#Print, for every NER label, how many problems have each entity count;
#a label whose only value is 0 never occurs in the data
for label in corpus.get_pipe("ner").labels:
  print(math_problems[label].value_counts())

0     1215
1      234
2       83
3       58
4       21
5       12
6        9
9        1
12       1
Name: CARDINAL, dtype: int64
0    1517
1      51
2      31
3      16
4      13
5       3
6       3
Name: DATE, dtype: int64
0    1634
Name: EVENT, dtype: int64
0    1632
1       2
Name: FAC, dtype: int64
0    1616
1      12
2       5
4       1
Name: GPE, dtype: int64
0    1634
Name: LANGUAGE, dtype: int64
0    1632
2       1
1       1
Name: LAW, dtype: int64
0    1631
1       2
5       1
Name: LOC, dtype: int64
0    1619
1      10
2       2
3       1
4       1
5       1
Name: MONEY, dtype: int64
0    1617
1      14
2       3
Name: NORP, dtype: int64
0    1545
1      45
2      23
3      17
4       3
6       1
Name: ORDINAL, dtype: int64
0    1516
1     101
2      12
3       5
Name: ORG, dtype: int64
0    1630
1       3
2       1
Name: PERCENT, dtype: int64
0     1525
1       54
2       24
4       11
3       10
6        3
5        2
9        2
7        1
14       1
8        1
Name: PERSON, 

In [37]:
# None of the problems have words mapped to the WORK_OF_ART entity, hence dropping the column.
# NOTE(review): the counts printed above also show EVENT and LANGUAGE as 0 for every
# problem, yet they are kept — confirm whether they should be dropped as well.
# errors='ignore' plus reassignment keeps this cell safe to re-run once the column is gone.
math_problems = math_problems.drop(columns=['WORK_OF_ART'], errors='ignore')

In [40]:
#Save the final feature table including the NER counts (row index omitted)
math_problems.to_csv("../data_with_ner.csv", index=False)