Connecting to Google Drive

In [2]:
# Mount the user's Google Drive inside the Colab runtime at /content/drive so
# the project files stored there can be reached through the file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Navigating to project directory

In [3]:
import os
# Every relative path used later in this notebook ('.', "data.csv",
# "../data_processed.csv", ...) is resolved against this directory.
# NOTE(review): hard-coded Drive path; adjust when running from another account.
os.chdir("/content/drive/My Drive/NLP/algebra/")

Importing required libraries

In [4]:
import pandas as pd

Reading the JSON files and appending them to the input dataframe

In [5]:
# JSON problem files in the current directory.
# Only *.json entries are kept: this notebook later writes "data.csv" into the
# same directory, and without the filter a re-run would hand that CSV to
# pd.read_json. Sorting makes the row order of the resulting frame
# reproducible, since os.listdir returns entries in arbitrary order.
files = sorted(
    f for f in os.listdir('.')
    if os.path.isfile(f) and f.lower().endswith('.json')
)
len(files)

1736

In [6]:
# Read every JSON file into its own frame (the transpose turns the JSON's
# top-level keys into columns), then concatenate all of them in one call.
# Concatenating inside the loop would re-copy the accumulated frame on every
# iteration.
frames = [pd.read_json(file_name, orient='index').T for file_name in files]
# pd.concat raises on an empty list, so fall back to an empty frame.
math_problems = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Save the combined raw problems as CSV.
# NOTE(review): this path is relative to the working directory, i.e. the same
# folder the file-listing cell scans, whereas the later exports go to "../" -
# confirm whether this file should live one level up as well.
math_problems.to_csv("data.csv")

In [None]:
# Preview the first rows of the combined frame.
math_problems.head()

Unnamed: 0,problem,level,type,solution
0,A horse 24 feet from the center of a merry-go-...,Level 2,Algebra,The radius of the circular path of the horse c...
1,A math teacher requires Noelle to do one homew...,Level 5,Algebra,Noelle only has to do 1 homework assignment to...
2,"In a rectangular coordinate system, what is th...",Level 2,Algebra,We use the distance formula: \begin{align*}\n\...
3,"Below is a portion of the graph of a function,...",Level 5,Algebra,Note that the graph of $g(x)$ is identical to ...
4,"If $x^2+y^2=1$, what is the largest possible v...",Level 5,Algebra,"If $(x,y)$ lies on the circle, so does $(x,-y)..."


Dropping the solution and type columns

In [7]:
# Keep only the problem text and its difficulty level.
# The drop is non-mutating and tolerant of missing columns so the cell can be
# re-run: with inplace=True a second execution raised KeyError because the
# columns had already been removed.
math_problems = math_problems.drop(columns=['type', 'solution'], errors='ignore')
math_problems.head()

Unnamed: 0,problem,level
0,A horse 24 feet from the center of a merry-go-...,Level 2
1,A math teacher requires Noelle to do one homew...,Level 5
2,"In a rectangular coordinate system, what is th...",Level 2
3,"Below is a portion of the graph of a function,...",Level 5
4,"If $x^2+y^2=1$, what is the largest possible v...",Level 5


Counting the equations in the problem column

In [8]:
import re

# Equation delimiters, tried in this order. "$$...$$" has to be handled before
# "$...$", otherwise the inline pattern would match inside display math and
# leave stray dollar signs behind.
# The bracketed forms use a lazy quantifier so that two separate \[...\] (or
# \(...\)) equations are not merged into a single match that also swallows
# the prose between them.
_EQUATION_PATTERNS = [
  re.compile(r"\$\$([^$]+)\$\$"),
  re.compile(r"\$([^$]+)\$"),
  re.compile(r"\\\(([^$]+?)\\\)"),
  re.compile(r"\\\[([^$]+?)\\\]"),
]

def get_equations_count(expression):
  r"""Count the LaTeX equations in a problem statement and strip them out.

  In our data, the equations are enclosed inside any of
    1. $...$
    2. $$...$$
    3. \(...\)
    4. \[...\]

  Each delimiter style is matched in turn; every match is counted and
  replaced by an empty string before the next style is tried.

  Parameters
  ----------
  expression : str
      The problem text.

  Returns
  -------
  pd.Series
      'no_of_equations'  - total number of equations found,
      'modified_problem' - the text with all matched equations removed.
  """
  no_of_equations = 0
  for pattern in _EQUATION_PATTERNS:
    # subn removes the matches and reports how many there were in one pass.
    expression, removed = pattern.subn("", expression)
    no_of_equations += removed
  return pd.Series({'no_of_equations':no_of_equations, 'modified_problem':expression})

In [None]:
# Spot check on one problem that mixes $...$ and \[...\] equations.
# The literal is a raw string: "\[" and "\]" are not valid escape sequences in
# a normal string literal and trigger a warning on recent Python versions.
get_equations_count(r'Simplify the following expression in $x$: \[2x+8x^2+9-(4-2x-8x^2).\] Express your answer in the form $ax^2 +bx+c$, where $a$, $b$, and $c$ are numbers.')

[] 0
['x', 'ax^2 +bx+c', 'a', 'b', 'c'] 5
[] 5
['2x+8x^2+9-(4-2x-8x^2).'] 6


no_of_equations                                                     6
modified_problem    Simplify the following expression in :  Expres...
dtype: object

In [10]:
# Compute the equation features for every problem and attach them as columns.
equation_features = math_problems['problem'].apply(get_equations_count)
# Drop any copies left by an earlier run of this cell first; otherwise
# re-running it appends the same columns a second time.
# (apply may hand back a plain Series without .columns when there are no rows.)
stale_columns = list(getattr(equation_features, 'columns', []))
math_problems = pd.concat(
    [math_problems.drop(columns=stale_columns, errors='ignore'), equation_features],
    axis=1,
)

In [11]:
# Preview the frame with the equation count and the equation-free text added.
math_problems.head()

Unnamed: 0,problem,level,no_of_equations,modified_problem
0,A horse 24 feet from the center of a merry-go-...,Level 2,0,A horse 24 feet from the center of a merry-go-...
1,A math teacher requires Noelle to do one homew...,Level 5,4,A math teacher requires Noelle to do one homew...
2,"In a rectangular coordinate system, what is th...",Level 2,1,"In a rectangular coordinate system, what is th..."
3,"Below is a portion of the graph of a function,...",Level 5,6,"Below is a portion of the graph of a function,..."
4,"If $x^2+y^2=1$, what is the largest possible v...",Level 5,2,"If , what is the largest possible value of ?"


In [None]:
# Save the frame, including the equation features, one directory above the
# working directory.
math_problems.to_csv("../data_processed.csv")

Installing spaCy for further processing

In [12]:
# Install spaCy into the runtime.
# NOTE(review): the version is not pinned, and `!pip` runs in a subshell;
# consider `%pip install spacy==<version>` so the install targets the kernel's
# environment and the run is reproducible.
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
import spacy

corpus = spacy.load("en_core_web_sm")
def get_text_attributes(text):
  """Compute basic readability features for a piece of text.

  The text is run through the module-level spaCy pipeline `corpus`. Every
  token that is not pure whitespace is counted as a word.
  NOTE(review): punctuation tokens are not filtered out, so they presumably
  count as words too - confirm that this is intended.

  Parameters
  ----------
  text : str
      The text to analyse (here: a problem with its equations removed).

  Returns
  -------
  pd.Series
      'sentence_count'      - number of sentences found by the pipeline,
      'word_count'          - number of non-whitespace tokens,
      'words_per_sentence'  - word_count / sentence_count, rounded,
      'average_word_length' - characters per word, rounded,
      'large_words'         - number of words longer than six characters.
      Both averages are 0 when their denominator is 0.
  """
  document = corpus(text)

  #Sentences count
  sentence_count = len(list(document.sents))

  #Length of every non-whitespace token
  word_lengths = [len(token.text) for token in document if not token.is_space]

  #Word count in the text
  word_count = len(word_lengths)
  char_count = sum(word_lengths)

  #Number of words with length greater than 6
  words_len_greater_than_six = sum(1 for length in word_lengths if length > 6)

  #Average words per sentence
  #(guarded: a problem that consisted only of equations leaves no text behind,
  # which previously raised ZeroDivisionError)
  words_per_sentence = round(word_count / sentence_count) if sentence_count else 0

  #Average length of words
  average_word_length = round(char_count / word_count) if word_count else 0

  return pd.Series({'sentence_count': sentence_count, 'word_count': word_count, 'words_per_sentence': words_per_sentence, 'average_word_length': average_word_length, 'large_words': words_len_greater_than_six})

In [27]:
# Spot check on one problem whose equations have already been removed
# (the double spaces mark where they used to be).
get_text_attributes("A math teacher requires Noelle to do one homework assignment for each of the first five homework points she wants to earn; for each of the next five homework points, she needs to do two homework assignments; and so on, so that to earn the  homework point, she has to do  (rounded up) homework assignments. For example, when she has 11 points, it will take  homework assignments to earn her  point. What is the smallest number of homework assignments necessary to earn a total of 25 homework points?")

teacher
requires
homework
assignment
homework
homework
homework
assignments
homework
rounded
homework
assignments
example
homework
assignments
smallest
homework
assignments
necessary
homework


sentence_count           3
word_count             100
words_per_sentence      33
average_word_length      4
large_words             20
dtype: int64

In [24]:
# Compute the text features from the equation-free problem text and attach
# them as columns.
text_features = math_problems['modified_problem'].apply(get_text_attributes)
# Drop any copies left by an earlier run of this cell first; otherwise
# re-running it appends the same columns a second time.
# (apply may hand back a plain Series without .columns when there are no rows.)
stale_columns = list(getattr(text_features, 'columns', []))
math_problems = pd.concat(
    [math_problems.drop(columns=stale_columns, errors='ignore'), text_features],
    axis=1,
)

In [25]:
# Save the frame with both the equation features and the text features, one
# directory above the working directory.
math_problems.to_csv("../data_processed_with_base_attrs.csv")