In [1]:
ideal = "The Mughal Empire was founded by Babur, a Central Asian ruler, who reigned from 1526–1530. He descended from the Turco-Mongol conqueror Timur on his father's side, and from Genghis Khan on his mother's side."
print(ideal)

The Mughal Empire was founded by Babur, a Central Asian ruler, who reigned from 1526–1530. He descended from the Turco-Mongol conqueror Timur on his father's side, and from Genghis Khan on his mother's side.


In [2]:
# Importing the required libraries
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import re

Using Regex to first make a list of numbers in the best answer

In [3]:
numbers = re.sub("\D"," ",ideal).split()
print(numbers)

['1526', '1530']


### Tokenization

Sentence Tokenization

In [4]:
sentences = nltk.sent_tokenize(ideal)
for sentence in sentences:
    print(sentence)
    print()

The Mughal Empire was founded by Babur, a Central Asian ruler, who reigned from 1526–1530.

He descended from the Turco-Mongol conqueror Timur on his father's side, and from Genghis Khan on his mother's side.



Word Tokenizing using Regex

In [5]:
word_list = []
for sentence in sentences:
    words = re.sub("[^a-zA-Z]"," ",sentence).split()
    print(words)
    print()
    word_list.extend(words)

['The', 'Mughal', 'Empire', 'was', 'founded', 'by', 'Babur', 'a', 'Central', 'Asian', 'ruler', 'who', 'reigned', 'from']

['He', 'descended', 'from', 'the', 'Turco', 'Mongol', 'conqueror', 'Timur', 'on', 'his', 'father', 's', 'side', 'and', 'from', 'Genghis', 'Khan', 'on', 'his', 'mother', 's', 'side']



In [6]:
word_list = [word for word in word_list if word not in set(stopwords.words('english'))]
word_list

['The',
 'Mughal',
 'Empire',
 'founded',
 'Babur',
 'Central',
 'Asian',
 'ruler',
 'reigned',
 'He',
 'descended',
 'Turco',
 'Mongol',
 'conqueror',
 'Timur',
 'father',
 'side',
 'Genghis',
 'Khan',
 'mother',
 'side']

### Stemming and Lemmatization

We'll lemmatize cuz it's more accurate than stemming

In [7]:
lemmatizer = WordNetLemmatizer()
lemm_words = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in word_list]
print(lemm_words)

['The', 'Mughal', 'Empire', 'found', 'Babur', 'Central', 'Asian', 'ruler', 'reign', 'He', 'descend', 'Turco', 'Mongol', 'conqueror', 'Timur', 'father', 'side', 'Genghis', 'Khan', 'mother', 'side']


Segregating proper nouns using **part of speech tagging**

In [8]:
ppn_list = []
tagged = nltk.pos_tag(lemm_words)
for (word, tag) in tagged:
    if tag == 'NNP': # If the word is a proper noun
        ppn_list.append(word)
print(ppn_list)

['Mughal', 'Empire', 'Babur', 'Central', 'Asian', 'Turco', 'Mongol', 'Timur', 'Genghis', 'Khan']


Hence, we have made a list of the following attributes of ideal answer:
1. Numbers
2. Proper Nouns
3. Common nouns and other words

In [9]:
ppn_list = list(set(ppn_list))
cmn_list = [word.lower() for word in list(set(lemm_words) - set(ppn_list))]
cmn_list = [word for word in cmn_list if word not in set(stopwords.words('english'))]
num_list = list(set(numbers))

print("List of numbers: ", num_list)
print("List of Proper Nouns: ", ppn_list)
print("List of Common Nouns and other words: ", cmn_list)

List of numbers:  ['1526', '1530']
List of Proper Nouns:  ['Turco', 'Asian', 'Timur', 'Babur', 'Central', 'Genghis', 'Empire', 'Khan', 'Mongol', 'Mughal']
List of Common Nouns and other words:  ['father', 'side', 'descend', 'reign', 'conqueror', 'ruler', 'mother', 'found']


Defining a single function representing all transformations

In [10]:
def answer_attributes(answer):
    num_list = list(set(re.sub("\D"," ",answer).split()))
    
    # Sentence Tokenization
    sentences = nltk.sent_tokenize(answer)
    # Word Tokenization with Regex
    word_list = []
    for sentence in sentences:
        words = re.sub("[^a-zA-Z]"," ",answer).split()
        words = [word for word in words if word not in set(stopwords.words('english'))]
        word_list.extend(words)
    
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemm_words = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in word_list]
    
    # Segregating Proper Nouns
    ppn_list = []
    tagged = nltk.pos_tag(lemm_words)
    for (word, tag) in tagged:
        if tag == 'NNP': # If the word is a proper noun
            ppn_list.append(word)
    ppn_list = list(set(ppn_list))
    
    # Segregating Common Nouns and other words
    cmn_list = [word.lower() for word in list(set(lemm_words) - set(ppn_list))]
    cmn_list = [word for word in cmn_list if word not in set(stopwords.words('english'))]

    return num_list, ppn_list, cmn_list

## Vectorizing and grading an Answer

In [11]:
attempt = "The Mughal Empire was founded by Babur, an Asian ruler, who reigned till 1530. He descended from Timur on his father's side, and from Genghis Khan on his mother's side."

In [12]:
weightage = [0.5, 0.3, 0.2]  # Weightage split of 5:3:2 between proper_nouns : numbers : common_nouns

def vectoriser(ideal, attempt):
    ideal_vec = [1, 1, 1, 1]
    attempt_vec = [1, 0, 1, 1]
    return ideal_vec, attempt_vec

def scoreit(ideal_vec, attempt_vec):
    
    pass

ppn_vec_ideal, ppn_vec_attempt = vectoriser(answer_attributes(ideal)[1], answer_attributes(attempt)[1])
ppn_score = scoreit(ppn_vec_ideal, ppn_vec_attempt)

num_vec_ideal, num_vec_attempt = vectoriser(answer_attributes(ideal)[0], answer_attributes(attempt)[0])
num_score = scoreit(num_vec_ideal, num_vec_attempt)

cmn_vec_ideal, cmn_vec_attempt = vectoriser(answer_attributes(ideal)[2], answer_attributes(attempt)[2])
cmn_score = scoreit(cmn_vec_ideal, cmn_vec_attempt)

Final Score

In [13]:
Final = weightage[0] * ppn_score + weightage[1] * num_score + weightage[2] * cnn_score 
print("Final Score (out of 100) = ", Final * 100)

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'