In [10]:
# Importing the required libraries
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import re
import numpy as np
from numpy.linalg import norm

# Demonstration on a test case:
Initializing the instructor's template and student's answer:

In [52]:
# Reference (instructor) answer used by the demonstration cells
ideal = "The Mughal Empire was founded by Babur, a Central Asian ruler, who reigned from 1526–1530. He descended from the Turco-Mongol conqueror Timur on his father's side, and from Genghis Khan on his mother's side."
# Student answer that is graded against the reference
attempt = "The Mughal Empire was founded by Babur, an Asian ruler, who reigned till 1530. He descended from Timur on his father's side, and from Genghis Khan on his mother's side."
# Show both texts so they can be compared by eye
print(ideal)
print(attempt)

The Mughal Empire was founded by Babur, a Central Asian ruler, who reigned from 1526–1530. He descended from the Turco-Mongol conqueror Timur on his father's side, and from Genghis Khan on his mother's side.
The Mughal Empire was founded by Babur, an Asian ruler, who reigned till 1530. He descended from Timur on his father's side, and from Genghis Khan on his mother's side.


The instructor's template string is used here to demonstrate the processing techniques that are implemented in our model.

Using Regex to first make a list of numbers in the best answer

In [11]:
# Identify and list every numeral in the reference answer: each maximal run
# of digits becomes one entry. The raw string keeps "\d" from being parsed
# as an (invalid) string escape sequence.
numbers = re.findall(r"\d+", ideal)
print(numbers)

['1526', '1530']


### Tokenization

Sentence Tokenization

In [12]:
# Separate the reference answer into its individual sentences
sentences = nltk.sent_tokenize(ideal)
for sentence in sentences:
    # Each sentence is followed by one blank line
    print(sentence, end="\n\n")

The Mughal Empire was founded by Babur, a Central Asian ruler, who reigned from 1526–1530.

He descended from the Turco-Mongol conqueror Timur on his father's side, and from Genghis Khan on his mother's side.



Word Tokenizing using Regex

In [13]:
# Tokenize each sentence: every non-letter character becomes a space, then
# the text is split on whitespace.
words_per_sentence = [
    re.sub("[^a-zA-Z]", " ", sentence).split() for sentence in sentences
]

# Show the tokens of each sentence, separated by a blank line
for words in words_per_sentence:
    print(words, end="\n\n")

# Flatten the per-sentence token lists into one list for the later steps
word_list = [word for words in words_per_sentence for word in words]

['The', 'Mughal', 'Empire', 'was', 'founded', 'by', 'Babur', 'a', 'Central', 'Asian', 'ruler', 'who', 'reigned', 'from']

['He', 'descended', 'from', 'the', 'Turco', 'Mongol', 'conqueror', 'Timur', 'on', 'his', 'father', 's', 'side', 'and', 'from', 'Genghis', 'Khan', 'on', 'his', 'mother', 's', 'side']



In [14]:
# nltk.download('stopwords') # Uncomment and run if 'stopwords' is not downloaded previously

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sriram\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [47]:
# Stop words Removal

# Build the stop-word set once; evaluating stopwords.words() inside the
# comprehension condition would rebuild the set for every token.
english_stopwords = set(stopwords.words('english'))

# The membership test is case-sensitive, so capitalized tokens such as 'The'
# are kept at this stage.
word_list = [word for word in word_list if word not in english_stopwords]
print(word_list)

['The', 'Mughal', 'Empire', 'founded', 'Babur', 'Central', 'Asian', 'ruler', 'reigned', 'He', 'descended', 'Turco', 'Mongol', 'conqueror', 'Timur', 'father', 'side', 'Genghis', 'Khan', 'mother', 'side']


### Stemming and Lemmatization

Lemmatization is usually more accurate and effective than stemming, so only lemmatization is applied in the cells below.

In [16]:
# nltk.download('wordnet') # Uncomment and run if 'wordnet' is not downloaded previously

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sriram\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [17]:
# Lemmatization

lemmatizer = WordNetLemmatizer()

# Reduce every remaining token to its lemma, treating each token as a verb
lemm_words = list(
    map(lambda token: lemmatizer.lemmatize(token, pos=wordnet.VERB), word_list)
)
print(lemm_words)

['The', 'Mughal', 'Empire', 'found', 'Babur', 'Central', 'Asian', 'ruler', 'reign', 'He', 'descend', 'Turco', 'Mongol', 'conqueror', 'Timur', 'father', 'side', 'Genghis', 'Khan', 'mother', 'side']


Segregating proper nouns using **part of speech tagging**

In [19]:
# nltk.download('averaged_perceptron_tagger') # Uncomment and run if 'averaged_perceptron_tagger' is not downloaded previously

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sriram\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [20]:
# Demonstration on identifying proper nouns in the string

# Tag every lemmatized token with its part of speech
tagged = nltk.pos_tag(lemm_words)

# Keep only the tokens tagged 'NNP' (proper noun)
ppn_list = [token for token, pos in tagged if pos == 'NNP']
print(ppn_list)

['Mughal', 'Empire', 'Babur', 'Central', 'Asian', 'Turco', 'Mongol', 'Timur', 'Genghis', 'Khan']


Hence, we have made a list of the following attributes of the ideal answer:
1. Numbers
2. Proper Nouns
3. Common nouns and other words

In [21]:
# Demonstration on identifying proper nouns, common nouns and numericals in the string

# Build the stop-word set once instead of once per word
english_stopwords = set(stopwords.words('english'))

# De-duplicate the proper nouns; sorting makes the printed order the same on
# every run (plain set iteration order is not stable between kernel sessions).
ppn_list = sorted(set(ppn_list))

# Everything that is not a proper noun is lower-cased first, so that
# capitalized stop words (e.g. 'The') are caught by the stop-word filter.
cmn_list = sorted(
    word
    for word in (token.lower() for token in set(lemm_words) - set(ppn_list))
    if word not in english_stopwords
)

# De-duplicate the numbers found by the regex step
num_list = sorted(set(numbers))

print("List of numbers: ", num_list)
print("List of Proper Nouns: ", ppn_list)
print("List of Common Nouns and other words: ", cmn_list)

List of numbers:  ['1530', '1526']
List of Proper Nouns:  ['Mughal', 'Central', 'Genghis', 'Turco', 'Khan', 'Timur', 'Asian', 'Babur', 'Empire', 'Mongol']
List of Common Nouns and other words:  ['ruler', 'side', 'father', 'reign', 'mother', 'conqueror', 'found', 'descend']


# Defining a single function representing all transformations

In [59]:
def answer_attributes_ideal(answer):
    """Split an answer into its numbers, proper nouns and remaining words.

    Parameters
    ----------
    answer : str
        Free-text answer to analyse.

    Returns
    -------
    tuple of (list, list, list)
        ``num_list`` - the distinct digit runs found in ``answer`` (strings),
        ``ppn_list`` - the distinct words tagged ``'NNP'`` by ``nltk.pos_tag``,
        ``cmn_list`` - the remaining lemmatized words, lower-cased, with
        English stop words removed.
    """
    # Every maximal run of digits counts as one number. The raw string keeps
    # "\d" from being parsed as an (invalid) string escape sequence.
    num_list = list(set(re.findall(r"\d+", answer)))

    # Build the stop-word set once instead of once per token.
    stop_words = set(stopwords.words('english'))

    # Sentence tokenization, then word tokenization with regex.
    word_list = []
    for sentence in nltk.sent_tokenize(answer):
        # Tokenize the current sentence, not the whole answer; otherwise the
        # full answer would be appended once per sentence.
        words = re.sub("[^a-zA-Z]", " ", sentence).split()
        # Case-sensitive filter: capitalized stop words such as 'The' pass
        # through here and are removed after lower-casing further down.
        word_list.extend(word for word in words if word not in stop_words)

    # Lemmatizing (every token is treated as a verb)
    lemmatizer = WordNetLemmatizer()
    lemm_words = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in word_list]

    # Segregating proper nouns: keep the distinct tokens tagged 'NNP'
    ppn_list = list({word for word, tag in nltk.pos_tag(lemm_words) if tag == 'NNP'})

    # Segregating common nouns and other words: whatever is not a proper
    # noun, lower-cased and filtered against the stop words a second time.
    lowered = [word.lower() for word in set(lemm_words) - set(ppn_list)]
    cmn_list = [word for word in lowered if word not in stop_words]

    return num_list, ppn_list, cmn_list

In [60]:
def answer_attributes_attempt(correct_cmnlist, answer):
    """Extract the attributes of a student's answer and align its vocabulary.

    The answer goes through the same extraction as a reference answer (see
    ``answer_attributes_ideal``). Afterwards every common word that WordNet
    lists as a lemma of some synset of a reference word is replaced by that
    reference word, so that synonyms count as matches when the word lists
    are compared.

    Parameters
    ----------
    correct_cmnlist : list of str
        Common words extracted from the reference answer.
    answer : str
        The student's free-text answer.

    Returns
    -------
    tuple of (list, list, list)
        ``num_list``, ``ppn_list`` and the synonym-normalized ``cmn_list``.
    """
    # Numbers, proper nouns and common words are extracted exactly as for a
    # reference answer, so reuse that function instead of repeating its body.
    num_list, ppn_list, cmn_list = answer_attributes_ideal(answer)

    # Synonym matching for common nouns using synsets
    for word in correct_cmnlist:
        # All lemma names over all synsets of the reference word
        synonyms = {
            lemma.name()
            for syn in wordnet.synsets(word)
            for lemma in syn.lemmas()
        }
        # Replace each attempt word that is one of those lemma names
        cmn_list = [word if candidate in synonyms else candidate for candidate in cmn_list]

    return num_list, ppn_list, cmn_list

## Vectorizing and grading an Answer

In [61]:
def vectoriser(ideal, attempt):
    """Turn two word lists into aligned count vectors.

    The vocabulary is the union of the words in both lists. Each returned
    list holds, for every vocabulary word, how often that word occurs in the
    corresponding input; both lists use the same word order.

    Returns
    -------
    tuple of (list of int, list of int)
        Count vector for ``ideal`` and count vector for ``attempt``.
    """
    vocabulary = list(set(ideal) | set(attempt))

    def tally(tokens):
        # Start every vocabulary word at zero so both vectors line up
        counts = dict.fromkeys(vocabulary, 0)
        for token in tokens:
            counts[token] += 1
        return list(counts.values())

    return tally(ideal), tally(attempt)

def scoreit(ideal_vec, attempt_vec):
    """Return the cosine similarity of two count vectors.

    When the dot product is zero it is returned as-is, which avoids dividing
    by a zero norm.
    """
    overlap = np.dot(ideal_vec, attempt_vec)
    if overlap == 0:
        return overlap
    return overlap / (norm(ideal_vec) * norm(attempt_vec))

# The Grader function
This function makes use of all the functions above and prints the similarity score

In [66]:
def grader(ideal, attempt, weightage = (0.5, 0.3, 0.2)):
    """Grade a student's answer against the reference answer.

    Both texts are split into numbers, proper nouns and common words; each
    category of the attempt is compared with the same category of the
    reference via cosine similarity, and the three similarities are combined
    as a weighted sum.

    Parameters
    ----------
    ideal : str
        The instructor's reference answer.
    attempt : str
        The student's answer.
    weightage : sequence of 3 floats, optional
        Weights applied, in this order, to the proper-noun score
        (``weightage[0]``), the number score (``weightage[1]``) and the
        common-word score (``weightage[2]``).

    Returns
    -------
    float
        The final score out of 100. The same value is also printed.

    Notes
    -----
    NOTE(review): a category that is empty in either answer scores 0, so
    its weight is lost even when the reference has nothing to match in that
    category - confirm this is intended.
    """
    # Extract the attributes of each answer once; every extraction runs the
    # tokenizer, lemmatizer and POS tagger.
    ideal_nums, ideal_ppns, ideal_cmns = answer_attributes_ideal(ideal)
    # The attempt's common words are synonym-matched to the reference's.
    attempt_nums, attempt_ppns, attempt_cmns = answer_attributes_attempt(ideal_cmns, attempt)

    ppn_vec_ideal, ppn_vec_attempt = vectoriser(ideal_ppns, attempt_ppns)
    ppn_score = scoreit(ppn_vec_ideal, ppn_vec_attempt)

    num_vec_ideal, num_vec_attempt = vectoriser(ideal_nums, attempt_nums)
    num_score = scoreit(num_vec_ideal, num_vec_attempt)

    cmn_vec_ideal, cmn_vec_attempt = vectoriser(ideal_cmns, attempt_cmns)
    cmn_score = scoreit(cmn_vec_ideal, cmn_vec_attempt)

    Final = weightage[0] * ppn_score + weightage[1] * num_score + weightage[2] * cmn_score
    print("Final Score (out of 100) = ", Final * 100)
    # Return the score as well, so callers can use it programmatically.
    return Final * 100

In [67]:
# Grade the demonstration attempt against the instructor's reference answer
grader(ideal,attempt)

Final Score (out of 100) =  78.69868060479874


# Test cases
These are the results on 5 manually generated test cases

In [68]:
# Test case 1: reference answer (ideal1) and student attempt (attempt1)
ideal1 = "KGF is one of the best movies if not the best movie ever made in the kannada film history. It is a recent film packed with lots of adventures and emotions. It has collected a lump sum of 1200 crores box office collection worldwide and stands in top 5 of India's highest grossed movies. Lead roles of the film were played by Yash,Srinidhi and Sanjay."
attempt1 = "KGF is a recent released movie which can be considered one of the best movies if not the best ever made in the kannada film history. It's packed with lots of adventures and emotions. It has collected a large sum of 1200 crores in the box office collection worldwide and it stands in the top 5 highest grossing movies of Indian cinema. Lead roles of the film were played by Yash,Srinidhi and Sanjay."

In [69]:
# Test case 1: show the reference and the attempt, then grade the attempt
print(ideal1)
print(attempt1)
grader(ideal1,attempt1)

KGF is one of the best movies if not the best movie ever made in the kannada film history. It is a recent film packed with lots of adventures and emotions. It has collected a lump sum of 1200 crores box office collection worldwide and stands in top 5 of India's highest grossed movies. Lead roles of the film were played by Yash,Srinidhi and Sanjay.
KGF is a recent released movie which can be considered one of the best movies if not the best ever made in the kannada film history. It's packed with lots of adventures and emotions. It has collected a large sum of 1200 crores in the box office collection worldwide and it stands in the top 5 highest grossing movies of Indian cinema. Lead roles of the film were played by Yash,Srinidhi and Sanjay.
Final Score (out of 100) =  86.44216614641168


In [70]:
# Test case 2: reference answer (ideal2) and student attempt (attempt2)
ideal2 = "IITM is said to be one of the best colleges, nevertheless there are a lot of management issues and is deemed by its students as one of the worst colleges. Its main strength is the alumni base and the student resource which over the years has given it laurels and hence the award NIRF 1"
attempt2 = "IIT madras is not a great college, everytime one has been to it he suffers from depression. The only thing that keeps one going is the placement and the packages if not no one would want to go for it. It did get nirf 1 award but it's all a scam and been manipulated by the administration. Lot of management problems and nothing is done on time."

In [71]:
# Test case 2: show the reference and the attempt, then grade the attempt
print(ideal2)
print(attempt2)
grader(ideal2,attempt2)

IITM is said to be one of the best colleges, nevertheless there are a lot of management issues and is deemed by its students as one of the worst colleges. Its main strength is the alumni base and the student resource which over the years has given it laurels and hence the award NIRF 1
IIT madras is not a great college, everytime one has been to it he suffers from depression. The only thing that keeps one going is the placement and the packages if not no one would want to go for it. It did get nirf 1 award but it's all a scam and been manipulated by the administration. Lot of management problems and nothing is done on time.
Final Score (out of 100) =  33.481553119113954


In [72]:
# Test case 3: reference answer (ideal3) and student attempt (attempt3)
ideal3 = "Life is a precious commodity, live it fully and enjoy every moment of it. As everyone says - YOLO, you only live once! There are a lot of things that can be done in life, meeting new people and sharing experiences is the best thing to bring one happiness and keeps everyone involved mentally healthy. The second part is the body - little excercise a day keeps the body healthy. Finally the soul, one’s inner self - explore yourself, you’ll feel your soul."
attempt3 = "Life is equivalent to death. One should not live it fully and not enjoy it. There are a lot of things which if done in life such as meeting new people will make you dead mentally. The same with one’s body, excerise is never healthy if done daily. Soul is just a myth - there is nothgin called an inner self, so do no explore yourself. You can never feel your soul as it does not exist."

In [73]:
# Test case 3: show the reference and the attempt, then grade the attempt
print(ideal3)
print(attempt3)
grader(ideal3,attempt3)

Life is a precious commodity, live it fully and enjoy every moment of it. As everyone says - YOLO, you only live once! There are a lot of things that can be done in life, meeting new people and sharing experiences is the best thing to bring one happiness and keeps everyone involved mentally healthy. The second part is the body - little excercise a day keeps the body healthy. Finally the soul, one’s inner self - explore yourself, you’ll feel your soul.
Life is equivalent to death. One should not live it fully and not enjoy it. There are a lot of things which if done in life such as meeting new people will make you dead mentally. The same with one’s body, excerise is never healthy if done daily. Soul is just a myth - there is nothgin called an inner self, so do no explore yourself. You can never feel your soul as it does not exist.
Final Score (out of 100) =  31.080029464446447


In [74]:
# Test case 4: reference answer (ideal4) and student attempt (attempt4)
ideal4 = "Evaporation happens when a liquid turns into a gas. It can be easily visualized when rain puddles “disappear” on a hot day or when wet clothes dry in the sun. In these examples, the liquid water is not actually vanishing—it is evaporating into a gas, called water vapor. Evaporation happens on a global scale."
attempt4 = "Evaporation is a process where liquids change to a gas or vapor. Water changes to a vapor or steam from the energy created when molecules bounce into one another because they're heated up. Sweat drying from our body is a great example of evaporation."

In [75]:
# Test case 4: show the reference and the attempt, then grade the attempt
print(ideal4)
print(attempt4)
grader(ideal4,attempt4)

Evaporation happens when a liquid turns into a gas. It can be easily visualized when rain puddles “disappear” on a hot day or when wet clothes dry in the sun. In these examples, the liquid water is not actually vanishing—it is evaporating into a gas, called water vapor. Evaporation happens on a global scale.
Evaporation is a process where liquids change to a gas or vapor. Water changes to a vapor or steam from the energy created when molecules bounce into one another because they're heated up. Sweat drying from our body is a great example of evaporation.
Final Score (out of 100) =  33.43186810535768


In [77]:
# Test case 5: reference answer (ideal5) and student attempt (attempt5)
ideal5 = "The assassination of Austrian Archduke Franz Ferdinand (June 28, 1914) was the main catalyst for the start of the Great War (World War I)"
attempt5 = "The main catalyst of world war I is killing of Archduke Franz Ferdinand on June 28, 1914."

In [78]:
# Test case 5: show the reference and the attempt, then grade the attempt
print(ideal5)
print(attempt5)
grader(ideal5,attempt5)

The assassination of Austrian Archduke Franz Ferdinand (June 28, 1914) was the main catalyst for the start of the Great War (World War I)
The main catalyst of world war I is killing of Archduke Franz Ferdinand on June 28, 1914.
Final Score (out of 100) =  74.29961096932652
