#### Training a Multinomial Naive Bayes (MultinomialNB) model for multi-class text classification

* There are 2 text input variables: `Assignment Name` and `School Category`
* The label (`Derived Generic Category`) is categorical text; each row has exactly one label
    
    
* This notebook saves 2 pickle files
  * The scikit-learn MultinomialNB model (`classifier_model.pkl`)
  * The vocabulary learned at training time (`vocab.pkl`), because the same vocabulary is needed at inference/prediction time
      
    
* This ML Model is being deployed as
  * Flask based web service
  * Azure ML Service based web service in ACI (Azure Container Instances)

In [16]:
import json
import pickle
import re

import numpy as np
import pandas as pd
import requests

from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the labelled data from 'train.csv' (path is relative to the working directory)
df = pd.read_csv('train.csv')
# Show the distinct target classes found in the 'Derived Generic Category' column
df['Derived Generic Category'].unique()

array(['Assignment', 'Quiz', 'Homework', 'Test', 'Extra Credit'],
      dtype=object)

In [3]:
# Non-null row count of every other column, per target class (shows class imbalance)
df.groupby('Derived Generic Category').count()

Unnamed: 0_level_0,Teacher,School,Assignment Name,School Category
Derived Generic Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Assignment,605,605,605,605
Extra Credit,37,37,37,37
Homework,327,327,327,327
Quiz,145,145,145,145
Test,102,102,102,102


In [5]:
# Ordered gensim filters applied to the raw text: lowercase, strip tags,
# strip punctuation, collapse whitespace, strip digits, drop stopwords,
# drop short tokens.
txt_filters = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short]

def process_input(row):
    """Build the model's input text for one row.

    The 'Assignment Name' and 'School Category' fields are joined with a
    single space, run through gensim's ``preprocess_string`` with
    ``txt_filters``, and the resulting tokens are re-joined with spaces.
    Tokens are NOT de-duplicated (e.g. "Button Button Project" keeps both
    "button" tokens).

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the keys 'Assignment Name' and 'School Category'.

    Returns
    -------
    str
        Space-separated, cleaned tokens; may be an empty string.
    """
    # A missing field (NaN/None, e.g. an empty CSV cell) is skipped instead of
    # raising TypeError on string concatenation.
    parts = [
        str(row[column])
        for column in ('Assignment Name', 'School Category')
        if pd.notna(row[column])
    ]
    return " ".join(preprocess_string(" ".join(parts), txt_filters))

df['processed_input'] = df.apply(process_input, axis=1)

In [6]:
# Preview the first five rows, including the new 'processed_input' column
df.head(n=5)

Unnamed: 0,Teacher,School,Assignment Name,School Category,Derived Generic Category,processed_input
0,Kristy Smethwick,Plum Senior High School,Button Button Project,Default Category,Assignment,button button project default category
1,Kristy Smethwick,Plum Senior High School,Button Button Quiz,Default Category,Quiz,button button quiz default category
2,Kristy Smethwick,Plum Senior High School,Good Man Quiz,Default Category,Quiz,good man quiz default category
3,Jessica Pilyih,Holiday Park Elementary,Stargazing Comp. Questions,Writing/Essay,Assignment,stargazing comp questions writing essay
4,Kristy Smethwick,Plum Senior High School,Story...Hour Creative Piece,Default Category,Assignment,story hour creative piece default category


In [7]:
# Encode the text categories as integer codes and store them in a new
# 'label' column (pd.factorize numbers classes in order of first appearance).
df['label'] = pd.factorize(df['Derived Generic Category'])[0]

# Hold out 10% of the rows. random_state pins the shuffle so the split, and
# therefore the saved vocabulary and model, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_input'], df['label'], test_size=0.1, random_state=42
)

# Learn the bag-of-words vocabulary from the training rows only
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Save vocabulary into a pickle file; it complements the ML model because the
# same vocabulary is needed at prediction/serving time.
# The context manager guarantees the file is flushed and closed.
with open('vocab.pkl', 'wb') as vocab_file:
    pickle.dump(count_vect.vocabulary_, vocab_file)

# ML Model Training
clf = MultinomialNB().fit(X_train_counts, y_train)

# Save ML Model into pickle file
with open('classifier_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [71]:
# Prediction on simulated input
#predicted_label = clf.predict(count_vect.transform(["religions venn diagram worksheets", "button button quiz default category"]))

In [None]:
# Vectorize every row of df (training and held-out rows alike) with the fitted
# vocabulary, then predict one integer label code per row.
all_inputs_counts = count_vect.transform(df['processed_input'])
predicted_label = clf.predict(all_inputs_counts)

In [72]:
# Overriding ML Model predicted label with rule-based decision
#
# Rule: when exactly one of the two text fields ('School Category' or
# 'Assignment Name') literally contains a label keyword, and the label the
# model predicted does NOT appear in that same field, the keyword wins.

# Position in this list is the integer label code. The order must match the
# codes produced by pd.factorize on 'Derived Generic Category'.
arr_labels = ['assignment', 'quiz', 'homework', 'test', 'extra credit']
labels_dict = {label_name: label_code for label_code, label_name in enumerate(arr_labels)}

# Compiled once instead of once per row; the labels are plain words, so the
# alternation is 'assignment|quiz|homework|test|extra credit'.
label_pattern = re.compile('|'.join(arr_labels))

# Lowercase each column once; missing values become '' so they simply match nothing.
school_categories = df['School Category'].fillna('').astype(str).str.lower()
assignment_names = df['Assignment Name'].fillna('').astype(str).str.lower()

# predicted_label is a positional array (one entry per df row, in row order),
# so rows are walked by position rather than by DataFrame index label.
for pos, (category_text, name_text) in enumerate(zip(school_categories, assignment_names)):
    category_match = label_pattern.search(category_text)
    name_match = label_pattern.search(name_text)
    predicted_name = arr_labels[predicted_label[pos]]

    if category_match and name_match is None and predicted_name not in category_text:
        # Keyword only in the school category, and the model disagrees with it
        predicted_label[pos] = labels_dict[category_match.group()]
    elif name_match and category_match is None and predicted_name not in name_text:
        # Keyword only in the assignment name, and the model disagrees with it
        predicted_label[pos] = labels_dict[name_match.group()]

'\nif label_match_school_category and label_match_assignment_name:\n    predicted_label[index] = labels_dict[label_match_school_category.group()]\nelif label_match_school_category:\n    predicted_label[index] = labels_dict[label_match_school_category.group()]\nelif label_match_assignment_name:\n    predicted_label[index] = labels_dict[label_match_assignment_name.group()]\n'

In [73]:
# Checking predictions: list the rows whose predicted label differs from the true label.
# Printed columns: row position, predicted code, processed input text, true code.
# Rows are paired by position (predicted_label is a positional array), so this
# also works when df does not have a default 0..n-1 index.
for i, (predicted, text, actual) in enumerate(zip(predicted_label, df['processed_input'], df['label'])):
    if predicted != actual:
        print(i, predicted, text, actual)

48 1 religions venn diagram worksheets 0
70 1 major religion group projects group grade 0
137 2 art failure prompt essays 0
646 2 homework log default category 0
647 2 homework log default category 0
648 2 homework log default category 0
649 2 homework log default category 0
731 2 caravana reading 0
916 2 post movie prompt essays 0
976 2 reading reading 0
977 2 reading reading 0
1087 2 topic homework default category 0
1089 2 topic homework default category 0
1092 2 topic homework default category 0
1094 2 topic homework default category 0
1098 2 topic homework packet default category 0


In [62]:
# Count of correct predictions over all rows of df

# Element-wise comparison of true codes against predicted codes -> boolean array
score_truth = df['label'].values == predicted_label
# True counts as 1, so the sum is the number of matching rows
score_truth.sum()

1149

In [76]:
# Testing deployed model in Azure ML Service ACI Instance
# (json and requests are imported in the notebook's import cell at the top)

# URL for the web service; replace the <ip_addr> and <port> placeholders
# with the deployed service's address before running.
scoring_uri = 'http://<ip_addr>:<port>/score'
# If the service is authenticated, set the key
#key = '<your key>'

# Sample test data to send to Service for prediction:
# a list of records, each with the two raw text fields used by the model.
data = [{"Assignment Name": "Topic 2 Test", "School Category": "Default Category"}]

'''
data = [
  {
    "Assignment Name": "Button, Button Project",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "Button, Button Quiz",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "Good Man Quiz",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "Stargazing Comp. Questions",
    "School Category": "Writing/Essay"
  },
  {
    "Assignment Name": "Story...Hour Creative Piece",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "Story...Hour Irony Quotes",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "The Demon Lover Analysis",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "The Demon Lover Quiz",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "The Demon Lover Trial",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "The Monkey's Paw Quiz",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "The Monkey's Paw video",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "The Story of an Hour Quiz",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "The Wall Written Response",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "What the Dog Saw Nearpod",
    "School Category": "Assignments/Homework"
  },
  {
    "Assignment Name": "1 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "1 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "1-4 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-4 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-4 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-4 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-5 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-5 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-6 The Coordinate Plane",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-6 The Coordinate Plane",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-7 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1-7 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "1A Verbs Matching WS 1-45",
    "School Category": "Daily Assignments"
  },
  {
    "Assignment Name": "1A Verbs Matching WS 1-45",
    "School Category": "Daily Assignments"
  },
  {
    "Assignment Name": "1A Verbs Matching WS 1-45",
    "School Category": "Daily Assignments"
  },
  {
    "Assignment Name": "1st Amendment 101 Rd. Comp.",
    "School Category": "Assignment"
  },
  {
    "Assignment Name": "1st Volunteer Summary / Essay",
    "School Category": "Indvidual Project"
  },
  {
    "Assignment Name": "1st Volunteer Summary / Essay",
    "School Category": "Indvidual Project"
  },
  {
    "Assignment Name": "2 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "2 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "22 3 Study Guide Chart",
    "School Category": "Classroom Activities"
  },
  {
    "Assignment Name": "22 3 Study Guide Chart",
    "School Category": "Classroom Activities"
  },
  {
    "Assignment Name": "2-4 justifications",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "2-4 justifications",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "2-4 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "2-4 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "2-4 ws #2",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "2-4 ws #2",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "2-5 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "2-5 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "3 Int Business Vocab",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "3 Int Business Vocab",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "3 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "3 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "3 Religions Venn Diagram",
    "School Category": "Worksheets"
  },
  {
    "Assignment Name": "3-5 & 3-6 Review",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "3-5 & 3-6 Review",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "3-5 HW #1",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "3-5 HW #1",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "3-6 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "3-6 ws",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "3x-ky+2=0 ; homework quiz",
    "School Category": "Quiz"
  },
  {
    "Assignment Name": "4 episode synopsis",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "4 Hello/Goodbye Dialogues",
    "School Category": "Daily Assignments"
  },
  {
    "Assignment Name": "4 Hello/Goodbye Dialogues",
    "School Category": "Daily Assignments"
  },
  {
    "Assignment Name": "4 Hello/Goodbye Dialogues",
    "School Category": "Daily Assignments"
  },
  {
    "Assignment Name": "4 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "4 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "4-2 Proofs",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "4-2 Proofs",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "4-3 Proofs",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "4-3 Proofs",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "4-4 CPCTC Proofs #1",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "4-4 CPCTC Proofs #1",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "4-6 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "4-6 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5 Major Religion Group",
    "School Category": "Projects Group Grade"
  },
  {
    "Assignment Name": "5 Major Religion Quiz",
    "School Category": "Tests/Quizzes"
  },
  {
    "Assignment Name": "5 min/day #2",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "5 minutes/day #1",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "5 minutes/day #3",
    "School Category": "Default Category"
  },
  {
    "Assignment Name": "5 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "5 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "5 Religion Individual Project",
    "School Category": "Projects Individual Grade"
  },
  {
    "Assignment Name": "5 THEMES OF GEOGRAPHY QUIZ",
    "School Category": "Tests/Quizzes"
  },
  {
    "Assignment Name": "5 Themes Quiz Extra Credit",
    "School Category": "EXTRA CREDIT"
  },
  {
    "Assignment Name": "5-2 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-2 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-2 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-2 Practice",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-2 WS",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-2 WS",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-3 WS #1",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-3 WS #1",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-3 WS #2",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-3 WS #2",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-3 WS #3",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-3 WS #3",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-5 Inequalities delta math",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "5-5 Inequalities delta math",
    "School Category": "Homework"
  },
  {
    "Assignment Name": "6 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "6 Objective",
    "School Category": "Objective Exam"
  },
  {
    "Assignment Name": "6 Principles Pol. Cartoon",
    "School Category": "Assignment"
  },
  {
    "Assignment Name": "6 Study Guide",
    "School Category": "Classroom Activities"
  },
  {
    "Assignment Name": "6 Study Guide",
    "School Category": "Classroom Activities"
  }
]
'''

# Convert to JSON string
input_data = json.dumps(data)

# Set the content type
headers = {'Content-Type': 'application/json'}
# If authentication is enabled, set the authorization header
#headers['Authorization']=f'Bearer {key}'

# Make the request and display the response.
# requests has no default timeout, so one is set explicitly: an unreachable
# service raises after 30 seconds instead of hanging the cell indefinitely.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=30)
#print(resp.text)


In [77]:
# Raw response body (bytes) returned by the scoring service.
# NOTE(review): presumably a JSON-encoded list of integer label codes, one per
# input record — confirm against the service's scoring script.

resp.content

b'"[3]"'