In [1]:
#Notebook to create ML pipeline for Java Plugin
from sklearn2pmml import PMMLPipeline, sklearn2pmml
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn import model_selection, preprocessing, neural_network, metrics, linear_model, svm
import os
from sklearn2pmml.feature_extraction.text import Splitter
import string
import random
import numpy as np
import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
filename = "Chatbot_Train.csv"
num_inputs = 5
data = pd.read_csv(filename)
data

Unnamed: 0,input_1,input_2,input_3,input_4,input_5,topic
0,What is my altitude?,How high up am I?,Altitude here,Height,Alt,altitude
1,What is the oxygen content?,Can I breath here?,Oxygen,Is there air on this planet?,Air,oxygen
2,How fast is the wind?,What is the airflow like?,What is the speed of the air on this planet?,Wind,Airflow,airflow
3,What is the atmosphere like?,Atmosphere contents,What is in the water?,Atmosphere,What is in the air here?,atmosphere
4,What is the gravity on this planet?,Gravity,What is the force pulling me down?,,How fast fo I fall on this planet?,gravity
5,Humidity,What is the humidity here?,How moist is the air on this planet?,What is the water content in the air?,How humid is it here?,humidity
6,Magnetic field,What is the magnetic force on this planet?,What is the force that keeps the planet together?,,,magnetic field
7,Pressure,What is the pressure on the planet?,What is the force exerted on my body here?,How much is being pushed on me on this planet?,Would I get crushed here?,pressure
8,Radiation,What is the amount of radiation on this planet?,How much energy do I get on this planet?,Can I get sick here?,Wave energy,radiation
9,What is the year length here?,How long is a year on this planet?,How old would I be on this planet?,Year length,How long does it take for this planet to orbit...,year


In [3]:
#method to get appropriate part of speech (need to run tests to check if works properly)
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t, get_wordnet_pos(t)) for t in word_tokenize(articles)]

In [4]:
X = []
y = []
topics = []

# Loop csv append topic
for i, j in data.iterrows():
    class_value = data.iloc[i]["topic"]
    if not pd.isna(class_value):
        for k in range(num_inputs):
            col_value = k+1
            row_input = data.iloc[i]["input_"+str(col_value)]
            if not pd.isna(row_input):
                X.append(row_input)
                y.append(class_value)
    # Add class if topic not present in list
    if class_value not in topics:
        topics.append(class_value)

print(topics)
print(X)
print(y)

['altitude', 'oxygen', 'airflow', 'atmosphere', 'gravity', 'humidity', 'magnetic field', 'pressure', 'radiation', 'year', 'planet', 'player']
['What is my altitude?', 'How high up am I?', 'Altitude here', 'Height', 'Alt', 'What is the oxygen content?', 'Can I breath here?', 'Oxygen', 'Is there air on this planet?', 'Air', 'How fast is the wind?', 'What is the airflow like?', 'What is the speed of the air on this planet?', 'Wind', 'Airflow', 'What is the atmosphere like?', 'Atmosphere contents', 'What is in the water?', 'Atmosphere', 'What is in the air here?', 'What is the gravity on this planet?', 'Gravity', 'What is the force pulling me down?', 'How fast fo I fall on this planet?', 'Humidity', 'What is the humidity here?', 'How moist is the air on this planet?', 'What is the water content in the air?', 'How humid is it here?', 'Magnetic field', 'What is the magnetic force on this planet?', 'What is the force that keeps the planet together?', 'Pressure', 'What is the pressure on the p

In [5]:
for i in range(len(X)):
    X[i] = X[i].replace("[^a-zA-Z0-9\\s]", '')
    X[i] = X[i].lower()
X

['what is my altitude?',
 'how high up am i?',
 'altitude here',
 'height',
 'alt',
 'what is the oxygen content?',
 'can i breath here?',
 'oxygen',
 'is there air on this planet?',
 'air',
 'how fast is the wind?',
 'what is the airflow like?',
 'what is the speed of the air on this planet?',
 'wind',
 'airflow',
 'what is the atmosphere like?',
 'atmosphere contents',
 'what is in the water?',
 'atmosphere',
 'what is in the air here?',
 'what is the gravity on this planet?',
 'gravity',
 'what is the force pulling me down?',
 'how fast fo i fall on this planet?',
 'humidity',
 'what is the humidity here?',
 'how moist is the air on this planet?',
 'what is the water content in the air?',
 'how humid is it here?',
 'magnetic field',
 'what is the magnetic force on this planet?',
 'what is the force that keeps the planet together?',
 'pressure',
 'what is the pressure on the planet?',
 'what is the force exerted on my body here?',
 'how much is being pushed on me on this planet?',
 '

In [6]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
y

array([ 1,  1,  1,  1,  1,  6,  6,  6,  6,  6,  0,  0,  0,  0,  0,  2,  2,
        2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  9,  9,
        9,  9,  9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,  7,  7,  7,  7,
        7,  8,  8,  8,  8,  8])

In [7]:
vectorizer = TfidfVectorizer(analyzer = "word", token_pattern = None, tokenizer = LemmaTokenizer(), norm = None)
X_vect = vectorizer.fit_transform(X)

In [8]:
model = neural_network.MLPClassifier()
model = model.fit(X_vect, y)  



In [9]:
pipeline = PMMLPipeline([ ('vectorizer',vectorizer),
        ('mlp', model) ])

In [22]:
le.inverse_transform(pipeline.predict(["What is the named?"]))

array(['player'], dtype='<U14')

In [21]:
pipeline.predict_proba(["What is the named?"])

array([[0.01498255, 0.00310188, 0.03867444, 0.0064261 , 0.01119254,
        0.00203802, 0.00608358, 0.13939853, 0.77065985, 0.00353784,
        0.00240152, 0.00150315]])

In [11]:
le.classes_

array(['airflow', 'altitude', 'atmosphere', 'gravity', 'humidity',
       'magnetic field', 'oxygen', 'planet', 'player', 'pressure',
       'radiation', 'year'], dtype='<U14')

In [20]:
len(le.classes_)

12

In [10]:
sklearn2pmml(pipeline, 'model.pmml', with_repr = True)