In [5]:
from nltk import bigrams, trigrams
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from eunjeon import Mecab
from sklearn import metrics
import os
import random
import pickle
import numpy as np

In [2]:
def make_data(path, testprob, seed=None):
    """Load a labelled sentence corpus from ``path`` and split it into train/test.

    Every regular file directly inside ``path`` is read as UTF-8 with one
    sentence per line. The file name without its extension becomes the label
    of each sentence in that file. Surrounding whitespace (including the line
    terminator) is stripped and empty lines are ignored.

    Args:
        path: Directory holding the corpus files.
        testprob: Fraction of the shuffled sentences placed in the test split.
        seed: Optional seed for the shuffle. ``None`` (the default) shuffles
            with the global ``random`` state, so existing callers are
            unaffected; pass an int for a reproducible split.

    Returns:
        ``(train_X, train_y, test_X, test_y)``: lists of sentences and of the
        corresponding labels.
    """
    print('--- Making data')

    # (sentence, label) pairs collected over all corpus files.
    template = []

    # os.listdir order is arbitrary; sorting makes a seeded shuffle reproducible.
    for entry in sorted(os.listdir(path)):
        corpus_path = os.path.join(path, entry)

        # Skip sub-directories such as .ipynb_checkpoints, which open() cannot read.
        if not os.path.isfile(corpus_path):
            continue

        corpusname = os.path.splitext(entry)[0]

        ########## corpus frame ############
        # sentence1
        # sentence2
        # ...
        ####################################
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                sent = line.strip()
                # Blank lines carry no sentence and must not become samples.
                if sent:
                    template.append((sent, corpusname))

    if seed is None:
        random.shuffle(template)
    else:
        # A private generator leaves the global random state untouched.
        random.Random(seed).shuffle(template)

    X = [sent for sent, _ in template]
    y = [label for _, label in template]

    # The first (1 - testprob) share of the shuffled data is the training split.
    idx = int(len(X) - (len(X)*testprob))
    train_X, train_y, test_X, test_y = X[:idx], y[:idx], X[idx:], y[idx:]

    print("--- Making data Done")
    print('--- Data information')
    print('%% The number of sentences of train:', len(train_X))
    print('%% The number of intentions:', len(set(train_y)))

    return train_X, train_y, test_X, test_y

In [9]:
def train_1():
    """Train and evaluate the baseline intent classifier.

    Sentences from the ``corpus`` directory are vectorised with a default
    ``CountVectorizer``, weighted with a default ``TfidfTransformer`` and fed
    to an ``SGDClassifier`` with default settings. 10% of the data is held out;
    the function prints the feature count, the held-out accuracy, up to three
    example predictions and the per-class classification report.

    Returns:
        None. All results are printed.
    """
    train_X, train_y, test_X, test_y = make_data('corpus', testprob=0.1)

    # Bag-of-words counts; the vocabulary is learned from the training split only.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(train_X)
    print("The number of features: {}".format(X_train_counts.shape[1]))

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # SVM
    clf_svm = SGDClassifier().fit(X_train_tfidf, train_y)

    # Evaluation: the test split goes through the already-fitted transformers.
    X_test_counts = count_vect.transform(test_X)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    print()
    predicted = clf_svm.predict(X_test_tfidf)
    print("SVM: ", np.mean(predicted == test_y))

    print()
    print("Examples: ")
    print("Input\t   Predicted\t  Correct")
    # Show at most three examples; a small test split must not raise IndexError.
    for i in range(min(3, len(test_X))):
        # Drop a trailing line terminator so each example stays on one line.
        print("%s\t=> %s\t: %s" % (test_X[i].rstrip('\r\n'), predicted[i], test_y[i]))

    print(metrics.classification_report(test_y, predicted))

In [10]:
# Run the baseline experiment: default CountVectorizer tokenisation + TF-IDF + SGDClassifier.
train_1()

--- Making data
--- Making data Done
--- Data information
%% The number of sentences of train: 9006
%% The number of intentions: 65
The number of features: 1418





SVM:  0.965034965034965

Examples: 
Input	   Predicted	  Correct
내차 히터 17도로 맞쳐줘요
	=> Control_Engine_Start_Temp	: Control_Engine_Start_Temp
내차 시동꺼줘라
	=> Control_Engine_Stop	: Control_Engine_Stop
내차 공조를 최소로 설정해줘
	=> Control_Engine_Start_Min	: Control_Engine_Start_Min
                             precision    recall  f1-score   support

       Control_Charge_Start       1.00      1.00      1.00        12
        Control_Charge_Stop       1.00      1.00      1.00        16
         Control_Door_Close       1.00      1.00      1.00        13
          Control_Door_Open       1.00      1.00      1.00         2
  Control_Engine_Start_Cool       1.00      1.00      1.00         2
   Control_Engine_Start_Max       1.00      1.00      1.00        15
   Control_Engine_Start_Min       1.00      1.00      1.00        31
  Control_Engine_Start_Temp       0.99      1.00      1.00       556
  Control_Engine_Start_Warm       1.00      1.00      1.00         2
Control_Engine_Start_noTemp       0.94    

  'precision', 'predicted', average, warn_for)


In [11]:
def train_2():
    """Train and evaluate the intent classifier on Mecab morpheme n-grams.

    Sentences from the ``corpus`` directory are tokenised with ``Mecab.morphs``
    and counted as 1- to 3-grams restricted to the vocabulary stored in
    ``vocab.pickle``. The counts are L2-normalised (no IDF weighting) and fed
    to an ``SGDClassifier`` with default settings. 10% of the data is held out;
    the function prints the vocabulary size, the feature count, the held-out
    accuracy, up to three example predictions and the per-class report.

    Returns:
        None. All results are printed.
    """
    # tokenizer
    mecab = Mecab()

    train_X, train_y, test_X, test_y = make_data('corpus', testprob=0.1)

    print('--- Get vocabulary')
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # vocab.pickle that was produced by a trusted source.
    with open('vocab.pickle', 'rb') as f:
        vocab = pickle.load(f)
    print('--- Load vocabulary successfully')
    print('%% Vacabulary size:', len(vocab))

    # max_features is not passed: CountVectorizer ignores it whenever an
    # explicit vocabulary is supplied, so the feature count equals len(vocab).
    count_vect = CountVectorizer(
        tokenizer=mecab.morphs,
        ngram_range=(1, 3),
        vocabulary=vocab
    )

    # With a fixed vocabulary nothing has to be learned, so transform() suffices.
    X_train_counts = count_vect.transform(train_X)
    print("The number of features: {}".format(X_train_counts.shape[1]))

    tfidf_transformer = TfidfTransformer(
        use_idf=False,
        smooth_idf=False,
        norm='l2'
    )
    # fit_transform instead of a bare transform: with use_idf=False the result
    # is the same, and it does not rely on transform() accepting an unfitted
    # transformer (NOTE(review): some scikit-learn releases may reject that).
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # SVM
    clf_svm = SGDClassifier().fit(X_train_tfidf, train_y)

    # Evaluation: the test split goes through the same transformers.
    X_test_counts = count_vect.transform(test_X)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    print()
    predicted = clf_svm.predict(X_test_tfidf)
    print("SVM: ", np.mean(predicted == test_y))

    print()
    print("Examples: ")
    print("Input\t   Predicted\t  Correct")
    # Show at most three examples; a small test split must not raise IndexError.
    for i in range(min(3, len(test_X))):
        # Drop a trailing line terminator so each example stays on one line.
        print("%s\t=> %s\t: %s" % (test_X[i].rstrip('\r\n'), predicted[i], test_y[i]))

    print(metrics.classification_report(test_y, predicted))

In [12]:
# Run the second experiment: Mecab morpheme 1-3-grams over the fixed vocabulary in vocab.pickle.
train_2()

--- Making data
--- Making data Done
--- Data information
%% The number of sentences of train: 9006
%% The number of intentions: 65
--- Get vocabulary
--- Load vocabulary successfully
%% Vacabulary size: 902
The number of features: 902





SVM:  0.993006993006993

Examples: 
Input	   Predicted	  Correct
온도를 31도로 조절해줘요
	=> Control_Engine_Start_Temp	: Control_Engine_Start_Temp
내 차 공조를 19도로 켜줘라
	=> Control_Engine_Start_Temp	: Control_Engine_Start_Temp
내차 공조를 29도로 조절해줘
	=> Control_Engine_Start_Temp	: Control_Engine_Start_Temp
                               precision    recall  f1-score   support

         Control_Charge_Start       1.00      1.00      1.00        25
          Control_Charge_Stop       1.00      1.00      1.00        11
           Control_Door_Close       1.00      1.00      1.00        10
            Control_Door_Open       1.00      1.00      1.00         2
    Control_Engine_Start_Cool       1.00      1.00      1.00         1
     Control_Engine_Start_Max       1.00      1.00      1.00        12
     Control_Engine_Start_Min       1.00      1.00      1.00        31
    Control_Engine_Start_Temp       1.00      1.00      1.00       568
    Control_Engine_Start_Warm       1.00      1.00      1.00         4


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
