In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from eunjeon import Mecab

sentences = ['내차 온도를 15도로 켜줘',
            '내차 에어컨 끄자',
            '내 차 깜빡이켜',
            '내차 도어 닫아']

vect = TfidfVectorizer()
X = vect.fit_transform(sentences)
X.todense()

matrix([[0.5417361 , 0.        , 0.        , 0.34578314, 0.        ,
         0.        , 0.        , 0.5417361 , 0.5417361 ],
        [0.        , 0.        , 0.64450299, 0.41137791, 0.        ,
         0.        , 0.64450299, 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.41137791, 0.64450299,
         0.64450299, 0.        , 0.        , 0.        ]])

In [28]:
Y = ['Engine Start', 'Engine Stop', 'Light On', 'Door Close']

In [31]:
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(loss='hinge')
model.fit(X, Y)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [32]:
X_pred = vect.transform(['내 차의 온도를 26도로 켜봐'])
Y_pred = model.predict(X_pred)

print(Y_pred)

['Engine Start']


In [3]:
import os
from eunjeon import Mecab

def make_data(path):
    print('--- Making data')

    # Get corpus file list
    corpuslist_abs = os.listdir(path)

    # We need Morpheme analyzer
    # We will use mecab
    mecab = Mecab()
    
    X = []
    y =[]

    # make model corpus by corpus
    for cabs in corpuslist_abs:
        # make corpusname
        # This corpusname will be reference of model in defaultdict
        filename = os.path.basename(cabs)
        corpusname = os.path.splitext(filename)[0]

        # Get corpus
        ########## corpus frame ############
        # sentence1
        # sentence2
        # ...
        ####################################
        with open(path+'/' + cabs, 'r', encoding='utf-8') as f:
            raw = f.readlines()


        for sent in raw:
            X.append(sent)
            y.append(corpusname)
    
    return X, y

In [4]:
X, y = make_data('C:/MyProject/hmcLM/corpus')
print(len(X), len(y))

--- Making data
9159 9159


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
X_train_counts.shape

(9159, 716)

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(9159, 716)

In [7]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(9159, 716)

In [8]:
# Training
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y)

In [9]:
docs_new = ['시동 걸어', '내 차 시동 꺼줄래', "문 좀 닫아줘"]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, category))

'시동 걸어' => Control_Engine_Start_noTemp
'내 차 시동 꺼줄래' => Control_Engine_Stop
'문 좀 닫아줘' => Control_Door_Close


In [10]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB())])

In [12]:
int(9000*0.12)

1080

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from eunjeon import Mecab

mecab = Mecab()

count_vect = CountVectorizer(preprocessor=mecab.morphs)