# Training

In [2]:
import re
import pandas as pd
from sklearn.utils import shuffle

In [3]:
Emails = pd.read_csv('./../Data/Final.csv')
Commands = pd.read_csv('./../Data/Commands.csv')

In [4]:
def removeWords(sentence):
    sentence = sentence.replace('hey','').replace('alaska', '')
    sentence = re.sub(r' {2,}', ' ',sentence)
    return sentence

In [5]:
Emails['Class'] = pd.Series(['Emails']*len(Emails))
Emails = Emails[['context', 'Class']].rename(columns={'context': 'Context'})
Emails['Context'] = Emails['Context'].str.lower()
Emails = Emails.drop_duplicates(subset=['Context']).reset_index(drop=True)
Emails = Emails[:30]
Emails['Context'] = Emails['Context'].apply(removeWords) 

In [6]:
Commands = Commands.rename(columns={'Commands': 'Context'})
Commands['Class'] = 'Commands'
Commands['Context'] = Commands['Context'].apply(removeWords)

In [7]:
frames = [Emails, Commands]
All = pd.concat(frames)
All = All.reset_index(drop=True)

In [8]:
df = shuffle(All)
df.reset_index(drop=True, inplace =True)
df

Unnamed: 0,Context,Class
0,send,Commands
1,", i want to send email to psloan@hotmail.com ...",Emails
2,", i want to send email to gravesdenise@gmail....",Emails
3,please send an email to tdyer@gmail.com and f...,Emails
4,please send an email for me with subject ad h...,Emails
5,add x123@hotmail.com and y@yahoo.com to the l...,Commands
6,write an email with subject this morning s co...,Emails
7,open attachment files,Commands
8,please write an email with topic your approva...,Emails
9,send email,Commands


In [9]:
df['Class_id'] = df['Class'].factorize()[0]

In [10]:
class_id_df = df[['Class', 'Class_id']].drop_duplicates().sort_values('Class_id')
class_id_df =  dict(class_id_df.values)
class_id_df

{'Commands': 0, 'Emails': 1}

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=1, norm='l2', encoding='utf-8', ngram_range=(1, 2), stop_words='english')

In [13]:
features = tfidf.fit_transform(df['Context'].to_numpy())
labels = df.Class_id

features.shape, labels.shape

((54, 1241), (54,))

In [14]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for Product, category_id in sorted(class_id_df.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names_out())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Product))
    print("  - Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  - Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'Commands':
  - Most correlated unigrams:
. screen
. add
  - Most correlated bigrams:
. send email
. add signature
# 'Emails':
  - Most correlated unigrams:
. screen
. add
  - Most correlated bigrams:
. send email
. add signature


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(df['Context'], df['Class'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
cv_df

0
1
2
3
4
5
6
7
8
9
10


In [17]:
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,1.0
1,RandomForestClassifier,1,1.0
2,RandomForestClassifier,2,1.0
3,RandomForestClassifier,3,1.0
4,RandomForestClassifier,4,1.0
5,LinearSVC,0,0.909091
6,LinearSVC,1,0.909091
7,LinearSVC,2,0.818182
8,LinearSVC,3,0.818182
9,LinearSVC,4,0.9


In [18]:
clf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0).fit(X_train_tfidf, y_train)

In [19]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [24]:
predicted = clf.predict(X_test_tfidf)
predicted

array(['Emails', 'Commands', 'Commands', 'Commands', 'Emails', 'Emails',
       'Commands', 'Emails', 'Commands', 'Emails', 'Emails', 'Emails',
       'Emails', 'Emails'], dtype=object)

In [25]:
y_test = y_test.to_numpy()
y_test

array(['Emails', 'Commands', 'Commands', 'Commands', 'Emails', 'Emails',
       'Commands', 'Emails', 'Commands', 'Emails', 'Emails', 'Emails',
       'Emails', 'Emails'], dtype=object)

In [28]:
N = predicted.shape[0]
accuracy = (predicted == y_test).sum() / N
str(accuracy*100) + '%'

'100.0%'

In [34]:
import pickle
filename = 'model_congif.pkl'
with open(filename, 'wb') as model_congif:
    pickle.dump((count_vect, tfidf_transformer, clf), model_congif)

# Inference

In [35]:
import pickle
with open('model_congif.pkl', 'rb') as f:
    count_vect, tfidf_transformer, clf = pickle.load(f)

In [37]:
def getPrediction(s):
    sentence =[s]
    cv_output = count_vect.transform(sentence)
    tfidf_output = tfidf_transformer.transform(cv_output)
    return clf.predict(tfidf_output)[0]

In [38]:
getPrediction('please give me a new screen')

'Commands'