# Imports

In [43]:
# Import the required libraries 
import os

import numpy as np
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [9]:
from sklearn import datasets

## Processing Dataset

In [10]:
# Root folder of the text files to load.
# Set the FOOD101_TEXT_DIR environment variable to point at a different copy of
# the data; when it is unset, the original machine-specific path is used.
text_dir = os.environ.get(
    'FOOD101_TEXT_DIR',
    'D:\\Tu Beo\\Education\\FoodVisor\\data\\UPMC_Food101\\texts_txt',
)

In [None]:
# Read every text file found under text_dir into memory, decoded as UTF-8.
# The shuffle uses a fixed random_state so the document order is repeatable.
all_texts = datasets.load_files(
    text_dir,
    description=None,
    categories=None,
    load_content=True,
    shuffle=True,
    encoding='utf-8',
    decode_error='strict',
    random_state=0,
)

In [15]:
# List the fields exposed by the object returned by load_files.
all_texts.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [16]:
# Peek at the first five file paths that were loaded.
all_texts.filenames[:5]

array(['D:\\Tu Beo\\Education\\FoodVisor\\data\\UPMC_Food101\\texts_test_txt\\apple_pie\\apple_pie_181.txt',
       'D:\\Tu Beo\\Education\\FoodVisor\\data\\UPMC_Food101\\texts_test_txt\\baklava\\baklava_636.txt',
       'D:\\Tu Beo\\Education\\FoodVisor\\data\\UPMC_Food101\\texts_test_txt\\apple_pie\\apple_pie_573.txt',
       'D:\\Tu Beo\\Education\\FoodVisor\\data\\UPMC_Food101\\texts_test_txt\\apple_pie\\apple_pie_493.txt',
       'D:\\Tu Beo\\Education\\FoodVisor\\data\\UPMC_Food101\\texts_test_txt\\apple_pie\\apple_pie_203.txt'],
      dtype='<U100')

In [17]:
# Up to the first five class names (fewer are shown when the dataset has fewer classes).
all_texts.target_names[:5]

['apple_pie', 'baby_back_ribs', 'baklava']

In [18]:
# Integer labels of the first five documents; each value indexes into target_names.
all_texts.target[:5]

array([0, 2, 0, 0, 0])

In [26]:
# Hold out 33% of the documents for evaluation; the fixed seed keeps the split repeatable.
text_train, text_test, y_train, y_test = train_test_split(
    all_texts.data,
    all_texts.target,
    test_size=0.33,
    random_state=42,
)

In [27]:
# Feature Engineering
# Convert the raw documents into a sparse TF-IDF matrix. With binary=True the
# term-frequency part only records presence/absence of a word in a document.
print("TF-IDF on text data ... ")
tfidf = TfidfVectorizer(binary=True)
# The vocabulary and IDF weights are learned from the training split only and
# then reused, unchanged, on the test split.
# float32 is used to reduce memory: float16 is not a dtype scipy.sparse supports
# (recent SciPy releases reject it) and it needlessly degrades TF-IDF precision.
X_train = tfidf.fit_transform(text_train).astype(np.float32)
X_test = tfidf.transform(text_test).astype(np.float32)
print("Done ! ")

TF-IDF on text data ... 


In [29]:
# (number of training documents, vocabulary size) of the TF-IDF matrix.
X_train.shape

(1765, 57452)


In [33]:
# Width of the network input = number of TF-IDF columns (vocabulary size).
# The shape is indexed rather than unpacked into `__`, because IPython reserves
# `__` for its output history and stops updating it once it is assigned.
num_words = X_train.shape[1]
# Size of the final softmax layer: one unit per class name.
num_classes = len(all_texts.target_names)

## Create Keras model

In [34]:
# Model definition
def build_model():
    """
    Build and compile the feed-forward classifier used on the TF-IDF features.

    The network is a stack of five ReLU Dense layers (256, 200, 160, 120 and
    80 units), each followed by Dropout(0.3), and a softmax output layer.
    The input width and the number of output units are read from the
    notebook-level ``num_words`` and ``num_classes`` when the function is called.

    As a side effect the layer summary is printed.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    hidden_widths = (256, 200, 160, 120, 80)

    network = Sequential()
    for depth, width in enumerate(hidden_widths):
        if depth == 0:
            # Only the first layer has to be told the size of the input.
            network.add(Dense(width, input_dim=num_words, activation='relu'))
        else:
            network.add(Dense(width, activation='relu'))
        network.add(Dropout(0.3))

    network.add(Dense(num_classes, activation='softmax'))
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    return network

Create model ... 
Done !


In [None]:
# Wrap build_model in a scikit-learn style estimator; the network itself is
# only built when fit() is called.
print("Creating model ... ")
estimator = KerasClassifier(
    build_fn=build_model,
    epochs=50,
    batch_size=128,
)

## Training

In [35]:
# Train the wrapped Keras network on the TF-IDF training features and their
# integer class labels.
print("Training ...")
estimator.fit(X_train, y_train)
print("Completed !")

Compile model ...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 256)               14707968  
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 200)               51400     
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 160)               32160     
_________________________________________________________________
dropout_8 (Dropout)          (None, 160)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 120)               193

## Prediction

In [37]:
# Predictions
# Score the trained network on the held-out split.
print("Predict on test data ... ")
y_pred = estimator.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy : ", accuracy)

Predict on test data ... 
Accuracy :  0.9471264367816092


## Most informative words

In [38]:
from sklearn.svm import LinearSVC

In [39]:
# Linear SVM trained on the same TF-IDF features; its per-class weights
# (coef_) are what print_top10 reads to rank words.
# NOTE(review): no random_state is set, so fits may differ slightly between
# runs — confirm whether that matters here.
clf = LinearSVC()
# Kept as the last expression so the fitted estimator's repr is displayed.
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [40]:
# Score the linear SVM on the held-out split.
# Note: this rebinds y_pred and accuracy from the Keras evaluation cell.
print("Predict on test data ... ")
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy : ", accuracy)

Predict on test data ... 
Accuracy :  0.967816091954023


In [45]:
def print_top10(vectorizer, clf, class_labels, n=10):
    """
    Print the ``n`` features with the highest coefficient values, per class.

    Adapted from
    https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or,
        on older scikit-learn releases, ``get_feature_names()``.
    clf : fitted linear classifier exposing a 2-D ``coef_`` array.
    class_labels : sequence of class names, in the same order as the rows of
        ``clf.coef_``.
    n : int, default 10
        How many features to list for each class.

    Notes
    -----
    One line is printed per class, as ``"<label>: <feat>, <feat>, ..."``.
    Features are listed in ascending coefficient order, so the strongest one
    comes last.

    Raises
    ------
    IndexError
        If more labels are given than ``coef_`` has rows (except for the
        two-label / one-row binary case, which is handled).
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    weights = clf.coef_
    if weights.shape[0] == 1 and len(class_labels) == 2:
        # Binary problems store a single weight vector: positive weights vote
        # for the second class, negative weights for the first one.
        weights = [-weights[0], weights[0]]

    for i, class_label in enumerate(class_labels):
        order = np.argsort(weights[i])
        # Explicit start index: a plain [-n:] would return everything for n == 0.
        top = order[max(len(order) - n, 0):]
        print("%s: %s" % (class_label,
              ", ".join(str(feature_names[j]) for j in top)))

In [46]:
# Show, for each class, the vocabulary terms the fitted LinearSVC weights most heavily.
print_top10(tfidf, clf, all_texts.target_names)

apple_pie: caramel, cinnamon, filling, peeled, pies, flour, crust, pie, apples, apple
baby_back_ribs: pepper, tender, rib, pork, barbecue, bbq, grill, back, baby, ribs
baklava: filo, syrup, pistachios, pistachio, greek, sheets, walnuts, nuts, phyllo, baklava
