In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.svm import LinearSVC
from sklearn import preprocessing

In [2]:
def get_content_binary_word(path):
    """Load the pixel rows of one word from a text file.

    Every non-blank line of the file describes one character as a sequence
    of whitespace-separated integer pixel values. Lines are kept in file
    order.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Array built from the parsed rows; it is 2-D with shape
        ``(n_characters, n_pixels)`` when every line holds the same number
        of values.

    Raises
    ------
    ValueError
        If a token on a line cannot be parsed as an integer.
    """
    with open(path, "r") as f:
        # ``split()`` without an argument tolerates repeated spaces and tabs,
        # and blank lines (e.g. a trailing newline at the end of the file)
        # are skipped instead of failing on ``int('')``.
        x_word = [
            [int(pixel) for pixel in line.split()]
            for line in f
            if line.strip()
        ]
    return np.array(x_word)
# Smoke test: load one training word, then show its pixel matrix and shape.
word = get_content_binary_word('./data/train_words/text_1.txt')
print(word, word.shape, sep="\n")

# Fitted lazily on first use and then shared by every call, so the encoder is
# not rebuilt and refitted once per word.
_ALPHABET_ENCODER = None


def convert_word2label(word):
    """Encode the characters of a word as integer class labels.

    The labels come from a ``LabelEncoder`` fitted on the 26 lowercase
    letters ``a``-``z``.

    Parameters
    ----------
    word : sequence of str
        The characters to encode (callers in this notebook pass
        ``list(word)``).

    Returns
    -------
    numpy.ndarray
        Whatever ``LabelEncoder.transform`` returns for ``word``: one
        integer label per character.
    """
    global _ALPHABET_ENCODER
    if _ALPHABET_ENCODER is None:
        alphabet = "abcdefghijklmnopqrstuvwxyz"
        _ALPHABET_ENCODER = preprocessing.LabelEncoder().fit(list(alphabet))
    return _ALPHABET_ENCODER.transform(word)
    
def _to_object_array(items):
    """Pack a list of arrays into a 1-D object array, one element per item."""
    packed = np.empty(len(items), dtype=object)
    # Element-wise assignment keeps each item intact; NumPy never tries to
    # merge the items into a single multi-dimensional array.
    for position, item in enumerate(items):
        packed[position] = item
    return packed


def get_dataset(data_dir, mode="train"):
    """Load every word of one split together with its per-character labels.

    Reads ``<data_dir>/<mode>_words.txt``, in which each non-blank line is
    ``<idx> <word>``, and loads the matching pixel file
    ``<data_dir>/<mode>_words/text_<idx>.txt`` for each entry.

    Parameters
    ----------
    data_dir : str
        Directory containing the word list and the per-word pixel files.
    mode : str, default "train"
        Prefix selecting the split (this notebook uses "train" and "test").

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: two 1-D object arrays with one element per word.
        ``X[i]`` is the array returned by ``get_content_binary_word`` and
        ``y[i]`` the label array returned by ``convert_word2label``.

    Raises
    ------
    ValueError
        If a non-blank line of the word list does not split into exactly
        two whitespace-separated fields.
    """
    words_file_dir = os.path.join(data_dir, '{}_words'.format(mode))
    words_list_path = os.path.join(data_dir, '{}_words.txt'.format(mode))

    X = []
    y = []
    with open(words_list_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            idx, word = line.split()

            binary_word_path = os.path.join(words_file_dir, 'text_{}.txt'.format(idx))
            X.append(get_content_binary_word(binary_word_path))
            y.append(convert_word2label(list(word)))

    # Words have different lengths, so the per-word arrays are ragged.
    # ``np.array`` on ragged input raises ValueError on NumPy >= 1.24 and would
    # silently yield a 3-D array if all words shared a length; building the
    # object arrays explicitly always gives shape ``(n_words,)``.
    return _to_object_array(X), _to_object_array(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 [1 0 0 ... 0 0 0]]
(11, 128)


In [3]:
# Root folder that holds the word lists and the per-word pixel files.
data_dir = './data'

# Both splits go through the same loader; only the mode differs.
(X_train, y_train), (X_test, y_test) = (
    get_dataset(data_dir, mode=split) for split in ("train", "test")
)

In [4]:
# Shapes of the word-level containers, then of the first word's entries.
print(*(split_part.shape for split_part in (X_train, y_train)))
print(*(split_part[0].shape for split_part in (X_train, y_train)))
# Pixel vector of the first character of the first word.
print(X_train[0][0])

(13481,) (13481,)
(7, 128) (7,)
[0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1
 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]


In [5]:
# Fit a linear SVM on individual characters: the per-word arrays are
# flattened so that every character becomes one training sample.
svm = LinearSVC(C=.1, dual=False)
svm.fit(X=np.vstack(X_train), y=np.hstack(y_train))

LinearSVC(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [6]:
# Score the fitted model on the flattened (per-character) test split.
print(
    "Test score with linear SVM: %f"
    % svm.score(X=np.vstack(X_test), y=np.hstack(y_test))
)

Test score with linear SVM: 0.770827


In [7]:
# Shapes of the flattened training features and labels, one per line.
print(np.vstack(X_train).shape, np.hstack(y_train).shape, sep="\n")

(100622, 128)
(100622,)


In [8]:
# Save model
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone package first and only fall back to the
# vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(svm, 'Sklearn_SVM.sav')



['Sklearn_SVM.sav']

In [9]:
# Reload the persisted model.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone package first and only fall back to the
# vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load('Sklearn_SVM.sav')

In [10]:
# Score the reloaded model on the flattened (per-character) test split.
print(
    "Test score with linear SVM: %f"
    % model.score(X=np.vstack(X_test), y=np.hstack(y_test))
)

Test score with linear SVM: 0.770827
