in order to install NLTK stop words, run the following python commands -

import nltk
nltk.download()

# Predict team using defect title

in the following notebook, a model is built and trained using defects data from center. 
for predicting one field from defects form - team field.

## Required libraries
general imports - keras, sklearn, numpy, pandas

In [45]:
import keras 
from keras.models import Sequential
from keras.layers import Activation
from keras.optimizers import Adam
from keras import regularizers
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Dropout, Conv1D, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import model_selection
from sklearn import metrics
from keras.metrics import categorical_crossentropy
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import numpy as np
import os

# load data from dataset

In [46]:
# Reads and converts json / Excel format to python dict.
def js_r(data):
   with open(data, encoding='utf-8') as f_in:
       return(json.load(f_in))

def read_excel(path):
    return pd.read_excel(open(path, 'rb'),
                  sheet_name='Sheet2')  # doctest: +SKIP

# load JSON data
# data_path=r'C:\dev\defects.json'
# my_dic_data = js_r(data_path)['data']

# Load EXCEL data:
data_path = r'datasets' + os.sep + 'defects-all-spread.xlsx'
my_dic_data = read_excel(data_path)
print('There are: ' + str(len(my_dic_data)) + ' records of defects in dataset.')

There are: 28970 records of defects in dataset.


# Transform & normalize dataset

handling ids fields from json dataset, remove unused columns

In [47]:
def transform_data(data_list):
    print(len(data_list))
    for defect in my_dic_data:
        for column in defect.keys():
            if not defect[column] == None and type(defect[column]) is dict and 'id' in defect[column]:
                defect[column] = defect[column]['id']
        
    return pd.DataFrame(my_dic_data).drop(['access_granted', 'blocked_reason', 'path', 'phase',
                                           'blocked', 'original_id', 'priority', 'defect_root_level', 'story_points', 'user_tags', 'program', 'taxonomies', 'version_stamp', 'detected_in_build', 'sprint', 'dependency_problem_type', 'defect_type'], axis=1)
# df = transform_data(my_dic_data)

cleaning the dataset (removing defects without labels, defects with only one label)

In [48]:
# build dataframe from dict
df = pd.DataFrame(my_dic_data)
# cleaning rows without labels OR descritpion
df = df[df['team.id'].map(lambda x: x != None)]
df = df[df['name'].map(lambda x: x != None)]
train_df = df[df['description'].map(lambda x: x != None)]
train_df = train_df.groupby('team.id').filter(lambda x : len(x)>2)

# building translation map from team id -> team name
team_id_to_name_map = pd.Series(df['team.name'].values,index=df['team.id']).to_dict()
# spliting Lables from features
X = train_df.drop(['team.id'], axis=1)
Y = train_df['team.id']

print("An example of dataset structure:")
train_df.head(n=2)


An example of dataset structure:


Unnamed: 0,creation_time,id,name,description,knowledge_modified_udf,parent.name,parent.id,release.id,team.id,team.name,product_areas.id,product_areas.name,application_modules.id,application_modules.name,qa_owner.id,qa_owner.name,owner.id,owner.name
0,2017-01-11 09:05:12,205022,[Regression Day - Firefox] - Exception thrown ...,&lt;html&gt;&lt;body&gt;&lt;p&gt;1. create tes...,NaT,Requirement Backlog,1001,22001.0,3002.0,OMG Yuval (Do Not Use),78018.0,10 Tests,78018.0,10 Tests,,,9001.0,sari.bivas@microfocus.com
1,2016-12-27 10:06:39,198001,"Bug Hunt 12.53.19 - when add ""On it"" user I ge...",,NaT,Requirement Backlog,1001,22001.0,8003.0,Sharon - XMen Magneto,79021.0,Failure Analysis,79021.0,Failure Analysis,,,3035.0,abed.masrawa@hpe.com


labels encoding

In [49]:
# label envoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
encoded_labels = pd.DataFrame(le.fit_transform(train_df['team.id']))
classes_number = encoded_labels.groupby(0).nunique().shape[0]
all_labels = encoded_labels[0].unique()
print('there are: ' + str(classes_number) + ' different labels in dataset, which corresponds to all teams')

there are: 34 different labels in dataset, which corresponds to all teams


In [50]:
# ensure that the model is not overfitting 
# train & validation tests with labels, test without
xtrain, xvalid, ytrain, yvalid = train_test_split(X, encoded_labels,
                                                      stratify=encoded_labels,
                                                      random_state=42,
                                                      test_size=0.2, shuffle=True)


print ('train samples: ' + str(xtrain.shape) + ' validation samples: ' + str(xvalid.shape))

train samples: (22576, 17) validation samples: (5644, 17)


# Build Model + Classifier

In [51]:
from sklearn import metrics
# parameters:BATCH_SIZE=16, EPOC_SIZE=30
BATCH_SIZE = 50 # smaller batch size consume less memory (but can decrease accuracy)
EPOC_SIZE = 30
tokenizer_file_name = "models" + os.sep + "FTtokenizer_" + str(BATCH_SIZE) + "B_" + str(EPOC_SIZE) + "E" + ".pkl"
classifier_file_name = "models" + os.sep + "FTClassifier_" + str(BATCH_SIZE) + "B_" + str(EPOC_SIZE) + "E" + ".pkl"

class fasttext_classifier(object):
    def __init__(self):
        self.train_df = None
        self.train_X = None
        self.train_Y = None
        self.vslid_X = None
        self.valid_Y = None
        self.model = None
        self.hist = None
        self.tokenizer = None
        self.rare_train_words = []
    
    def create_model(self, input_dim, classes_number, embedding_dims=32, optimizer='adam'):
        self.labels = all_labels
        self.model = Sequential()
        
        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        self.model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims, input_length=256))
        # Dropout consists in randomly setting a fraction rate of input units to 0 at each update during training time,
        # which helps prevent overfitting.
        self.model.add(Dropout(0.3))
        #This layer creates a convolution kernel that is convolved with the layer input over a single spatial (or temporal) dimension to produce a tensor of outputs. If use_bias is True, a bias vector is created and added to the outputs. Finally, if activation is not None, it is applied to the outputs as well.
        self.model.add(Conv1D(64,
                              5,
                              padding='valid',
                              activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(MaxPooling1D())
        self.model.add(Flatten())
        self.model.add(Dense(800, activation='relu'))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(classes_number, activation='softmax'))

        self.model.compile(loss='sparse_categorical_crossentropy',
                           optimizer=optimizer,
                           metrics=['accuracy'])

        return

    def set_tokenizer(self, tokenizer):
        self.tokenizer = tokenizer

    def train(self, docstrain, ytrain, docsvalid, yvalid):
        self.train_X = docstrain
        self.train_Y = ytrain
        self.hist = self.model.fit(docstrain, ytrain,
                                   batch_size=BATCH_SIZE,
                                   validation_data=(docsvalid, yvalid),
                                   epochs=EPOC_SIZE, verbose=2,
                                   callbacks=[EarlyStopping(patience=4, monitor='val_loss')])

        predictions = self.model.predict_proba(docsvalid)
        predictions_classes = self.model.predict_classes(docsvalid)
        try:
            # print("valid shape=" + str(yvalid.shape) + "predictions shape=" + str(predictions.shape) + "labels shape=" + str(self.labels.shape))
            print("accuracy on validation set after training: %0.3f" % (
                np.sum(predictions_classes == yvalid[0]) / len(yvalid[0])))
#             print("prediction rows: " + str(len(predictions.values)))
#             print("prediction columns: " + str(len(predictions.values[0])))
#             print("fasttext logloss: %0.3f " % metrics.log_loss(yvalid[0].values, predictions.values, labels=self.labels[0].values))
        except Exception as e:
            print("Oops!! occured. could not calculate metrics for this epoc")
            print(e)
        return

    def predict(self, docstest):
        print("run prediction")
        predictions = self.model.predict_proba(docstest)
        predictions_classes = self.model.predict_classes(docstest)
        return predictions, predictions_classes

    def plot_train_vs_val(self):
        hist = self.model.history
        hist_dict = hist.history
        # plot loss
        fig = plt.figure()
        plt.subplot(211)
        val_loss = hist_dict.get('val_loss')
        val_loss_line = plt.plot(val_loss, label='val_loss')
        plt.legend()
        loss = hist_dict.get('loss')
        plt.plot(loss, label='train_loss')
        plt.legend()
        plt.title("train and validation loss")
        plt.ylabel("loss")

        # plot accuracy
        plt.subplot(212)
        val_acc = hist_dict.get('val_acc')
        plt.plot(val_acc, label='val_acc')
        plt.legend()
        acc = hist_dict.get('acc')
        plt.plot(acc, label='train_acc')
        plt.legend()
        plt.title("train and validation accuracy")
        plt.ylabel("accuracy")
        plt.xlabel("step")

        fig.savefig("fast-text-itr-performance.pdf", format='pdf')

In [52]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import model_selection
from sklearn import metrics
from nltk.corpus import stopwords
import pickle

# preproceeings are:
# Separate punctuation from words
# Remove lower frequency words ( <= 2)
# Cut a longer document which contains 256 words
def preprocess(text, stop_words):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?![]')
    prods = set(text) & signs
    
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign))
    
    if stop_words is not None:
        # Remove Stopwords
        text = ' '.join([w for w in text.split(' ') if not w in stop_words])
    return text

# execute pre process for each of the docs (remove special characters)
def create_docs(data, n_gram_max=1, tokenizer=None, train_mode=True, referance_col='name', stop_words=None):
    df = pd.DataFrame(data=data, columns=[referance_col])
    rare_train_words = []

    # create N grams + separate punctuation from words (character N grams)
    def add_ngram(q, n_gram_max):
        ngrams = []
        for n in range(2, n_gram_max + 1):
            for w_index in range(len(q) - n + 1):
                ngrams.append('--'.join(q[w_index:w_index + n]))
        return q + ngrams
    
    # pre-process text
    docs = []
    for doc in df[referance_col]:
        doc = preprocess(doc, stop_words).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
        
    # tokenization step (why do not split on spaces?)
    min_count = 2
    if tokenizer is None:
        tokenizer = Tokenizer(lower=True, filters='')
        tokenizer.fit_on_texts(docs)
        # summarize what was learned in tokenizer
#         print(tokenizer.word_counts)
#         print(tokenizer.document_count)
#         print(tokenizer.word_index)
#         print(tokenizer.word_docs)

    if train_mode:
        # remove low frequency words
        num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])
        tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
        tokenizer.fit_on_texts(docs)

    docs = tokenizer.texts_to_sequences(docs)
    maxlen = 256
    # cat long sentences and
    docs = pad_sequences(sequences=docs, maxlen=maxlen)

    if train_mode:
        return docs, tokenizer
    else:
        return docs


# get fsx featurew for trainig and validation set in a cross validation methodology
def get_fasttext_features(xtrain, ytrain, xvalid, yvalid, referance_col, classes_number, all_labels, lbl_prefix='fastext_'):
    cv_scores = []
    pred_full_test = 0
    print('len ytrain = ' + str(len(set(ytrain))) + " len classes= " + str(classes_number))
    pred_train = np.zeros([xtrain.shape[0], classes_number])
    
    fsx = fasttext_classifier()
    
    print("create docs for train step (pre process, tokenization)")
    docstrain, tokenizer = create_docs(data=xtrain[referance_col], referance_col=referance_col)
    fsx.set_tokenizer(tokenizer)
    
    print("create docs for validation step (pre process, tokenization)")
    docstest = create_docs(data=xvalid[referance_col], tokenizer=fsx.tokenizer, train_mode=False,
                           referance_col=referance_col)
    input_dim = np.max(docstrain) + 1
    fsx.create_model(input_dim, classes_number=classes_number)

    # split training set to 5 folds
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_cnt = 1
    for dev_index, val_index in kf.split(docstrain):
        print("CV fsx:" + str(cv_cnt))
        cv_cnt += 1

        dev_X, val_X = docstrain[dev_index], docstrain[val_index]
        dev_y, val_y = ytrain.iloc[dev_index], ytrain.iloc[val_index]

        fsx.train(dev_X, dev_y, val_X, val_y)
        prob_val_y, cls_val_y = fsx.predict(val_X)
        prob_test_y, cls_test_y = fsx.predict(docstest)

        pred_full_test = pred_full_test + prob_test_y
        pred_train[val_index, :] = prob_val_y
#         try:
#             cv_scores.append(metrics.log_loss(val_y, prob_val_y, labels=all_labels))
#         except Exception as e:
#             print("valid shape=" + str(val_y.shape) + "predictions shape=" + str(prob_val_y.shape) + "labels shape=" + str(all_labels.shape))
#             print("Oops! failed to append log_loss metrics in KFold")
#             print(e)
#     try:
#         print("Mean cv score : ", np.mean(cv_scores))
#     except:
#         print("error while calculating mean cv score")
        
    pred_full_test = pred_full_test / 5.

    columns = [lbl_prefix + str(i) for i in range(classes_number)]
    aa = pd.DataFrame(columns=columns, data=pred_train)
    bb = pd.DataFrame(columns=columns, data=pred_full_test)
    return aa, bb


# this methos to be used to save model created on training set, for new row currently not in DB
def obtain_fasttext_model(xtrain, ytrain, xvalid, yvalid, classes_number, referance_col='name',create_doc=True, stop_words=True):

    fsx = fasttext_classifier()
    if stop_words:
        print("using stop words")
        eng_stopwords = set(stopwords.words("english"))
    else:
        eng_stopwords = None
        
    if create_doc:
        docstrain, tokenizer = create_docs(data=xtrain[referance_col], referance_col=referance_col, stop_words=eng_stopwords)
        fsx.set_tokenizer(tokenizer)
        docstest = create_docs(data=xvalid[referance_col], tokenizer=fsx.tokenizer, train_mode=False,
                           referance_col=referance_col, stop_words=eng_stopwords)
    else:
        docstrain=xtrain
        docstest=xvalid

    input_dim = np.max(docstrain) + 1
    fsx.create_model(input_dim, classes_number=classes_number)

    fsx.train(docstrain, ytrain, docstest, yvalid)
    return fsx, tokenizer


xtrain_processed = xtrain.reset_index(drop=True)
xvalid_processed = xvalid.reset_index(drop=True)
    

# Train / Load Model

In [53]:
train = True

saver = tf.train.Saver()
sess = keras.backend.get_session()
saver.save(sess, './keras_model')

if train:
    fsx, tokenizer = obtain_fasttext_model(xtrain_processed, ytrain, xvalid_processed, yvalid, classes_number,
                                                             referance_col='name', stop_words=True)
    print("finish training FT classifier")
    # export model and tokenizer
    pickle.dump(tokenizer, open(tokenizer_file_name, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    fsx.model.save(classifier_file_name)
else:
    # load model from disk
    import pickle
    from keras.models import load_model
    import pandas as pd
    loaded_tokenizer = pickle.load(open(tokenizer_file_name, "rb"))
    loaded_fsx = load_model(classifier_file_name)


print("Exported model and tokenizer")

using stop words
Train on 22576 samples, validate on 5644 samples
Epoch 1/30
 - 83s - loss: 2.3148 - acc: 0.3355 - val_loss: 1.8296 - val_acc: 0.4818
Epoch 2/30
 - 80s - loss: 1.6164 - acc: 0.5414 - val_loss: 1.4771 - val_acc: 0.5985
Epoch 3/30
 - 80s - loss: 1.3496 - acc: 0.6144 - val_loss: 1.3606 - val_acc: 0.6332
Epoch 4/30
 - 79s - loss: 1.1865 - acc: 0.6603 - val_loss: 1.3060 - val_acc: 0.6492
Epoch 5/30
 - 82s - loss: 1.0699 - acc: 0.6933 - val_loss: 1.2794 - val_acc: 0.6582
Epoch 6/30
 - 82s - loss: 0.9794 - acc: 0.7152 - val_loss: 1.2582 - val_acc: 0.6637
Epoch 7/30
 - 79s - loss: 0.9092 - acc: 0.7372 - val_loss: 1.2545 - val_acc: 0.6706
Epoch 8/30
 - 80s - loss: 0.8559 - acc: 0.7454 - val_loss: 1.2642 - val_acc: 0.6743
Epoch 9/30
 - 80s - loss: 0.7993 - acc: 0.7628 - val_loss: 1.2743 - val_acc: 0.6774
Epoch 10/30
 - 81s - loss: 0.7574 - acc: 0.7734 - val_loss: 1.2776 - val_acc: 0.6779
Epoch 11/30
 - 80s - loss: 0.7224 - acc: 0.7814 - val_loss: 1.2799 - val_acc: 0.6729
accuracy

In [113]:
from keras import backend
import tensorflow as tf
saver = tf.train.Saver()
sess = keras.backend.get_session()
saver.save(sess, 'keras_model')
pickle.dump(tokenizer, open(tokenizer_file_name, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
fsx.model.save(classifier_file_name)

model = keras.models.load_model(classifier_file_name)

saver = tf.train.Saver()
sess = keras.backend.get_session()
saver.restore(sess, 'keras_model')

model.predict(inputs)


UnknownError: Failed to rename: keras_model.data-00000-of-00001.tempstate16543772219696919801 to: keras_model.data-00000-of-00001 : Access is denied.
; Input/output error
	 [[node save_1/SaveV2 (defined at <ipython-input-113-1259041555ac>:3) ]]

Caused by op 'save_1/SaveV2', defined at:
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 539, in run_forever
    self._run_once()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 1775, in _run_once
    handle._run()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 781, in inner
    self.run()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 742, in run
    yielded = self.gen.send(value)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3049, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3214, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-113-1259041555ac>", line 3, in <module>
    saver = tf.train.Saver()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 832, in __init__
    self.build()
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 844, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 881, in _build
    build_save=build_save, build_restore=build_restore)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 510, in _build_internal
    save_tensor = self._AddSaveOps(filename_tensor, saveables)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 210, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 124, in save_op
    tensors)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 1920, in save_v2
    name=name)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3300, in create_op
    op_def=op_def)
  File "C:\Users\vaingato.CORPDOM\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

UnknownError (see above for traceback): Failed to rename: keras_model.data-00000-of-00001.tempstate16543772219696919801 to: keras_model.data-00000-of-00001 : Access is denied.
; Input/output error
	 [[node save_1/SaveV2 (defined at <ipython-input-113-1259041555ac>:3) ]]


In [108]:
string_to_predict = 'pipeline module does not work properly'
column = pd.DataFrame({'name' : [string_to_predict]})



train = False
if train:
    docs = create_docs(data=column, train_mode=False, tokenizer=tokenizer, referance_col='name')
    prob_val_matrix, y_class = fsx.predict(docs)
    predicted_team_id = le.inverse_transform(y_class)[0]
    team_probability = prob_val_matrix[0][y_class]
else:
    docs = create_docs(data=column, train_mode=False, tokenizer=loaded_tokenizer, referance_col='name')
    prob_val_matrix, y_class = loaded_fsx.predict(docs)
    predicted_team_id = (-prob_val_matrix).argsort()[0][0]
    predicted_team_id = (-prob_val_matrix).argsort()[0][1]

# predicted_team_name = team_id_to_name_map[predicted_team_id]
# team_probability = prob_val_matrix[0][predicted_team_id]

# team_id_b = le.inverse_transform([max_teams[0]])[0]

print (str(team_probability) + " probability it's: " + predicted_team_name)
print (str(prob_val_y[max_teams[1]]) + " probability it's: " + predicted_team_name)




ValueError: not enough values to unpack (expected 2, got 1)

In [55]:
# # extract NLP features
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.decomposition import PCA as sklearnPCA, TruncatedSVD
# from sklearn.base import BaseEstimator, TransformerMixin
# class ItemSelector(BaseEstimator, TransformerMixin):

#     def __init__(self, key):
#         self.key = key

#     def fit(self, x, y=None):
#         return self

#     def transform(self, data_dict):
#         return data_dict[self.key]
    
# tf_idf_3_grams = Pipeline([
#                 ('sel', ItemSelector(key='name')),
#                 ('tf', TfidfVectorizer(max_features=1000,
#                                        strip_accents='unicode', token_pattern=r'\w{1,}',
#                                        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
#                                        stop_words='english')),
#                 ('svd', TruncatedSVD(n_components=50))
#     ])    
# name_features = tf_idf_3_grams.fit_transform(X)

# train test split: train, valid, test sets

## regularization

how to avoid overfitting
- reduce layers
- randomly neglect nodes from producing output in nn (dropout)
- add regularization (penelize for large weights: loss + x) L2 regularization

how to avoid underfitting - 
- encrease layers / nodes.
- add additional features.

to add regularization to layer:
model.add(Dense(5, input_shape=(26,), kernal.regulizer=regulizers.l2(0.01), activation='relu'))


# create the model with layers

In [56]:
model = Sequential()
# Dense is a type of layer (basic type), fully connected layer
# first arg is number of neurons in layer
# the activation function getting the wieghted sum from all input nodes, and output a number between 0 to 1.
# the first layer is hidden layer (and the input defined with param input_shape)
model.add(Dense(5, input_shape=(50,), activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(1, activation='softmax'))


In [57]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 5)                 255       
_________________________________________________________________
dense_12 (Dense)             (None, 5)                 30        
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 6         
Total params: 291
Trainable params: 291
Non-trainable params: 0
_________________________________________________________________


# compile the module

In [58]:
# using optimizer (SGC, Adma..) , minimize the loss funtion.
# Adam is a variation of SGC, also choose loss func + metrics (printed out)
model.compile(Adam(lr=.0001), loss='binary_crossentropy', metrics=['accuracy'])

# train the model

In [59]:
# expect to get numpy array
# verbose: how much output we want to see 
model.fit(train_samples, train_labels, validation_split=0.20, batch_size=10, epochs=20, shuffle=True, verbose=1)

NameError: name 'train_samples' is not defined