In [15]:
import numpy as np
import random
import string
import os
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, average_precision_score


from keras.models import Sequential
from keras.optimizers import SGD, Adam
from keras.layers.core import Dense, Dropout

Using TensorFlow backend.


## Data Factory

In [16]:

class DataFactory:
    def  __init__(self, goodvaluespath, alphabet=string.ascii_letters+string.digits+" "):
        self.valid = []
        self.invalid = []
        self.alphabet = alphabet

        if not os.path.exists(goodvaluespath):
            raise ValueError("goodvaluespathis not a valid path")

        filenames = os.listdir(goodvaluespath)
        for filename in filenames:
            currfullpath = os.path.join(goodvaluespath, filename)
            with open(currfullpath, "r") as fp:
                for line in fp:
                    cleanedline = self.cleanline(line)
                    if cleanedline not in self.valid:
                        self.valid.append( cleanedline )

        self.generateinvalids()


    def cleanline(self, rawinput):
        temp = rawinput.strip()
        return temp

    def getvalid(self):
        return self.valid.copy()


    def getinvalid(self):
        return self.invalid.copy()

    def generateinvalids(self):
        for goodword in self.valid:
            badword = "".join([random.choice(self.alphabet) for ch in goodword])
            while badword in self.invalid:
                badword = "".join([random.choice(self.alphabet) for ch in goodword])

            self.invalid.append(badword)



## short helper functions

In [17]:
def get_dataframe():
    factory = DataFactory("StreetNames/", alphabet=string.ascii_letters + string.digits + " ")
    validDF = pd.DataFrame({'word': factory.getvalid()})
    validDF['tag'] = 0
    invalidDF = pd.DataFrame({'word': factory.getinvalid()})
    invalidDF['tag'] = 1
    ret = pd.DataFrame( validDF )
    ret = ret.append( invalidDF, ignore_index=True )
    ret = ret.sample(frac=1)
    ret = ret.reset_index(drop=True)
    return ret



def statistics(predicitions, targets):
    stats = {'TP' :0 ,'FP' :0 ,'TN' :0 ,'FN' :0}
    for i in range(len(predicitions)):
        isBad    = targets[i, 1] > 0.5
        isTagged = predicitions[i ,1] > 0.5
        if isBad and isTagged:
            stats['TP'] += 1
        elif (isBad) and (not isTagged):
            stats['FN' ]+=1
        elif (not isBad) and (isTagged):
            stats['FP' ]+=1
        elif (not isBad) and (not isTagged):
            stats['TN' ]+=1

    stats['PR'] = (1.00000000 *stats['TP']) / (stats['FP'] +stats['TP'] +1)
    stats['RE'] = (1.00000000 *stats['TP']) / (stats['FN'] +stats['TP'] +1)
    return stats


def calculate_precision_cutoff(pred_float, tag):
    n_samples = pred_float.shape[0]
    cutoffs = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    cutoff_data = {}
    for cutoff in cutoffs:
        cutoff_data[cutoff] = {"TP":0.0, "FP":0.0, "TN":0.0, "FN":0.0 , "TOTAL":n_samples}
        for i in range(n_samples):
            if (pred_float[i, 0] > cutoff) and (tag[i] ==1):
                cutoff_data[cutoff]["TP"] += 1.0
            elif (pred_float[i, 0] > cutoff) and (tag[i] == 0):
                cutoff_data[cutoff]["FP"] += 1.0
            elif (pred_float[i, 0] <= cutoff) and (tag[i] == 1):
                cutoff_data[cutoff]["FN"] += 1.0
            elif (pred_float[i, 0] <= cutoff) and (tag[i] == 0):
                cutoff_data[cutoff]["TN"] += 1.0

        try:
            cutoff_data[cutoff]["PR"] = cutoff_data[cutoff]["TP"] / (cutoff_data[cutoff]["TP"] + cutoff_data[cutoff]["FP"])
        except ZeroDivisionError:
            cutoff_data[cutoff]["PR"] = 0
        try:
            cutoff_data[cutoff]["RE"] = cutoff_data[cutoff]["TP"] / (cutoff_data[cutoff]["TP"] + cutoff_data[cutoff]["FN"])
        except ZeroDivisionError:
            cutoff_data[cutoff]["RE"] = 0
        try:
            cutoff_data[cutoff]["AC"] = (cutoff_data[cutoff]["TP"]+cutoff_data[cutoff]["TN"]) / (n_samples)
        except ZeroDivisionError:
            cutoff_data[cutoff]["AC"] = 0

    return cutoff_data



# charcter based histogram

In [18]:

def get_histogram(df):
    histogram = {}
    #df = pd.DataFrame()
    i = 0
    for row in df[ df['tag']==0 ]['word']:
        for ch in row:
            if not ch in histogram:
                histogram[ch] = 0
            histogram[ch] += 1

    return histogram



def get_percentage(histogram):
    sum_appearances = sum([v for k, v in histogram.items()])
    percentage = {}
    for k, v in histogram.items():
        percentage[k] = ((1.000000 * v) / sum_appearances)
    return percentage


In [19]:
def transform_df(df, percentage):
    for i in range(21):
        df["ch{0}".format(i + 1)] = df['word'].str[i]
        df["pr{0}".format(i + 1)] = df['word'].str[i].map(percentage)


# Naive method

In [32]:

def make_vec(row, histogram):
    ret_val = {}
    word = row['word']
    for (i, ch1, ch2, ch3) in zip(range(2, len(word)), word[:], word[1:], word[2:]):
        key1 = (i, ch1, ch2, ch3)
        key2 = ( ch1, ch2 )
        mone = ((1.0000000 * histogram['a_and_b'][key1]) / (histogram['a_and_b']['total']))
        mechane = ((1.0000000 * histogram['b'][key2]) / (histogram['b']['total']))
        ret_val["pr_{0}".format(i)] = mone / mechane

    return pd.Series(ret_val)


def make_histogram_naive(df):
    histogram = {'a_and_b':{'total':0} , 'b': {'total':0} }
    for (idx, row) in df.iterrows():
        word = row['word']
        for (i, ch1, ch2, ch3) in zip(range(2, len(word)), word[:], word[1:], word[2:]):
            if (i, ch1, ch2, ch3) not in histogram['a_and_b']:
                histogram['a_and_b'][(i, ch1, ch2, ch3)] = 0
            histogram['a_and_b'][(i, ch1, ch2, ch3)] += 1
            histogram['a_and_b']['total'] += 1

        for ( ch1, ch2 ) in zip(word[:], word[1:]):
            if (ch1, ch2) not in histogram['b']:
                histogram['b'][( ch1, ch2 )] = 0
            histogram['b'][(ch1, ch2)] += 1
            histogram['b']['total'] += 1
    
    return histogram
    
def run_naive_method(df):
    """

    :param df:
    :type df: pandas.DataFrame
    :return:
    """

    histogram = make_histogram_naive(df)

    print (histogram)
    df = df.merge( df.apply(make_vec, axis=1, broadcast=False, raw=False, reduce=None, args=(histogram,)),
              left_index=True, right_index=True)
    df.fillna(0.0000, inplace=True)
    wanted_columns = [ 'pr_2', 'pr_3', 'pr_4', 'pr_5', 'pr_6', 'pr_7', 'pr_8', 'pr_9', 'pr_10', 'pr_11',
                      'pr_12', 'pr_13', 'pr_14', 'pr_15', 'pr_16', 'pr_17', 'pr_18', 'pr_19', 'pr_20', 'tag']

    np_array = df[wanted_columns].values
    #split train & test
    X = np_array[:, :-1]
    Y = np_array[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)
    model = LogisticRegression(max_iter=1000, verbose=1)
    model.fit(X_train, y_train)

    predictions = model.decision_function(X_test)
    precision, recall, thresholds = precision_recall_curve(y_test, predictions)
    print (max ( map(lambda x: ( x[0]+x[1], x ) , zip(precision, recall))))
    # PRECISION: 0.9132584269662921   RECALL: 0.9819135717244236))


##  DNN model

In [22]:
def learn_naive_dnn(new_df):
    """:type new_df: pd.DataFrame """

    X = new_df[['pr1', 'pr2', 'pr3', 'pr4', 'pr5', 'pr6', 'pr7', 'pr8', 'pr9', 'pr10', 'pr11', 'pr12', 'pr13', 'pr14', 'pr15', 'pr16', 'pr17', 'pr18', 'pr19', 'pr20', 'pr21']].values
    Y = new_df['tag'].values

    n_samples = X.shape[0]
    train_rand_idx = np.random.choice(  range( n_samples ), size=int(0.7*n_samples), replace=False )
    test_idx = [x for x in range(n_samples) if x not in train_rand_idx]
    train_x = X[train_rand_idx, : ]
    train_y = Y[train_rand_idx]
    test_x = X[test_idx, :]
    test_y = Y[test_idx]

    # Set constants
    batch_size = 128
    dimof_input = 21
    dimof_middle = 10
    dimof_output = 1
    dropout = 0.1

    verbose = True
    print('batch_size: ', batch_size)
    print('dimof_middle: ', dimof_middle)
    print('dropout: ', dropout)
    #print('countof_epoch: ', countof_epoch)

    print('verbose: ', verbose)
    print()

    # Set model
    model = Sequential()
    model.add(Dense(dimof_middle, input_dim=dimof_input, init='uniform', activation='sigmoid'))
    model.add(Dense(dimof_middle, init='uniform', activation='sigmoid'))
    model.add(Dense(dimof_output, init='uniform', activation='sigmoid'))
    optimizer = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
    print (model.summary())
    # Train

    model.fit(
        train_x, train_y,
        shuffle=True,
        #validation_split=0.2,
        batch_size=batch_size, epochs=20, verbose=verbose)

    # Test
    x = model.predict(test_x)
    d = calculate_precision_cutoff(x, test_y)
    print ("\n\nRESULTS:\n========\n")
    for k in d.keys():
        print ("cutoff: ", k, " PR: ", d[k]["PR"], " RE: ", d[k]["RE"], " AC: ", d[k]["AC"])

def run_nn_method(df):
    histogram = get_histogram(df)
    percentage = get_percentage(histogram)
    transform_df(df, percentage)
    new_df = df[
        ['pr1', 'pr2', 'pr3', 'pr4', 'pr5', 'pr6', 'pr7', 'pr8', 'pr9', 'pr10', 'pr11', 'pr12', 'pr13', 'pr14', 'pr15',
         'pr16', 'pr17', 'pr18', 'pr19', 'pr20', 'pr21', 'tag']]
    new_df.fillna(value=0, inplace=True)
    learn_naive_dnn(new_df)



# DNN method

In [23]:

df = get_dataframe()
run_nn_method(df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


batch_size:  128
dimof_middle:  10
dropout:  0.1
verbose:  True

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 10)                220       
_________________________________________________________________
dense_8 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 11        
Total params: 341
Trainable params: 341
Non-trainable params: 0
_________________________________________________________________
None




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


RESULTS:

cutoff:  0.1  PR:  0.8688766114180478  RE:  0.9873046875  AC:  0.9198644817810966
cutoff:  0.2  PR:  0.9104952294411631  RE:  0.978515625  AC:  0.9416787665076402
cutoff:  0.3  PR:  0.9300147196574334  RE:  0.9695870535714286  AC:  0.9487658162207011
cutoff:  0.4  PR:  0.9426319396847156  RE:  0.9593331473214286  AC:  0.9509092166217245
cutoff:  0.5  PR:  0.9528711484593837  RE:  0.9491489955357143  AC:  0.9515314941575054
cutoff:  0.6  PR:  0.9605978260869565  RE:  0.93701171875  AC:  0.9497338034985826
cutoff:  0.7  PR:  0.9692657522188807  RE:  0.9217354910714286  AC:  0.946726128742308
cutoff:  0.8  PR:  0.9774264649996199  RE:  0.8970424107142857  AC:  0.9387056627255757
cutoff:  0.9  PR:  0.9844992695990911  RE:  0.84619140625  AC:  0.917167

## Naive Method

In [33]:

df = get_dataframe()
run_naive_method(df)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[LibLinear](1.8966150321516524, (0.9129902750048303, 0.9836247571468221))
