In [41]:
import math, string, time, datetime
import numpy as np
import matplotlib.pyplot as plt
from random import randrange, shuffle
from enchant.utils import levenshtein # pip install pyenchant
from pyts.approximation import SymbolicAggregateApproximation # pip install pyts

### _________________________ Algorithms _________________________

#### String representation

In [42]:
def String_representation(X, n_bins):
    
    # normalization
    X_norm = [(Xi - Xi.mean())/Xi.std() for Xi in X]
    
    # Symbolic Aggregate Approximation
    S = []
    sax = SymbolicAggregateApproximation(n_bins=n_bins, strategy='normal')
    for xs in X_norm:
        S.append(''.join(sax.fit_transform([xs])[0]))
    return S.copy()

#### String grammar clustering

In [43]:
def StringGrammarClustering(S, C, V_pos, m, eta, a, b): # normally (m , eta > 1)  (beta ,a , b > 0)
    
    N = len(S) # Number of string

    print('<Initial>')
    print('V_pos =', V_pos)
    print('\n<Processing>')
    
    Lev = [[0]*N for _ in range(N)]
    for i in range(N):
        for j in range(N):
            if i <= j:
                Lev[i][j] = levenshtein(S[i],S[j])
            else:
                Lev[i][j] = Lev[j][i]
    
    # Compute beta using fuzzy median equation (3)
    Med_pos = np.array([sum([Lev[j][k] for k in range(N)]) for j in range(N)]).argmin()
    beta = sum([Lev[Med_pos][k] for k in range(N)])/N 
    
    epoch = 1
    while True:
        # Compute Levenshtein distance between input string j and cluster prototype i (Lev(s_j, sc_i))
#         Lev = [[levenshtein(s_j, sc_i) for s_j in S] for sc_i in V]
        
        U = [[0]*N for _c in range(C)] # membership matrix [u_ik]_CxN
        T = [[0]*N for _c in range(C)] # possibilistic matrix [t_ik]_CxN
        
        # Update membership and possibilistic
        for i in range(C):
            for k in range(N):    
                
                # Update membership value using equation (5)
                if k not in V_pos: # string s_k is not prototype(V)
                    U[i][k] = 1/sum([(Lev[V_pos[i]][k]/Lev[V_pos[j]][k])**(1/(m-1)) for j in range(C)])

                # Update possibilistic value using equation (6)
                T[i][k] = math.exp(-b * eta * math.sqrt(C) * Lev[V_pos[i]][k] / beta)
            
            # Set membership value sc_i = 1 (prototype)
            U[i][V_pos[i]] = 1

        # Update center string of each cluster i (sc_i) using equation (10)
        V_pos_updated = [np.array([sum([(a*U[i][k]**m + b*T[i][k]**eta)*Lev[j][k] for k in range(N)]) for j in range(N)]).argmin() for i in range(C)]
        
        print('epoch', epoch, end=': ')
        if V_pos_updated == V_pos:
            print('<V not change>\n')
            # Multi-prototypes generation
            SC = [[sc for j, sc in enumerate(S_train) if np.array([U[k][j] for k in range(C)]).argmax() == i] for i in range(C)]
            return SC
        else:
            print('V_pos =', V_pos_updated)
        
        V_pos = V_pos_updated.copy()
        epoch += 1
        

#### FKNN Classifcation

In [44]:
def FKNN(SC, S_pred, C, K = 3):
    m = 2
    pred = []
    for _s in S_pred:
        
        # lowest levenshtein distance K prototypes
        lowest = []
        Lev = [[levenshtein(_s, sc_ij) for sc_ij in sc_i] for sc_i in SC]
        for _ in range(K):
            lev_min = float('inf')
            lev_class, lev_idx = -1, -1
            for i in range(len(Lev)):
                for j in range(len(Lev[i])):
                    if Lev[i][j] < lev_min:
                        lev_min = Lev[i][j]
                        lev_class = i
                        lev_idx = j
            lowest.append((lev_class, lev_idx))
            Lev[lev_class][lev_idx] = float('inf')

        # class prediction
        prob = []
        for _k in range(K):
            dividend, divisor = 0, 0
            for (i, j) in lowest:
                lev = levenshtein(SC[i][j], _s)
                if not lev == 0:
                    eq = (1/lev)**(1/(m-1))
                else:
                    eq = 1
                
                divisor += eq
                if lowest[_k][0] == i:
                    dividend += eq
            prob.append(dividend/divisor)
        pred.append(lowest[np.array(prob).argmax()][0])            
    return pred

### _________________________ Experiment _________________________

#### Load dataset
File Header : [Date, Open, High, Low, Close, Adj Close, Volume]

In [45]:
stocks = ['DELTA', 'HANA', 'KCE', 'NEX', 'TEAM', 'ADVANC', 'AIT', 'DTAC', 'FORTH',
'HUMAN', 'ILINK', 'INET', 'JAS', 'JMART', 'MFEC', 'SAMART', 'SIS', 'SVOA', 'TRUE']

X = [[] for _ in range(5)] # Elliott patterns

for stock in stocks:
    pattern = []
    file = open('.\Dataset\\' + stock +'_Elliott.csv')
    data = file.read().split()
    data = [d.split(',') for d in data]

    for i, ds in enumerate(data):
        if len(ds) == 2:
            if not pattern == []:
                X[_y].append(np.array(pattern))
            pattern = []
            
            # for 5 classes recognition
            _y = int(ds[1]) - 1 # class number
            
            # for 2 classes recognition
#             if int(ds[1]) < 2:
#                 _y = 0
#             else:
#                 _y = 1
            
        elif i == len(data) - 1:
            X[_y].append(np.array(pattern))
            pattern = []
        else:
#             pattern.append(float(ds[4])) # Close price
            pattern.append(float(ds[2])) # High price
            pattern.append(float(ds[3])) # Low Price
            
X = [[np.log(X[i][j]) for j in range(len(X[i])) if len(X[i][j]) <= 400] for i in range(len(X))]

C = len(X) # number of class
for i in range(C):
    shuffle(X[i])

n_spc = [len(_x) for _x in X]
train_spc = [n-4 for n in n_spc]
print('Samples per class :',n_spc)

X_train, X_test, Y_train, Y_test = [], [], [], []
for i in range(C):
    X_train += X[i][:train_spc[i]]
    Y_train += [i]*(train_spc[i])
    X_test += X[i][train_spc[i]:]
    Y_test += [i]*(n_spc[i]-train_spc[i])

print('Train : ',len(X_train), 'samples')
print('Test : ',len(X_test), 'samples')

Samples per class : [98, 40, 64, 37, 29]
Train :  248 samples
Test :  20 samples


#### Preprocessing 

In [46]:
n_bins = 7
S_train = String_representation(X_train, n_bins)
S_test = String_representation(X_test, n_bins)

#### Training 

In [65]:
start = time.time()
SC = StringGrammarClustering(S_train, C, V_pos=[sum(train_spc[:i+1])-1 for i in range(C)], m = 2, eta = 2, a = 1, b = 1)
end = time.time()
print('time:', datetime.timedelta(seconds=end-start),end='\n\n')
print('# Photptypes per class :', [len(sc) for sc in SC])

<Initial>
V_pos = [93, 129, 189, 222, 247]

<Processing>
epoch 1: <V not change>

time: 0:04:38.713217

# Photptypes per class : [60, 56, 24, 81, 27]


#### Testing 

In [66]:
pred = FKNN(SC, S_test, C, K = 3)
print('Y_test :', Y_test)
print('pred   :', pred)
print('accuracy :', round(sum([1 for i in range(len(pred)) if pred[i] == Y_test[i]])/len(pred)*100, 2))

Y_test : [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4]
pred   : [0, 0, 3, 0, 3, 0, 3, 3, 1, 0, 1, 3, 3, 3, 3, 3, 4, 3, 3, 4]
accuracy : 45.0


#### Without Training (SAX to FKNN) 

In [57]:
sc_spc = [0] + train_spc
SC2 = [S_train[sum(sc_spc[:i]):sum(sc_spc[:i+1])] for i in range(1,C+1)]
# SC2 = [SC2[i][:min(train_spc)] for i in range(C)]
pred = FKNN(SC2, S_test, C, K = 5)
print('Y_test :', Y_test)
print('pred   :', pred)
print('accuracy :', round(sum([1 for i in range(len(pred)) if pred[i] == Y_test[i]])/len(pred)*100, 2))

Y_test : [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4]
pred   : [1, 0, 2, 0, 1, 1, 0, 0, 3, 0, 0, 2, 0, 3, 0, 0, 4, 2, 3, 2]
accuracy : 35.0
