In [1]:
import math, string
import numpy as np
import matplotlib.pyplot as plt
from random import randrange, shuffle
from enchant.utils import levenshtein # pip install pyenchant
from pyts.approximation import PiecewiseAggregateApproximation, SymbolicAggregateApproximation # pip install pyts

#### Load dataset

In [2]:
stocks = ['DELTA', 'HANA', 'KCE', 'NEX', 'TEAM', 'ADVANC', 'AIT', 'DTAC', 'FORTH',
'HUMAN', 'ILINK', 'INET', 'JAS', 'JMART', 'MFEC', 'SAMART', 'SIS', 'SVOA', 'TRUE']

X = [[] for _ in range(5)] # Elliott patterns
Y = [] # Elliott classes
for stock in stocks:
    pattern = []
    file = open('.\Dataset\\' + stock +'_Elliott.csv')
    data = file.read().split()
    data = [d.split(',') for d in data]

    for i, ds in enumerate(data):
        if len(ds) == 2 or i == len(data) - 1:

            if not i == len(data) - 1:
                _y = int(ds[1]) - 1 # class number

            if not i == 0:
                X[_y].append(np.array(pattern))
                pattern = []
        else:
            pattern.append(float(ds[4])) 

C = len(X) # number of class
for i in range(C):
    shuffle(X[i])
    
n_spc = [len(_x) for _x in X]
print('Samples per class :',n_spc)

X_train, X_test = [], []
for i in range(C):
    X_train += X[i]
    Y += [i]*n_spc[i]
#     X_test += X[i][spc:spc + 10]


Samples per class : [118, 55, 74, 47, 30]


#### String representation

In [3]:
def String_representation(X):
    # normalization
    X_norm = [(Xi - Xi.mean())/Xi.std() for Xi in X]

    X_paa = []
    S = [] # set of N strings
    for T in X_norm:
        n = len(T) # length of time series
        w = 2
        n_bins = 8

        # PAA (Piecewise Aggregate Approximation)
        paa = PiecewiseAggregateApproximation(window_size=w)
        X_paa.append(paa.transform([T])[0])

        # SAX (Symbolic Aggregate approXimation)
        sax = SymbolicAggregateApproximation(n_bins=n_bins, strategy='normal')
        st = ''
        for s in sax.fit_transform([X_paa[-1]])[0]:
            st += s
        S.append(st)
    return S

In [4]:
S = String_representation(X_train)
# S_test = String_representation(X_test)

#### String grammar clustering

In [5]:
def StringGrammarClustering(S, C, V_pos):
    
    N = len(S) # Number of string
    m, eta, a, b = 2, 2, 2, 0.5 # normally m > 1, eta > 1, beta > 0, a > 0, b > 0
    sigma = string.ascii_lowercase[:C]

    # Initialize string prototypes for all C classes
    V = [S[i] for i in V_pos]

    print('<---------- V initial ---------->')
    for i, sc in enumerate(V):
        print(f'sc_{i+1} : {sc}')
    print('\n<---------- V processing ---------->')

    # Compute beta using fuzzy median equation (3)
    Med = S[np.array([sum([levenshtein(s_j,s_k) for s_k in S]) for s_j in S]).argmin()]
    beta = sum([levenshtein(Med, s_k) for s_k in S])/N
    
    while True:
        U = [[0]*N for _c in range(C)] # membership matrix [u_ik]_CxN
        T = [[0]*N for _c in range(C)] # possibilistic matrix [t_ik]_CxN
        
        # Compute Levenshtein distance between input string j and cluster prototype i (Lev(s_j, sc_i))
        Lev = [[levenshtein(s_j, sc_i) for s_j in S] for sc_i in V]
        
        # Update membership and possibilistic
        for i in range(C):
            for k in range(N):    
                
                # Update membership value using equation (5)
                if k not in V_pos: # string s_k is not phototype(V) 
                    U[i][k] = 1/sum([(Lev[i][k]/Lev[j][k])**(1/(m-1)) for j in range(C) if Lev[j][k] > 0])

                # Update possibilistic value using equation (6)
                T[i][k] = math.e**(-(b * eta * math.sqrt(C) * Lev[i][k]) / beta)
            
            # Set membership value sc_i = 1 (phototype)
            U[i][V_pos[i]] = 1

        # Update center string of each cluster i (sc_i) using equation (10)
        V_pos = [np.array([sum([(a*U[i][k]**m + b*T[i][k]**eta)*levenshtein(S[j],S[k]) for k in range(N)]) for j in range(N)]).argmin() for i in range(C)]
        V_updated = [S[pos] for pos in V_pos]
        
        if V_updated == V:
            print('>>>> V not change <<<<')
            print('>>>>     End!     <<<<\n')
            return U
        V = V_updated.copy()
        for i, sc in enumerate(V):
            print(f'sc_{i+1} : {sc}')
        print()
        

In [6]:
U = StringGrammarClustering(S, C, V_pos=[n-1 for n in n_spc])

<---------- V initial ---------->
sc_1 : aaabdfedddhghgh
sc_2 : cabbaabddeeedeedddccdefhhgehh
sc_3 : abbbbcbbbbbbbbbbbcdddefffeefffhhhhhh
sc_4 : aeceedcgh
sc_5 : acddbcdfhh

<---------- V processing ---------->
sc_1 : aabbccdeffefhhh
sc_2 : abbbbddddecddbcghhhh
sc_3 : abbbbcbbbbbbbbbbbcdddefffeefffhhhhhh
sc_4 : hhffeefedcaa
sc_5 : acddbcdfhh

>>>> V not change <<<<
>>>>     End!     <<<<



#### Multi-phototypes generation 

In [7]:
SC = [[sc for j, sc in enumerate(S) if np.array([U[k][j] for k in range(C)]).argmax() == i] for i in range(C)]
print('Photptypes per class :', [len(sc) for sc in SC])

Photptypes per class : [45, 20, 128, 108, 23]


#### FKNN Classifcation

In [8]:
def FKNN(SC, S_pred, C, K = 3):
    m = 2
    pred = []
    for _s in S_pred:
        
        # lowest levenshtein distance K phototypes
        lowest = []
        Lev = [[levenshtein(_s, sc_ij) for sc_ij in sc_i] for sc_i in SC]
        for _ in range(K):
            lev_min = float('inf')
            lev_class = -1
            for i in range(len(Lev)):
                for j in range(len(Lev[i])):
                    if Lev[i][j] < lev_min:
                        lev_min = Lev[i][j]
                        lev_class = i
                        lev_idx = j
            lowest.append((lev_class, lev_idx))
            Lev[lev_class][lev_idx] = float('inf')
        
        # class prediction
        prob = []
        for _k in range(K):
            dividend, divisor = 0, 0
            for (i, j) in lowest:
                lev = levenshtein(SC[i][j], _s)
                if lev != 0:
                    eq = (1/lev)**(1/(m-1))
                else:
                    eq = 1
                
                divisor += eq
                if lowest[_k][0] == i:
                    dividend += eq
            prob.append(dividend/divisor)
        pred.append(lowest[np.array(prob).argmax()][0])
                
    return pred

In [9]:
pred = FKNN(SC, S, C, K = 3)
print(pred)
print('accuracy :', round(sum([1 for i in range(len(pred)) if pred[i] == Y[i]])/len(pred)*100, 2))

[3, 3, 3, 2, 3, 2, 3, 2, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 3, 3, 0, 2, 2, 3, 3, 2, 0, 4, 3, 4, 2, 0, 2, 0, 3, 3, 3, 1, 4, 2, 2, 0, 3, 3, 2, 3, 4, 3, 3, 2, 3, 2, 0, 3, 1, 2, 0, 2, 0, 2, 2, 3, 2, 2, 2, 3, 2, 4, 3, 3, 3, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 0, 2, 2, 3, 3, 4, 2, 2, 2, 3, 2, 2, 0, 0, 2, 2, 2, 2, 2, 3, 2, 0, 2, 3, 3, 3, 2, 4, 2, 2, 2, 0, 2, 3, 2, 2, 0, 0, 2, 2, 4, 2, 3, 3, 2, 0, 2, 2, 1, 2, 0, 4, 2, 2, 3, 4, 0, 0, 0, 3, 2, 2, 2, 3, 2, 3, 0, 3, 2, 0, 4, 4, 4, 3, 3, 3, 3, 4, 1, 2, 1, 2, 3, 2, 0, 3, 3, 0, 3, 3, 0, 2, 2, 4, 2, 2, 2, 0, 4, 3, 3, 1, 2, 0, 0, 2, 2, 1, 3, 3, 3, 2, 2, 2, 2, 0, 3, 2, 0, 1, 2, 4, 0, 2, 2, 3, 3, 3, 0, 3, 2, 2, 3, 2, 0, 2, 1, 1, 3, 3, 2, 0, 3, 2, 3, 2, 2, 2, 3, 2, 2, 3, 3, 2, 2, 2, 4, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 0, 2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 3, 2, 0, 2, 2, 2, 2, 2, 3, 2, 3, 2, 0, 2, 1, 3, 0, 0, 2, 3, 3, 2, 0, 1, 4, 2, 3, 3, 0, 3, 0, 3, 3, 3, 2, 3, 3, 0, 1, 3, 3, 1, 4, 1, 3, 4, 3, 2, 0, 3, 2, 4, 0, 2, 2, 1, 1, 2, 3, 3, 3, 3, 3, 2, 1]
accuracy : 21.91
