In [126]:
import pandas as pd
from tqdm import tqdm

In [2]:
class Prediction():
    """A single node of the prediction tree.

    Each node stores one item of a sequence, a reference to its parent node
    and the list of its child nodes. A node created without an item
    (``Item is None``) has no parent until one is assigned.
    """

    # Class-level placeholders; every instance overwrites them in __init__.
    Item = None
    Parent = None
    Children = None

    def __init__(self, itemValue=None):
        """Create a node holding ``itemValue`` with no parent and no children."""
        self.Item = itemValue
        self.Children = []
        self.Parent = None

    def get_child(self, target):
        """Return the first child whose ``Item`` equals ``target``, else ``None``."""
        return next((node for node in self.Children if node.Item == target), None)

    def get_children(self):
        """Return the (live) list of child nodes."""
        return self.Children

    def has_child(self, target):
        """Return ``True`` if some child holds ``target``, ``False`` otherwise."""
        return self.get_child(target) is not None

    def add_new_child(self, child):
        """Wrap the item ``child`` in a new node and append it to the children."""
        newchild = Prediction(child)
        newchild.Parent = self
        self.Children.append(newchild)

    def remove_child(self, child):
        """Remove every child whose ``Item`` equals ``child``.

        The list is rebuilt instead of being mutated while it is iterated;
        removing during iteration skips the element that follows each removed
        one, which left adjacent duplicates behind. Slice assignment keeps the
        same list object, so references obtained through ``get_children`` stay
        valid.
        """
        self.Children[:] = [node for node in self.Children if not (node.Item == child)]

In [3]:
class CPT():
    """Compact Prediction Tree (CPT) sequence predictor.

    Training fills three structures:

    * ``root``     -- prefix tree of ``Prediction`` nodes, one path per sequence;
    * ``II``       -- inverted index: item -> set of ids of the sequences
      containing that item;
    * ``LT``       -- lookup table: sequence id -> tree node of the last item
      of that sequence;
    * ``alphabet`` -- set of every item seen during training.
    """

    # Class-level placeholders; every instance overwrites them in __init__.
    alphabet = None
    root = None
    II = None
    LT = None

    def __init__(self):
        """Create an empty, untrained model."""
        self.alphabet = set()
        self.root = Prediction()
        self.II = {}
        self.LT = {}

    def load_files(self, train_file, test_file=None):
        """Read sequences from CSV files, one sequence per CSV row.

        Args:
            train_file: anything ``pandas.read_csv`` accepts. When ``None``
                nothing is read and ``None`` is returned.
            test_file: optional second CSV with the sequences to predict for.

        Returns:
            ``data`` (list of row value arrays) when only ``train_file`` is
            given, otherwise the tuple ``(data, target)`` where ``target``
            holds the test rows as plain lists.
        """
        if train_file is None:
            return train_file

        data = [row.values for _, row in pd.read_csv(train_file).iterrows()]
        if test_file is None:
            return data

        target = []
        for _, row in pd.read_csv(test_file).iterrows():
            # NOTE(review): test rows are appended to ``data`` as well, so they
            # become part of whatever is trained on ``data`` -- kept as-is for
            # backward compatibility; confirm this is intended.
            data.append(row.values)
            target.append(list(row.values))
        return data, target

    def train(self, data):
        """Insert every sequence of ``data`` into the tree, ``II`` and ``LT``.

        Args:
            data: iterable of sequences (iterables of hashable items). The
                position of a sequence in ``data`` is used as its id.

        Returns:
            Always ``True``.
        """
        for seqid, row in enumerate(data):
            # Every sequence is inserted starting from the root.
            cursornode = self.root
            for element in row:
                child = cursornode.get_child(element)
                if child is None:
                    cursornode.add_new_child(element)
                    child = cursornode.get_child(element)
                cursornode = child

                # Inverted index: remember that this sequence contains element.
                self.II.setdefault(element, set()).add(seqid)
                self.alphabet.add(element)

            # Node of the last item; walking ``Parent`` links from here
            # rebuilds the whole sequence.
            self.LT[seqid] = cursornode

        return True

    def score(self, counttable, key, length, target_size, number_of_similar_sequences, number_items_counttable):
        """Update the score of ``key`` in ``counttable`` and return the table.

        The contribution is
        ``1 + 1/number_of_similar_sequences + 0.001/number_items_counttable``.
        It becomes the entry for a new key and is multiplied into the existing
        entry otherwise. ``length`` and ``target_size`` are accepted for
        interface compatibility and are not used.
        """
        weight_level = 1 / number_of_similar_sequences
        weight_distance = 1 / number_items_counttable
        score = 1 + weight_level + weight_distance * 0.001

        previous = counttable.get(key)
        counttable[key] = score if previous is None else score * previous
        return counttable

    def predict(self, data, target, k, n=1):
        """Predict the ``n`` most likely next items for every target sequence.

        Args:
            data: the sequences the model was trained on; only ``len(data)``
                is used, as the range of candidate sequence ids.
            target: iterable of sequences to predict for.
            k: only the last ``k`` items of each target are used.
            n: number of predicted items per target.

        Returns:
            A list with one list of at most ``n`` predicted items per target.
        """
        predictions = []

        for each_target in tqdm(target):
            each_target = each_target[-k:]

            # Ids of the sequences containing every known item of the target;
            # items never seen in training are ignored.
            intersection = set(range(len(data)))
            for element in each_target:
                seqids = self.II.get(element)
                if seqids is None:
                    continue
                intersection = intersection & seqids

            similar_sequences = self._similar_sequences(intersection)
            counttable = self._build_counttable(similar_sequences, each_target)
            predictions.append(self.get_n_largest(counttable, n))

        return predictions

    def _similar_sequences(self, seqids):
        """Rebuild, in root-to-leaf order, the sequences with the given ids."""
        sequences = []
        for seqid in seqids:
            node = self.LT.get(seqid)
            if node is None:
                # Id without a trained sequence (e.g. ``data`` is longer than
                # what was trained): nothing to rebuild.
                continue
            items = []
            while node.Item is not None:  # the root is the node without item
                items.append(node.Item)
                node = node.Parent
            items.reverse()
            sequences.append(items)
        return sequences

    def _build_counttable(self, similar_sequences, each_target):
        """Score every item following the target's last item in each sequence."""
        counttable = {}
        if len(each_target) == 0:
            # No anchor item to search for, hence no candidates.
            return counttable

        anchor = each_target[-1]
        for sequence in similar_sequences:
            index = self._last_index(sequence, anchor)
            if index is None:
                continue
            count = 1
            for element in sequence[index + 1:]:
                if element in each_target:
                    continue
                counttable = self.score(counttable, element, len(each_target), len(each_target),
                                        len(similar_sequences), count)
                count += 1
        return counttable

    @staticmethod
    def _last_index(sequence, item):
        """Return the index of the last occurrence of ``item`` or ``None``.

        Index 0 is included in the search: a sequence that starts with the
        anchor item is a valid match too.
        """
        for position in range(len(sequence) - 1, -1, -1):
            if sequence[position] == item:
                return position
        return None

    def get_n_largest(self, dictionary, n):
        """Return the keys of the ``n`` highest-valued entries, best first."""
        largest = sorted(dictionary.items(), key=lambda t: t[1], reverse=True)[:n]
        return [key for key, _ in largest]

In [4]:
# Instantiate an empty (untrained) CPT model.
model = CPT()

In [207]:
# Load the full sequence dataset; the first line of the CSV is the header row.
df = pd.read_csv('automatica_all.csv', header=0)
# Bare expression: shows the frame as the cell output.
df

Unnamed: 0,sequence
0,pocgkr
1,cgwpcfks
2,kfsfkspwg
3,pfkswkfsococg
4,wptgkfsoc
...,...
75,pococwtrfkfkg
76,pwocfksfksr
77,pwgerks
78,poctfkfkrwg


In [161]:
# Empty frame with a single 'sequence' column.
df2 = pd.DataFrame(columns=['sequence'])

In [222]:
# For every row label 0..len(df)-1, copy the first three elements of df's
# sequence into df2 and then replace them by a list of those elements.
# Presumably the sequences are strings, so the result is a list of up to
# three single characters -- confirm against the loaded data.
for row in range(0,len(df)):
    # Slice keeps at most the first three elements of the stored value.
    df2.loc[row,'sequence'] = df.loc[row,'sequence'][:3]
    # list() breaks the sliced value into its individual elements.
    lst = list(df2.loc[row,'sequence'])
    df2.loc[row,'sequence'] = lst

In [169]:
# Rewrite each sequence in df as its elements joined by commas
# (a str value "abc" becomes "a,b,c").
# NOTE(review): the loop is bounded by len(df2) but reads and writes df, so
# rows of df beyond len(df2) are left untouched -- confirm this is intended.
for row in range(0,len(df2)):
    df.loc[row,'sequence'] = (',').join(str(x) for x in list(df.loc[row,'sequence']))

In [172]:
# Distinct elements of all df sequences, in order of first appearance
# (dict keys keep insertion order, so fromkeys de-duplicates without sorting).
unique_vals = list(dict.fromkeys(
    elem
    for row in range(len(df))
    for elem in df.loc[row, 'sequence']
))

In [173]:
# Bare expression: shows the collected values as the cell output.
unique_vals

['p', ',', 'o', 'c', 'g', 'k', 'r', 'w', 'f', 's', 't', 'e']

In [176]:
# Drop the ',' entry from the collected values (list.remove raises
# ValueError if ',' is not present, e.g. when this cell is run twice).
unique_vals.remove(',')

In [178]:
# Map every value to its rank in sorted order (0-based).
d = dict(zip(sorted(unique_vals), range(len(unique_vals))))

In [179]:
# Bare expression: shows the value -> rank mapping as the cell output.
d

{'c': 0,
 'e': 1,
 'f': 2,
 'g': 3,
 'k': 4,
 'o': 5,
 'p': 6,
 'r': 7,
 's': 8,
 't': 9,
 'w': 10}

In [235]:
# Load the test sequences; the file has no header row, so the single column
# is named 'sequence' explicitly.
test = pd.read_csv('automatica_test.csv', header=None, names=['sequence'])

In [236]:
# Split each string on ',' and expand the pieces into separate columns,
# replacing ``test`` with the resulting frame.
test = test['sequence'].str.split(',', expand=True)

In [238]:
# Write the split frame to disk without a header row and without the index.
test.to_csv('automatica_test_cols.csv', header=False, index=False)

In [237]:
# Bare expression: shows the split test frame as the cell output.
test

Unnamed: 0,0,1,2
0,p,o,c
1,c,g,w
2,k,f,s
3,p,f,k
4,w,p,t
...,...,...,...
75,p,o,c
76,p,w,o
77,p,w,g
78,p,o,c
