## Libraries

In [127]:
import numpy as np
import pandas as pd
import math
import keras
from time import *
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import LSTM, Dense

np.set_printoptions(precision=50,suppress=True)

## Part 1: Training the LSTM model

In [57]:
#We upload our dataset from the excel document to df_to_train

df_to_train_and_test = pd.read_excel('memrefs_train_andor_validate.xlsx', header = None)

print(df_to_train_and_test)

                 0
0       0xbfb22b18
1       0xbfb22b14
2       0xbfb22b10
3       0xbfb22b0c
4       0xbfb22b18
...            ...
399995  0xbfb22978
399996  0xbfb22974
399997  0xbfb22980
399998  0xbfb22970
399999  0xbfb2296c

[400000 rows x 1 columns]


In [58]:
#We transform the dataset from the hexademical to the decimal in order to be able to use it.

df_to_train_and_test = df_to_train_and_test[0].apply(int, base=16)

print(df_to_train_and_test)

0         3216124696
1         3216124692
2         3216124688
3         3216124684
4         3216124696
             ...    
399995    3216124280
399996    3216124276
399997    3216124288
399998    3216124272
399999    3216124268
Name: 0, Length: 400000, dtype: int64


In [59]:
#Further down we will descover that a big dataset crasges the program thanks to extreme ram usage.
#Although we will use a lot of data that could be trained to make better predictions we have to cut it down to a smaller size.

rows_to_drop = 41000
df_to_train_and_test = df_to_train_and_test[:rows_to_drop]

print(df_to_train_and_test)

0        3216124696
1        3216124692
2        3216124688
3        3216124684
4        3216124696
            ...    
40995    3216122804
40996    3216122812
40997    3216122652
40998    3216122656
40999    3216122644
Name: 0, Length: 41000, dtype: int64


In [60]:
#We transform the dataset from pandas series to an array

df_to_train_and_test = np.array(df_to_train_and_test)

print(df_to_train_and_test)

[3216124696 3216124692 3216124688 ... 3216122652 3216122656 3216122644]


In [61]:
#After a lot of experimentation the dataset cannot be used as it is inside the model to create suitable results.
#For this reason we take the difference between each consecutive number to create the diff_values list.

diff_values = list()

for i in range(1, len(df_to_train_and_test)):
    value = df_to_train_and_test[i] - df_to_train_and_test[i - 1]
    diff_values.append(value)

print(diff_values)    

[-4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, -3079625884, 3079625532, -3079625600, -44, 3079625644, 364, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 0, 28, 4, 0, -4, -16, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 28, 4, 0, -4, -28, -4, -3079649372, 3079649372, 4, 8, -4, -4, -4, -4, 12, -32, -4, 40, -3079625020, -45548, 45552, -45552, 3079670552, 4, 4, 8, -4, -4, -4, -4, 12, -32, -4, 40, -3079625004, -45560, 45564, -45564, 3079670548, 4, 4, 8, -4, -4, -4, -4, 12, -32, -4, 40, -3079624988, -45568, 45572, -45572, 3079670540, 4, 4, 8, -4, -4, -4, -4, 12, -32, -4, 40, -3079624972, -45588, 45592, -45592, 3079670544, 4, 4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 12, -4, -4, -4, 28, 4, 0, -4, -16, -4, -4, -4, 12, -4, -4, -4, -307962

In [62]:
#And here we transform it to an array and reshape it in order to be used later.

diff_values =  np.array(diff_values)
diff_values = diff_values.reshape(-1,1)

print(diff_values)

[[  -4]
 [  -4]
 [  -4]
 ...
 [-160]
 [   4]
 [ -12]]


In [63]:
#Further down we will split the dataset into the train and split datasets.
#The most famous and useful analogy is to split the dataset into 80 and 20% accordingly.
#Here we find the length of each dataset that will be used in the arrays later.

training_data_len = math.ceil(len(df_to_train_and_test) * 0.8)
testing_data_len = len(df_to_train_and_test) - training_data_len

print('training_data_len: ', training_data_len, ' and ' + '\n' + 'testing_data_len: ', testing_data_len)

training_data_len:  32800  and 
testing_data_len:  8200


In [64]:
#This is the point where we create the X and y train that will be used in the LSTM model.
#X_train is filled with batches of 1000 samples, from 0 to 999, 1 to 1000 and so on.
#y_train on the other hand is given every sample from the dataset past the first batch but one step in front of X
#X_train and y_train then are transformed into arrays and reshaped.

X_train = []
y_train = []

for i in range(999, training_data_len):
    X_train.append(diff_values[i - 999: i])
    y_train.append(diff_values[i])

X_train = np.array(X_train)
y_train = np.array(y_train)    

X_train = np.reshape(X_train, newshape = (X_train.shape[0], X_train.shape[1], 1))
y_train = y_train.reshape(-1,1)

print('X_train' + '\n', X_train, np.shape(X_train), '\n' + '\n' + '\n' + '\n' + ' and y_train ' + '\n', y_train, np.shape(y_train))

X_train
 [[[         -4]
  [         -4]
  [         -4]
  ...
  [        364]
  [         -4]
  [         -4]]

 [[         -4]
  [         -4]
  [         12]
  ...
  [         -4]
  [         -4]
  [         -4]]

 [[         -4]
  [         12]
  [         -4]
  ...
  [         -4]
  [         -4]
  [         12]]

 ...

 [[          0]
  [ 3079614696]
  [          4]
  ...
  [         16]
  [-3079614864]
  [         72]]

 [[ 3079614696]
  [          4]
  [          4]
  ...
  [-3079614864]
  [         72]
  [         88]]

 [[          4]
  [          4]
  [          4]
  ...
  [         72]
  [         88]
  [         -8]]] (31801, 999, 1) 



 and y_train 
 [[-4]
 [12]
 [-4]
 ...
 [88]
 [-8]
 [ 0]] (31801, 1)


In [65]:
#However here is the moment that we decide if the model produces a regression or a classification solution.
#If we let y_train in this form, as a vector, the result will be one number that is as close as possible to the real one
#, through the regression method, but the chances of it actually being the exact number we want are almost non-existent.
#For this reason we onehot encode y_train, which transofrms it into a matrix with the same rows as before, the 
#rows being the samples, but the columns are the unique numbers, now unique classes, of the dataset.
#Each row is filled with 0s and one 1 at the column/class that the number/row belongs too.
#In that way the result will be vector with 0s and a 1, meaning that it classifies the result into one of the classes.

encoder = OneHotEncoder(drop='first', sparse=False)
y_train = y_train.reshape(len(y_train), 1)
y_train_onehot = encoder.fit_transform(y_train)

print(y_train_onehot, np.shape(y_train_onehot))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] (31801, 4271)


In [66]:
#At last this is the part we create the model.

model = keras.Sequential() 
model.add(LSTM(64, input_shape = (X_train.shape[1], X_train.shape[2]), dropout = 0.2, return_sequences = True))
model.add(LSTM(64, dropout = 0.2, return_sequences = False))
model.add(Dense(y_train_onehot.shape[1], activation = 'softmax'))

In [67]:
#This is the compilation part with the proper parameters.

start = time()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy']) 

print('compilation time: ', time() - start)

compilation time:  0.013817548751831055


In [68]:
#Finally we train the model.
#We could put even more epochs in the search of better results but it takes a lot of time and after a point nothing really changes.

model.fit(X_train, y_train_onehot, epochs = 32, batch_size = 100, verbose = 1, shuffle = False)  

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x7f6453fd4978>

## Part 2: Testing the trained model

In [69]:
data_to_test = diff_values[training_data_len:]

print(data_to_test)

[[ -80]
 [  88]
 [  -4]
 ...
 [-160]
 [   4]
 [ -12]]


In [70]:
#In a similar fashion we create the X and y test, transform them to arrays and reshape them accordingly.
#X_test will be used to create predictions for the test dataset.
#y_test will be compared to these predictions.

X_test = []
y_test = [] 

for i in range(999, len(data_to_test)):
    X_test.append(data_to_test[i - 999: i])
    y_test.append(data_to_test[i])

X_test = np.array(X_test) 
y_test = np.array(y_test)

X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1],1))
y_test = y_test.reshape(-1,1)

print('X_test' + '\n', X_test, np.shape(X_test), '\n' + '\n' + '\n' + '\n' + ' and y_test ' + '\n', y_test, np.shape(y_test))

X_test
 [[[        -80]
  [         88]
  [         -4]
  ...
  [    1263355]
  [-2917680759]
  [ 3079614124]]

 [[         88]
  [         -4]
  [          0]
  ...
  [-2917680759]
  [ 3079614124]
  [        -32]]

 [[         -4]
  [          0]
  [       -152]
  ...
  [ 3079614124]
  [        -32]
  [         -4]]

 ...

 [[          0]
  [       -152]
  [          4]
  ...
  [          4]
  [        156]
  [          8]]

 [[       -152]
  [          4]
  [ 2918730140]
  ...
  [        156]
  [          8]
  [       -160]]

 [[          4]
  [ 2918730140]
  [-2918730144]
  ...
  [          8]
  [       -160]
  [          4]]] (7200, 999, 1) 



 and y_test 
 [[        -32]
 [         -4]
 [-3079682104]
 ...
 [       -160]
 [          4]
 [        -12]] (7200, 1)


In [71]:
#Before of the comparassion though, we have to bring y_test back to its original form and not this diff_value version.

for i in range(0, len(y_test) - 1):
  y_test[i] =  df_to_train_and_test[training_data_len + 999 + i + 1] - y_test[i]

print(y_test)  

[[3216124352]
 [3216124320]
 [3216124316]
 ...
 [3216122812]
 [3216122652]
 [       -12]]


In [72]:
#Here we make the predictions using X_test and start trasforming them back from onehot coding and the diff_value version.
#In that way they will be in the same form with y_test as the original dataset.

predictions = model.predict(X_test) 

print(predictions)

[[0.00000014037838    0.000000058271983   0.000000406103      ...
  0.000000003325179   0.0000000066073356  0.000000008629071  ]
 [0.000000034040678   0.000000020380266   0.0000002577242     ...
  0.0000001094906     0.00000013112384    0.00000016321442   ]
 [0.000000000385292   0.00000000022368254 0.000000006494422   ...
  0.00000000013481416 0.0000000001325892  0.00000000022854586]
 ...
 [0.00000007825584    0.0000000961178     0.00000014843239    ...
  0.00000038782568    0.0000003437446     0.00000045634448   ]
 [0.00000001816999    0.000000010864106   0.000000026583107   ...
  0.000000051645713   0.00000006814452    0.00000011518799   ]
 [0.00000013921114    0.00000013395032    0.00000025258464    ...
  0.00000010163829    0.000000074162145   0.0000001255221    ]]


In [73]:
#The onehot encoding makes an object, an integer in our case, into a series of 0s and one 1 across all the columns/classes.
#However, after passing through the model, the predictions that are created are no longer 0 and 1 but rather something in (0,1).
#In order to find where the prediction classifies we have to find the bigger float among all columns, in order to get the column index.
#The indexes of the columns are string names 'x0_' followed by the number they represent.
#So finding this string means we find the number that the model classified the prediction into.

classified_data = []

for i in range(0, len(predictions)):
  maximum = 0
  temp = 0
  column_index = 0
  for j in range(0, predictions.shape[1]):
    temp = predictions[i][j]
    if temp > maximum: 
      maximum = temp
      column_index = j
  classified_data.append(encoder.get_feature_names()[column_index]) 

print(classified_data)

['x0_-32', 'x0_-4', 'x0_-3079682104', 'x0_3079682100', 'x0_-4', 'x0_-4', 'x0_16', 'x0_-3079614864', 'x0_72', 'x0_88', 'x0_-8', 'x0_0', 'x0_-80', 'x0_88', 'x0_-4', 'x0_0', 'x0_-152', 'x0_4', 'x0_4', 'x0_-4', 'x0_-4', 'x0_72', 'x0_84', 'x0_0', 'x0_4', 'x0_-8', 'x0_0', 'x0_3079614696', 'x0_4', 'x0_4', 'x0_4', 'x0_12', 'x0_-12', 'x0_-3079682028', 'x0_3079682024', 'x0_0', 'x0_0', 'x0_0', 'x0_-161745640', 'x0_-1451040', 'x0_163196680', 'x0_4', 'x0_-163196684', 'x0_1263439', 'x0_-2917680798', 'x0_1', 'x0_-32', 'x0_-4', 'x0_-3079682104', 'x0_3079682100', 'x0_-4', 'x0_-4', 'x0_16', 'x0_-3079614864', 'x0_72', 'x0_88', 'x0_-8', 'x0_0', 'x0_-80', 'x0_88', 'x0_-4', 'x0_0', 'x0_-152', 'x0_4', 'x0_4', 'x0_-4', 'x0_-4', 'x0_72', 'x0_84', 'x0_0', 'x0_4', 'x0_-8', 'x0_0', 'x0_3079614696', 'x0_4', 'x0_4', 'x0_4', 'x0_12', 'x0_-12', 'x0_-3079682028', 'x0_3079682024', 'x0_0', 'x0_0', 'x0_0', 'x0_-161745640', 'x0_-1451040', 'x0_163196680', 'x0_4', 'x0_-163196684', 'x0_1263439', 'x0_-2917680798', 'x0_1', 'x0

In [74]:
#The only thing left to do is to 'clean' the string from unneccesary characters and tranform them into integers.

for i in range(0, len(predictions)):
  classified_data[i] = classified_data[i].replace('x0_', '')
  classified_data[i] = int(classified_data[i])

print(classified_data)

[-32, -4, -3079682104, 3079682100, -4, -4, 16, -3079614864, 72, 88, -8, 0, -80, 88, -4, 0, -152, 4, 4, -4, -4, 72, 84, 0, 4, -8, 0, 3079614696, 4, 4, 4, 12, -12, -3079682028, 3079682024, 0, 0, 0, -161745640, -1451040, 163196680, 4, -163196684, 1263439, -2917680798, 1, -32, -4, -3079682104, 3079682100, -4, -4, 16, -3079614864, 72, 88, -8, 0, -80, 88, -4, 0, -152, 4, 4, -4, -4, 72, 84, 0, 4, -8, 0, 3079614696, 4, 4, 4, 12, -12, -3079682028, 3079682024, 0, 0, 0, -161745640, -1451040, 163196680, 4, -163196684, 1263439, -2917680798, 1, -32, -4, -3079682104, 3079682100, -4, -4, 16, -3079614864, 72, 88, -8, 0, -80, 88, -4, 0, -152, 4, 4, -4, -4, 72, 84, 0, 4, -8, 0, 3079614696, 4, 4, 4, 12, -12, -3079682028, 3079682024, 0, 0, 0, -161745640, -1451040, 163196680, 4, -163196684, 1263439, -2917680798, 1, -32, -32, -4, -3079682104, 3079682100, -4, -4, 16, -3079614864, 72, 88, -8, 0, -80, 88, -4, 0, -152, 4, 4, -4, -4, 72, 84, 0, 4, -8, 0, 3079614696, 4, 4, 4, 12, -12, -3079682028, 3079682024, 0, 0

In [75]:
#In the same way that we did with y_test, we reverse the predictions from their difference version to the original dataset.

for i in range(0, len(classified_data) - 1):
  classified_data[i] =  df_to_train_and_test[training_data_len + 999 + i + 1] - classified_data[i]

print(classified_data)

[3216124352, 3216124320, 3216124316, 136442212, 3216124312, 3216124308, 3216124304, 3216124320, 136509456, 136509528, 136509616, 136509608, 136509608, 136509528, 136509616, 136509612, 136509612, 136509460, 3055239466, 136509464, 136509460, 136509456, 136509528, 136509612, 136509612, 136509616, 136509608, 136509608, 3216124304, 3216124308, 3216124312, 3216124316, 3216124328, 3216124316, 136442288, 3216124312, 3216124312, 3216124312, 3216124312, 3054378672, 3052927632, 3216124312, 3216124316, 3052927554, 3054191027, 3216124351, 3216124352, 3216124320, 3216124316, 136442212, 3216124312, 3216124308, 3216124304, 3216124320, 136509456, 136509528, 136509616, 136509608, 136509608, 136509528, 136509616, 136509612, 136509612, 136509460, 3055239467, 136509464, 136509460, 136509456, 136509528, 136509612, 136509612, 136509616, 136509608, 136509608, 3216124304, 3216124308, 3216124312, 3216124316, 3216124328, 3216124316, 136442288, 3216124312, 3216124312, 3216124312, 3216124312, 3054378672, 305292763

In [77]:
#In this part we calculate the hit rate, the percentage of correct predictions.

successes = 0

for i in range(0,len(y_test)):
  if y_test[i] == classified_data[i]:
    successes = successes + 1

print('Hit rate = ', successes * 100 / len(y_test), '%')

Hit rate =  85.51388888888889 %


## Part 3: Making predictions

In [128]:
#In this last part we load the second dataset that will be used only for predictions.

df_to_test_2 = pd.read_excel('memrefs_testing_Sept.xlsx', header = None)

print(df_to_test_2)

                0
0      0xbfb229b4
1      0xbfb229b8
2      0xbfb229bc
3      0xbfb22a74
4       0x8249704
...           ...
49945  0xbfb22998
49946  0xbfb22994
49947  0xbfb22990
49948  0xbfb229a0
49949   0x822f810

[49950 rows x 1 columns]


In [129]:
#Again we use the decimal version.

df_to_test_2 = df_to_test_2[0].apply(int, base=16)

print(df_to_test_2)

0        3216124340
1        3216124344
2        3216124348
3        3216124532
4         136615684
            ...    
49945    3216124312
49946    3216124308
49947    3216124304
49948    3216124320
49949     136509456
Name: 0, Length: 49950, dtype: int64


In [130]:
#Once again transforming the series to an array and reshaping it.

data_to_test_2 = np.array(df_to_test_2)

print(data_to_test_2)

[3216124340 3216124344 3216124348 ... 3216124304 3216124320  136509456]


In [131]:
#In a similar way we find the difference values, transform it into an array and reshape it.

diff_values_2 = list()

for i in range(1, len(data_to_test_2)):
    value_2 = data_to_test_2[i] - data_to_test_2[i - 1]
    diff_values_2.append(value_2)

diff_values_2 = np.array(diff_values_2)    
diff_values_2 = diff_values_2.reshape(-1,1)

print(diff_values_2)

[[          4]
 [          4]
 [        184]
 ...
 [         -4]
 [         16]
 [-3079614864]]


In [132]:
#Again in the same way we create the X_test but this time there is a problem that could not be solved.
#We fill the X_test again with batches of 999 samples but this time the batches do not overlap at all.
#This means that we traverse diff_values_2 with a step of 999 and not 1 by 1.
#The original test dataset has enough data for this but the diff_values_2 is for 1 sample smaller.
#And in this case the last batch is not complete, but consists of only 998 samples, causing problems in the array creation.
#Because I could not find a way to tackle this problem I just take less data and miss the last prediction out of all 50.
#The rest of the X_test creation is similar like before.

X_test_2 = []

for i in range(0, len(diff_values_2) - 999, 999):
    X_test_2.append(diff_values_2[i: (i + 999)])

X_test_2 = np.array(X_test_2)
X_test_2 = np.reshape(X_test_2, newshape = (X_test_2.shape[0], X_test_2.shape[1], 1))

print(X_test_2, np.shape(X_test_2))

[[[          4]
  [          4]
  [        184]
  ...
  [ 3079614696]
  [          4]
  [          8]]

 [[         12]
  [        -12]
  [-3079682028]
  ...
  [ 3079576304]
  [-3079576364]
  [ 3079576364]]

 [[-3079576364]
  [         64]
  [ 3079576272]
  ...
  [         -4]
  [          8]
  [        -16]]

 ...

 [[          0]
  [         -8]
  [          4]
  ...
  [         -8]
  [        252]
  [        -36]]

 [[       -208]
  [      -1165]
  [       1101]
  ...
  [         -8]
  [        -12]
  [         32]]

 [[         -4]
  [        -12]
  [        144]
  ...
  [  163196680]
  [          4]
  [ -161933337]]] (49, 999, 1)


In [133]:
#Finally making the predictions for the second dataset.

predictions_2 = model.predict(X_test_2) 

print(predictions_2)

[[0.000000003751224   0.0000000031784895  0.000000008471139   ...
  0.0000000017002231  0.0000000038612464  0.0000000053807443 ]
 [0.00015188294       0.000086686654      0.00017931785       ...
  0.00000084254845    0.0000012388216     0.0000017689237    ]
 [0.0000005043207     0.000000789252      0.0000031603604     ...
  0.00000025116208    0.00000034909667    0.00000032540657   ]
 ...
 [0.0000009125842     0.00000069023037    0.0000016158573     ...
  0.00000208079       0.0000021776254     0.0000027828146    ]
 [0.000000497645      0.0000006903893     0.0000028533027     ...
  0.00000022380807    0.0000002531496     0.00000036657065   ]
 [0.0000000003052022  0.00000000014093113 0.000000001170147   ...
  0.0000000015112472  0.000000003965917   0.0000000052151083 ]]


In [134]:
#Once again classifying our predicted data.

classified_data_2 = []

for i in range(0, len(predictions_2)):
  maximum_2 = 0
  column_index_2 = 0
  temp_2 = 0
  for j in range(0, predictions_2.shape[1]):
    temp_2 = predictions_2[i][j]
    if temp_2 > maximum_2: 
      maximum_2 = temp_2
      column_index_2 = j
  classified_data_2.append(encoder.get_feature_names()[column_index_2]) 

print(classified_data_2)

['x0_4', 'x0_-4', 'x0_-4', 'x0_4', 'x0_-4', 'x0_4', 'x0_-4', 'x0_12', 'x0_-4', 'x0_-4', 'x0_-76', 'x0_4', 'x0_-4', 'x0_-4', 'x0_4', 'x0_4', 'x0_4', 'x0_-4', 'x0_-4', 'x0_-4', 'x0_-4', 'x0_-4', 'x0_-161745040', 'x0_4', 'x0_4', 'x0_-4', 'x0_-124', 'x0_4', 'x0_4', 'x0_0', 'x0_4', 'x0_84', 'x0_-4', 'x0_4', 'x0_4', 'x0_-4', 'x0_4', 'x0_-8', 'x0_-4', 'x0_-4', 'x0_-76', 'x0_4', 'x0_-4', 'x0_-4', 'x0_0', 'x0_3079612980', 'x0_4', 'x0_-4', 'x0_1263439']


In [135]:
#'Cleaning' the strings.

for i in range(0, len(predictions_2)):
  classified_data_2[i] = classified_data_2[i].replace('x0_', '')
  classified_data_2[i] = int(classified_data_2[i])

classified_data_2 = np.array(classified_data_2)

print(classified_data_2)

[         4         -4         -4          4         -4          4
         -4         12         -4         -4        -76          4
         -4         -4          4          4          4         -4
         -4         -4         -4         -4 -161745040          4
          4         -4       -124          4          4          0
          4         84         -4          4          4         -4
          4         -8         -4         -4        -76          4
         -4         -4          0 3079612980          4         -4
    1263439]


In [136]:
#Reversing the difference version to the original.

classified_data_2[0] = data_to_test_2[1000] - classified_data_2[0]

for i in range(1, len(classified_data_2) - 1):
    classified_data_2[i] =  data_to_test_2[i * 999 +1] - classified_data_2[i]
  
print(classified_data_2)  

[3216124324 3216124332  136547928  136442120  136616913 3216123964
 3216122228 3216124332 3216124320 3216124032 3216124356 3216123432
  136509616 3216124324 3216124236 3216124300  136509460 3216124196
 3216124352  136509616 3216124276 3216124204 3377869336 3216123984
 3216124528 3216124320 3216124468  136509998 3216123984 3216124244
 3216124020 3216124200  136509616 3216124264 3216124308 3216124024
  136442448 3216124336 3216124188 3216123976 3216122340 3216123940
 3216122256 3216124336 3216124320  136511056  136623460 3216123508
    1263439]


In [137]:
#Finding the final answers of the predictions by reversing them to hexademical.

final_answers = []

for i in range(0, len(classified_data_2)):
  value_3 = hex(classified_data_2[i])
  final_answers.append(value_3)

final_answers = np.array(final_answers)

print(final_answers)

['0xbfb229a4' '0xbfb229ac' '0x8238e58' '0x821f108' '0x8249bd1'
 '0xbfb2283c' '0xbfb22174' '0xbfb229ac' '0xbfb229a0' '0xbfb22880'
 '0xbfb229c4' '0xbfb22628' '0x822f8b0' '0xbfb229a4' '0xbfb2294c'
 '0xbfb2298c' '0x822f814' '0xbfb22924' '0xbfb229c0' '0x822f8b0'
 '0xbfb22974' '0xbfb2292c' '0xc9563218' '0xbfb22850' '0xbfb22a70'
 '0xbfb229a0' '0xbfb22a34' '0x822fa2e' '0xbfb22850' '0xbfb22954'
 '0xbfb22874' '0xbfb22928' '0x822f8b0' '0xbfb22968' '0xbfb22994'
 '0xbfb22878' '0x821f250' '0xbfb229b0' '0xbfb2291c' '0xbfb22848'
 '0xbfb221e4' '0xbfb22824' '0xbfb22190' '0xbfb229b0' '0xbfb229a0'
 '0x822fe50' '0x824b564' '0xbfb22674' '0x13474f']


In [89]:
### At this point it would be nice to just open the existing excel file and write the predictions one by one in the empty cells.
### However, compared to other languages like java and c, this is a bit tricky and unusual.
### As a result we will do the last thing manually and not automatically.

### Thank you for reading.
### Athanasios Papanikolaou 2337 atpapanikolaou@e-ce.uth.gr