In [1]:
import pandas as pd

# Read data from the Excel file
data = pd.read_excel('raw_data.xlsx')

In [2]:
print('Data shape:', data.shape)
print(data.head())

Data shape: (63231, 34)
                timestamp      x1        x2        x3  x4  x5  x6  x7  \
0 2020-03-16 00:01:11.974  medium  1.199567  0.198783   2   0  19 -25   
1 2020-03-16 00:01:20.332  medium  1.338867  0.138917  16   0  22 -14   
2 2020-03-16 00:01:27.014  medium  1.450233  0.111100 -16   1   9 -16   
3 2020-03-16 00:01:36.200  medium  1.603333  0.152917 -21  16   9 -25   
4 2020-03-16 00:01:39.376  medium  1.656267  0.052800  -1  11  14 -16   

         x8        x9  ...  x24  x25        x26      x27     x28      x29  \
0  24500000  36550000  ...  0.0  0.0  18.136364        0       0   500000   
1  19000000  40000000  ...  0.0  0.0  17.421686  1500000       0  2000000   
2  15500000  28200000  ...  0.0  0.0  17.503649        0       0  3500000   
3  19500000  33100000  ...  0.0  0.0  15.133758        0  500000   500000   
4  14900000  28368000  ...  0.0  0.0  17.166666        0       0  1000000   

   x30  x31      x32      x33  
0    0    0  1500000  -47.261  
1    0    

In [3]:
data = data.drop(['timestamp'], axis=1)
print('Data shape:', data.shape)


Data shape: (63231, 33)


In [4]:
# Check for missing values
print(data.isnull().sum())

x1     0
x2     0
x3     0
x4     0
x5     0
x6     0
x7     0
x8     0
x9     0
x10    0
x11    0
x12    0
x13    0
x14    0
x15    0
x16    0
x17    0
x18    0
x19    0
x20    0
x21    0
x22    0
x23    0
x24    0
x25    0
x26    0
x27    0
x28    0
x29    0
x30    0
x31    0
x32    0
x33    0
dtype: int64


In [5]:
data.head

<bound method NDFrame.head of            x1           x2        x3  x4  x5  x6  x7        x8        x9  x10  \
0      medium     1.199567  0.198783   2   0  19 -25  24500000  36550000   43   
1      medium     1.338867  0.138917  16   0  22 -14  19000000  40000000   34   
2      medium     1.450233  0.111100 -16   1   9 -16  15500000  28200000   28   
3      medium     1.603333  0.152917 -21  16   9 -25  19500000  33100000   34   
4      medium     1.656267  0.052800  -1  11  14 -16  14900000  28368000   29   
...       ...          ...       ...  ..  ..  ..  ..       ...       ...  ...   
63226  medium  1204.058100  0.644683 -25  -3   4 -44  42025000  47934000   66   
63227  medium  1205.160600  1.102483  -6  -9  10 -16  34000000  41437000   49   
63228  medium  1205.694000  0.533117  13   1  16  -6  18900000  30270000   31   
63229  medium  1205.934000  0.239050   3   1  19 -22  20000000  34170000   28   
63230  medium  1206.578900  0.644533 -11  -6   3 -22  13000000  37501000   21  

# Great! Since data has no missing values in any of the columns, proceed directly to the next steps in preparing for training the CNN

In [11]:
# Get unique categories in x1
categories = data['x1'].unique()
num_categories = len(categories)
print(f'Number of categories in x1: {num_categories}')
print(f'Categories: {categories}')



Number of categories in x1: 3
Categories: ['medium' 'low' 'high']


In [12]:
# One-hot encode the 'x1' column
data = pd.get_dummies(data, columns=['x1'])


In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit the scaler to the data and transform
data_scaled = scaler.fit_transform(data)

# Convert the scaled data back to a DataFrame
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)

# Create Input Sequences and Targets

In [18]:
import numpy as np

sequence_length = 32
X = []
y = []

for i in range(len(data_scaled) - sequence_length):
    X.append(data_scaled.iloc[i:i+sequence_length].values)
    y.append(data_scaled.iloc[i+sequence_length].values)

X = np.array(X)
y = np.array(y)

print('X shape:', X.shape)  # Expected: (samples, 32, num_features)
print('y shape:', y.shape)  # Expected: (samples, num_features)

X shape: (63199, 32, 35)
y shape: (63199, 35)
