In [7]:
import csv
import numpy as np
from pickle import dump
import os
from sklearn.model_selection import train_test_split
from Preprocessing import scale_features
from Preprocessing import create_features

# Seed NumPy's global random number generator.
np.random.seed(1258)  # for reproducibility

# Directory that receives the train/validation/test .npz files written below.
save_file_path = '../data/split-train-val-test/'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates a
# missing parent directory and does not fail if the directory already exists
# (or is created by another process between the check and the call).
os.makedirs(save_file_path, exist_ok=True)

# Data contains two datasets.
#
# Load the first five columns of every data row of the training CSV, then
# derive the list of sequences and the two-column label matrix from them.
print('Loading data')
# newline='' is what the csv module asks for, so that line endings embedded in
# quoted fields are handed to the parser untouched.
with open('../data/TrainingsDataV2.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    # Skip the header row up front instead of storing it and popping it later.
    next(csv_reader, None)
    # Explicit indexing keeps the original strictness: a row with fewer than
    # five columns raises IndexError rather than being silently accepted.
    data = [[row[0], row[1], row[2], row[3], row[4]] for row in csv_reader]
if not data:
    raise ValueError('../data/TrainingsDataV2.csv contains no data rows')

# Column 2 holds the values later passed to create_features as `sequences`.
sequences = [row[2] for row in data]
data = np.array(data)
# Column 4 is parsed as float and used as the first label column.
y0 = data[:, 4].astype(np.float64)
# Two label columns: y0 and its complement 1 - y0.
y = np.column_stack([y0, 1 - y0])

# Calculating 42 features
# Splitting into train, validation, and test set

# Defines the sequence window size and steps (stride length). Changing these is as easy as changing their values.
SEQUENCE_WINDOW = 5
STEPS = 1
# NOTE(review): presumably the sequence length expected by create_features;
# confirm against Preprocessing.create_features.
LENGTH = 40

# Calculate features
print('Calculating features!')
features = create_features(sequences, SEQUENCE_WINDOW, STEPS, LENGTH)

# Save the features (only `features` is pickled here; the labels in `y` are not).
# The context manager guarantees the file is flushed and closed, instead of
# leaving an anonymous open handle for the garbage collector to clean up.
with open('features_OnlyPlants.pkl', 'wb') as features_file:
    dump(features, features_file)

# First hold out 10 % of all samples as the test set, then take 22 % of the
# remaining samples as the validation set. Both splits are stratified on the
# labels and use a fixed random_state, so they are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.22,
    random_state=42,
    stratify=y_train,
)

# Scale every split with scale_features, in the order train, test, validation.
# NOTE(review): each split is passed to scale_features on its own; if that
# helper derives its scaling statistics from its input, the three splits end up
# on different scales. Confirm in Preprocessing.scale_features.
X_train_scaled, X_test_scaled, X_val_scaled = (
    scale_features(subset) for subset in (X_train, X_test, X_val)
)
# Shape of the first scaled training sample, printed as a sanity check.
print(X_train_scaled[0].shape)

print('Saving train-validation-test sequences and labels')
# (file name, array) pairs, listed in the order in which they are written.
# Each array is stored as a compressed .npz archive inside save_file_path.
npz_outputs = (
    ('train-features-scaled.npz', X_train_scaled),
    ('train-labels.npz', y_train),
    ('validation-features-scaled.npz', X_val_scaled),
    ('validation-labels.npz', y_val),
    ('test-features-scaled.npz', X_test_scaled),
    ('test-labels.npz', y_test),
)
for npz_name, npz_array in npz_outputs:
    np.savez_compressed(save_file_path + npz_name, npz_array)

Loading data
Calculating features!
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
(36, 42)
Saving train-validation-test sequences and labels


In [6]:
!pip install protfasta

Collecting protfasta
  Downloading protfasta-0.1.12.tar.gz (142 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.6/142.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: protfasta
  Building wheel for protfasta (setup.py) ... [?25ldone
[?25h  Created wheel for protfasta: filename=protfasta-0.1.12-py3-none-any.whl size=120135 sha256=bf438f72f987c8213029c346b8aa18946607342671e69c425d83788c14b6d77d
  Stored in directory: /home/vspande/.cache/pip/wheels/f8/bd/7f/79ade79ffe632ddc2a8c5962f952b7b8712b6403de165f46cc
Successfully built protfasta
Installing collected packages: protfasta
Successfully installed protfasta-0.1.12
