In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

Load data

In [6]:
def _read_cell_ids(csv_path):
    """Read a header-less CSV and return all of its entries as a flat list of strings."""
    id_frame = pd.read_csv(csv_path, header=None)
    id_array = id_frame.to_numpy(dtype=str)
    return id_array.reshape(-1,).tolist()


# Cell IDs that make up each data split.
training_cells = _read_cell_ids("../training_V2.csv")
test_in_cells = _read_cell_ids("../test_in_V2.csv")
test_out_cells = _read_cell_ids("../test_out_V2.csv")
# This cell has some issues on week 1 rpt, thus omitted
test_out_cells.remove('G40C3')

# Import interpolated data for knot-point approach and end-to-end
Q_data = pd.read_csv("../NMC_data_V2_interp_clean.csv")

# One feature table per feature-set file (sets 1, 2, 3, 5 and 6), read in that order.
(
    feature_table_1,
    feature_table_2,
    feature_table_3,
    feature_table_5,
    feature_table_6,
) = (
    pd.read_csv(table_path)
    for table_path in (
        "feature_all_1.csv",
        "feature_all_2.csv",
        "feature_all_3.csv",
        "feature_all_5.csv",
        "feature_all_6.csv",
    )
)

Create empty NumPy arrays for the features and the Ah-throughput

In [7]:
# Number of cells in each split; these are the row counts of the arrays allocated below.
num_training_cells = len(training_cells)
num_test_in_cells = len(test_in_cells)
num_test_out_cells = len(test_out_cells)

# Columns of the feature tables that are not features.
_NON_FEATURE_COLUMNS = ('Group', 'Cell', 'Lifetime')


def _feature_columns(table):
    """Return the feature column names of ``table`` in their original order.

    Parameters
    ----------
    table : pandas.DataFrame
        A feature table that contains the columns in ``_NON_FEATURE_COLUMNS``.

    Returns
    -------
    list
        All column names of ``table`` except the non-feature columns.

    Raises
    ------
    ValueError
        If any of the non-feature columns is missing from ``table``.
    """
    missing = [name for name in _NON_FEATURE_COLUMNS if name not in table.columns]
    if missing:
        raise ValueError(f"feature table is missing expected column(s): {missing}")
    return [name for name in table.columns if name not in _NON_FEATURE_COLUMNS]


# Remove non-feature columns from feature list
feature_list_1 = _feature_columns(feature_table_1)
feature_list_2 = _feature_columns(feature_table_2)
feature_list_3 = _feature_columns(feature_table_3)
feature_list_5 = _feature_columns(feature_table_5)
feature_list_6 = _feature_columns(feature_table_6)

# Width of the N_* arrays; each row later receives one cell's 'Ah_throughput' values.
# NOTE(review): presumably the number of interpolated points per cell in Q_data -- confirm.
num_Q = 21
# Feature count of table 1. Each X_* array below is sized from its own feature
# list, so tables with different numbers of features are handled correctly.
num_features = len(feature_list_1)

_feature_lists = (feature_list_1, feature_list_2, feature_list_3, feature_list_5, feature_list_6)


def _allocate_feature_arrays(num_cells):
    """Return one uninitialised float array of shape (num_cells, n_features) per feature table.

    The arrays are returned in the order of tables 1, 2, 3, 5, 6. Their contents
    are arbitrary until every row has been assigned.
    """
    return tuple(np.empty((num_cells, len(columns))) for columns in _feature_lists)


# Create variables for training input as well as training labels for different approaches
N_train = np.empty((num_training_cells, num_Q))
X_train_1, X_train_2, X_train_3, X_train_5, X_train_6 = _allocate_feature_arrays(num_training_cells)

# Create variables for test inputs
X_test_in_1, X_test_in_2, X_test_in_3, X_test_in_5, X_test_in_6 = _allocate_feature_arrays(num_test_in_cells)
N_test_in = np.empty((num_test_in_cells, num_Q))

X_test_out_1, X_test_out_2, X_test_out_3, X_test_out_5, X_test_out_6 = _allocate_feature_arrays(num_test_out_cells)
N_test_out = np.empty((num_test_out_cells, num_Q))

Loop through the training cells and both test sets to fill the feature and Ah-throughput arrays

In [8]:
# (feature table, feature column list) pairs in the order 1, 2, 3, 5, 6.
_FEATURE_SOURCES = (
    (feature_table_1, feature_list_1),
    (feature_table_2, feature_list_2),
    (feature_table_3, feature_list_3),
    (feature_table_5, feature_list_5),
    (feature_table_6, feature_list_6),
)


def _feature_row(table, columns, cell):
    """Return the feature values of ``cell`` from ``table`` as a 1-D array.

    Raises
    ------
    ValueError
        If ``table`` does not contain exactly one row whose 'Cell' equals ``cell``.
    """
    rows = table.loc[table['Cell'] == cell, columns].to_numpy()
    if rows.shape[0] != 1:
        raise ValueError(
            f"expected exactly one feature row for cell {cell!r}, found {rows.shape[0]}"
        )
    return rows[0]


def _cell_throughput(cell, num_points):
    """Return the absolute 'Ah_throughput' values recorded in ``Q_data`` for ``cell``.

    Raises
    ------
    ValueError
        If the number of values differs from ``num_points``. Without this check a
        cell with a single value would be silently broadcast across the whole row.
    """
    values = Q_data.loc[Q_data['cellID'] == cell, 'Ah_throughput'].to_numpy()
    if values.shape[0] != num_points:
        raise ValueError(
            f"expected {num_points} Ah_throughput values for cell {cell!r}, "
            f"found {values.shape[0]}"
        )
    # abs() to ensure the first point is nonnegative (very small negative numbers due to interpolation)
    return np.abs(values)


def _fill_split(cells, feature_arrays, throughput_array):
    """Fill the pre-allocated arrays of one data split in place, one row per cell.

    Parameters
    ----------
    cells : list
        Cell IDs of the split; row ``i`` of every array corresponds to ``cells[i]``.
    feature_arrays : sequence of numpy.ndarray
        One array per entry of ``_FEATURE_SOURCES``, in the same order.
    throughput_array : numpy.ndarray
        Array receiving the Ah-throughput values of each cell.
    """
    for row, cell in enumerate(cells):
        for (table, columns), features in zip(_FEATURE_SOURCES, feature_arrays):
            features[row] = _feature_row(table, columns, cell)
        throughput_array[row] = _cell_throughput(cell, throughput_array.shape[1])


_fill_split(
    training_cells,
    (X_train_1, X_train_2, X_train_3, X_train_5, X_train_6),
    N_train,
)
_fill_split(
    test_in_cells,
    (X_test_in_1, X_test_in_2, X_test_in_3, X_test_in_5, X_test_in_6),
    N_test_in,
)
_fill_split(
    test_out_cells,
    (X_test_out_1, X_test_out_2, X_test_out_3, X_test_out_5, X_test_out_6),
    N_test_out,
)


Standardize the features and fit PCA on the training set, then apply the same scaling and PCA transform to both test sets

In [9]:
def _scale_and_project(X_train, X_test_in, X_test_out, n_components=10, random_state=0):
    """Standardize one feature set and project it onto its principal components.

    The scaler and the PCA model are fitted on the training features only; both
    test sets are transformed with the fitted objects.

    Parameters
    ----------
    X_train, X_test_in, X_test_out : numpy.ndarray
        Feature arrays of the training, test-in and test-out splits.
    n_components : int
        Number of principal components to keep.
    random_state : int
        Seed passed to PCA so that reruns give identical components should
        scikit-learn select a randomized SVD solver.

    Returns
    -------
    tuple
        ``(scaler, pca, scaled, projected)`` where ``scaled`` and ``projected``
        are 3-tuples ordered (train, test_in, test_out).
    """
    scaler = StandardScaler()
    scaled = (
        scaler.fit_transform(X_train),
        scaler.transform(X_test_in),
        scaler.transform(X_test_out),
    )
    pca = PCA(n_components=n_components, random_state=random_state)
    projected = (
        pca.fit_transform(scaled[0]),
        pca.transform(scaled[1]),
        pca.transform(scaled[2]),
    )
    return scaler, pca, scaled, projected


# Feature set 1
(
    X_scaler_1,
    PCA_model_1,
    (X_train_1_scaled, X_test_in_1_scaled, X_test_out_1_scaled),
    (X_train_1_PCA, X_test_in_1_PCA, X_test_out_1_PCA),
) = _scale_and_project(X_train_1, X_test_in_1, X_test_out_1)

# Feature set 2
(
    X_scaler_2,
    PCA_model_2,
    (X_train_2_scaled, X_test_in_2_scaled, X_test_out_2_scaled),
    (X_train_2_PCA, X_test_in_2_PCA, X_test_out_2_PCA),
) = _scale_and_project(X_train_2, X_test_in_2, X_test_out_2)

# Feature set 3
(
    X_scaler_3,
    PCA_model_3,
    (X_train_3_scaled, X_test_in_3_scaled, X_test_out_3_scaled),
    (X_train_3_PCA, X_test_in_3_PCA, X_test_out_3_PCA),
) = _scale_and_project(X_train_3, X_test_in_3, X_test_out_3)

# Feature set 5
(
    X_scaler_5,
    PCA_model_5,
    (X_train_5_scaled, X_test_in_5_scaled, X_test_out_5_scaled),
    (X_train_5_PCA, X_test_in_5_PCA, X_test_out_5_PCA),
) = _scale_and_project(X_train_5, X_test_in_5, X_test_out_5)

# Feature set 6
(
    X_scaler_6,
    PCA_model_6,
    (X_train_6_scaled, X_test_in_6_scaled, X_test_out_6_scaled),
    (X_train_6_PCA, X_test_in_6_PCA, X_test_out_6_PCA),
) = _scale_and_project(X_train_6, X_test_in_6, X_test_out_6)

Save all data

In [10]:
# Arrays to persist, keyed by output file name; insertion order is the write order.
_csv_outputs = {
    "X_train_1_PCA.csv": X_train_1_PCA,
    "X_train_2_PCA.csv": X_train_2_PCA,
    "X_train_3_PCA.csv": X_train_3_PCA,
    "X_train_5_PCA.csv": X_train_5_PCA,
    "X_train_6_PCA.csv": X_train_6_PCA,
    "X_test_in_1_PCA.csv": X_test_in_1_PCA,
    "X_test_in_2_PCA.csv": X_test_in_2_PCA,
    "X_test_in_3_PCA.csv": X_test_in_3_PCA,
    "X_test_in_5_PCA.csv": X_test_in_5_PCA,
    "X_test_in_6_PCA.csv": X_test_in_6_PCA,
    "X_test_out_1_PCA.csv": X_test_out_1_PCA,
    "X_test_out_2_PCA.csv": X_test_out_2_PCA,
    "X_test_out_3_PCA.csv": X_test_out_3_PCA,
    "X_test_out_5_PCA.csv": X_test_out_5_PCA,
    "X_test_out_6_PCA.csv": X_test_out_6_PCA,
    "N_train.csv": N_train,
    "N_test_in.csv": N_test_in,
    "N_test_out.csv": N_test_out,
}

# Write every array as a comma-separated text file in the working directory.
for _csv_name, _csv_array in _csv_outputs.items():
    np.savetxt(_csv_name, _csv_array, delimiter=",")
