In [1]:
import os
import io
import sys
import re
import time
import math
import pickle

import numpy as np
import pandas as pd

from numpy import nan as Nan
from numpy import inf as inf
from tqdm import notebook as tqdm
from scipy.sparse import csr_matrix
from data_preprocessing import *

In [3]:
def make_features(path, index):
    """Load the feature files found in ``path`` and count their lost rows.

    Every directory entry is visited in sorted order:

    * entries whose name contains ``ipynb_checkpoints`` are skipped;
    * entries whose name contains ``SOAP`` are loaded, converted to a
      ``csr_matrix``, checked with ``nan_and_inf_finder_SOAP`` and pickled
      under ``<cwd>/groups_and_oxi_states_15_traj/df_step_<index>/sparse_features``;
      they are NOT part of the returned lists;
    * every other entry is loaded with ``np.load``, checked with
      ``nan_and_inf_finder`` and passed to ``save_sparse_features``.

    Parameters
    ----------
    path : str
        Directory that contains the feature files.
    index : int
        Step number; used in the progress message and in the SOAP save path.

    Returns
    -------
    tuple of (list, list, list)
        ``(files, feature_list, cnt_of_nan_features)``. The three lists are
        parallel: position ``i`` holds the file name, the loaded array and the
        lost-row count of the same feature file.
    """
    # Names of the files whose arrays end up in ``feature_list``; kept separately
    # from the raw directory listing so the returned lists stay index-aligned.
    loaded_files = list()
    feature_list = list()
    cnt_of_nan_features = list()
    print('Features for {}'.format(index))
    for file in sorted(os.listdir(path)):
        # Checkpoint entries are not features. The original used the bare name
        # `next` here, which is a no-op expression, so execution fell through
        # to the bookkeeping below with undefined or stale variables.
        if re.search('ipynb_checkpoints', file):
            continue
        print(file)
        # remove the .npy extension
        filename = file[0:-4]
        file_path = '{}/{}'.format(path, file)
        if re.search('SOAP', file):
            with open(file_path, 'rb') as source:
                loaded = np.load(io.BytesIO(source.read()), allow_pickle=True)
            # presumably `.all()` unwraps a pickled 0-d object array - TODO confirm
            features = csr_matrix(loaded.all())
            lost_features_count, _ = nan_and_inf_finder_SOAP(features)
            # save the sparse representation
            # NOTE(review): this directory name differs from the
            # 'groups_and_oxi_states_5_frames' paths used by the rest of the
            # notebook - confirm it is intentional.
            save_path = os.path.join(os.getcwd(), 'groups_and_oxi_states_15_traj/df_step_{}/sparse_features/{}.pkl'.format(index, filename))
            with open(save_path, 'wb') as save_file:
                pickle.dump(features, save_file)
        else:
            features = np.load(file_path, allow_pickle=True)
            lost_features_count, _ = nan_and_inf_finder(features)
            # create a sparse representation for each feature
            save_sparse_features(features, filename)
            loaded_files.append(file)
            feature_list.append(features)
            cnt_of_nan_features.append(lost_features_count)
        print(features.shape)
        if lost_features_count != 0:
            print("{} rows are lost in the feature: {}".format(lost_features_count, file))
    return loaded_files, feature_list, cnt_of_nan_features


In [4]:
# For every step: load the label table, attach each loaded feature array as a
# column of per-row lists, remember which feature files lost rows, and save
# the combined table.
nan_features = set()
for number in range(5):
    path_to_csv = 'groups_and_oxi_states_5_frames/df_step_{}.pkl'.format(number)
    path_to_folder = 'groups_and_oxi_states_5_frames/df_step_{}/features'.format(number)
    path_to_save = 'groups_and_oxi_states_5_frames/df_features_step_{}.pkl'.format(number)

    labels_df = load_csv(path_to_csv)
    files, feature_list, cnt_of_nan_features = make_features(path_to_folder, number)

    # Walk the three lists in lockstep; the column name is the feature file name.
    for file_name, feature, lost_rows in zip(files, feature_list, cnt_of_nan_features):
        labels_df[file_name] = feature.tolist()
        if lost_rows != 0:
            nan_features.add(file_name)

    save_csv(labels_df, path_to_save)

Features for 0
ape_features_mode-structure_CAMN.npy
(121, 5)
bc_features_mode-structure_CAMN.npy
(121, 1)
bf_features_mode-structure_CAMN.npy
(121, 43)
co_features_mode-structure_CAMN.npy
(121, 3)
44 rows are lost in the feature: co_features_mode-structure_CAMN.npy
density_features_mode-structure_CAMN.npy
(121, 3)
ee_features_mode-structure_CAMN.npy
(121, 1)
end_features_mode-structure.npy
(121, 5)
gii_features_rcut-20_mode-structure.npy
(121, 1)
121 rows are lost in the feature: gii_features_rcut-20_mode-structure.npy
md_features_mode-structure_CAMN.npy
(121, 120)
mpe_features_mode-structure_CAMN.npy
(121, 1)
os_features_mode-structure_CAMN.npy
(121, 4)
rdf_features_cutoff-10_binsize-0.1_mode-structure_CAMN.npy
(121, 100)
sc_features_mode-structure.npy
(121, 2)
scm_features_mode-structure.npy
(121, 160)
sh_features_mode-structure.npy
(121, 9)
44 rows are lost in the feature: sh_features_mode-structure.npy
vo_features_mode-structure.npy
(121, 8)
xrd_features_pattern_length-451_mode-str

In [5]:
def load_csv(path):
    """Unpickle and return the object stored at ``path``.

    Despite the name, the file is read with ``pickle``, not as CSV.

    Parameters
    ----------
    path : str
        File location; joined onto the current working directory, so an
        absolute path is used as-is.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    save_path = os.path.join(os.getcwd(), path)
    # The context manager closes the handle even when unpickling fails.
    # CAUTION: pickle.load can execute arbitrary code - only use on trusted files.
    with open(save_path, 'rb') as open_file:
        return pickle.load(open_file)


# One pickled feature table per step (steps 0-4), kept in step order.
dfs = [
    load_csv(f"groups_and_oxi_states_5_frames/df_features_step_{i}.pkl")
    for i in range(5)
]


In [8]:
# Columns removed from every step table before it is handed to make_plane.
columns_to_discard = [
    'stru_traj',
    'structure_A', 'structure_AM', 'structure_CAN', 'structure_CAMN',
    'structure_A40', 'structure_AM40', 'structure_CAN40', 'structure_CAMN40',
]

# Union, over all steps, of the columns that make_plane reports back.
grand_zero_columns = set()
for position, frame in enumerate(dfs):
    dfs[position] = frame.drop(columns=columns_to_discard)
    # Also remove every feature file that lost rows in any step.
    dfs[position] = dfs[position].drop(columns=nan_features)
    print(dfs[position].shape)
    dfs[position], zero_columns = make_plane(dfs[position])
    grand_zero_columns.update(zero_columns)

(121, 17)
(121, 17)
(121, 17)
(121, 17)


(121, 17)


In [10]:
# Groups whose rows receive is_good == True; rows of every other remaining
# group receive False.
GOOD_GROUPS = ['group_A', 'group_B', 'group_E']

# For every step table: drop the collected zero columns, discard group_D rows,
# replace the 'group' label with the boolean 'is_good' column, and save.
# NOTE: the result is written to the same file the table was loaded from, so
# this cell is not safe to run twice on the same files ('group' is gone).
for i, df in enumerate(dfs):
    kept = df.drop(grand_zero_columns, axis=1)
    # Explicit copy: assigning a column to a boolean-filtered frame otherwise
    # triggers pandas' SettingWithCopyWarning.
    kept = kept[kept['group'] != 'group_D'].copy()
    kept['is_good'] = kept['group'].isin(GOOD_GROUPS)
    dfs[i] = kept.drop(['group'], axis=1)
    path_to_save = 'groups_and_oxi_states_5_frames/df_features_step_{}.pkl'.format(i)
    print(path_to_save)
    save_csv(dfs[i], path_to_save)

groups_and_oxi_states_5_frames/df_features_step_0.pkl
groups_and_oxi_states_5_frames/df_features_step_1.pkl
groups_and_oxi_states_5_frames/df_features_step_2.pkl
groups_and_oxi_states_5_frames/df_features_step_3.pkl
groups_and_oxi_states_5_frames/df_features_step_4.pkl
