In [1]:
import os
import io
import sys
import re
import time
import math
import pickle

import numpy as np
import pandas as pd

from numpy import nan as Nan
from numpy import inf as inf
from tqdm import notebook as tqdm
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


In [2]:
from data_preprocessing import *

In [3]:
def make_features(path):
    """Load the feature files found in ``path`` and collect the dense ones.

    Directory entries are visited in sorted order. For every processed entry
    the file name and the shape of the loaded features are printed, plus a
    message when the NaN/inf finder reports a non-zero lost-row count.

    * Entries whose name contains ``ipynb_checkpoints`` are skipped.
    * Entries whose name contains ``SOAP`` are loaded, converted to a
      ``csr_matrix`` and pickled to
      ``mpr_test/sparse_features/<name>.pkl`` below the current working
      directory. They are NOT included in the returned lists.
    * Every other entry is loaded with ``np.load``, passed to
      ``save_sparse_features`` and recorded in the returned lists.

    Parameters
    ----------
    path : str
        Directory that contains the ``.npy`` feature files.

    Returns
    -------
    files : list of str
        Names of the dense feature files, in sorted order.
    feature_list : list
        The object loaded from each file in ``files``.
    cnt_of_nan_features : list
        First value returned by ``nan_and_inf_finder`` for each file.

    The three lists are parallel: position ``i`` always refers to the same
    file, so callers can safely index them together.
    """
    files = []
    feature_list = []
    cnt_of_nan_features = []

    for file in sorted(os.listdir(path)):
        # Notebook checkpoint folders are not feature files; skip them before
        # anything below can reuse values left over from the previous file.
        if re.search('ipynb_checkpoints', file):
            continue

        print(file)
        # remove the .npy extension
        filename = file[0:-4]
        file_path = os.path.join(path, file)
        is_soap = re.search('SOAP', file) is not None

        # NOTE(security): allow_pickle=True lets np.load execute pickled code;
        # only point this function at feature files you trust.
        if is_soap:
            with open(file_path, 'rb') as source:
                loaded = np.load(io.BytesIO(source.read()), allow_pickle=True)
            # An object saved with np.save comes back wrapped in a 0-d object
            # array. `.item()` unwraps it on every NumPy version, whereas
            # `.all()` returns a plain bool for object arrays on NumPy >= 2.
            if loaded.dtype == object and loaded.ndim == 0:
                loaded = loaded.item()
            features = csr_matrix(loaded)
            lost_features_count, _ = nan_and_inf_finder_SOAP(features)
            # save the sparse representation
            save_path = os.path.join(os.getcwd(), 'mpr_test/sparse_features/{}.pkl'.format(filename))
            with open(save_path, 'wb') as save_file:
                pickle.dump(features, save_file)
        else:
            features = np.load(file_path, allow_pickle=True)
            lost_features_count, _ = nan_and_inf_finder(features)
            # create a sparse representation for each feature
            save_sparse_features(features, filename)

        print(features.shape)
        if lost_features_count != 0:
            print("{} rows are lost in the feature: {}".format(lost_features_count, file))

        if not is_soap:
            files.append(file)
            feature_list.append(features)
            cnt_of_nan_features.append(lost_features_count)

    return files, feature_list, cnt_of_nan_features


In [4]:
# Attach every loaded feature array to the label table (one list-valued column
# per feature file) and remember which features reported lost rows.
nan_features = set()
path_to_csv = 'mpr_test/testing_structures_oxi_state.pkl'
path_to_folder = 'mpr_test/features'
path_to_save = 'mpr_test/structure_with_features.pkl'

labels_df = load_csv(path_to_csv)
files, feature_list, cnt_of_nan_features = make_features(path_to_folder)
for i, (file, feature, lost_count) in enumerate(zip(files, feature_list, cnt_of_nan_features)):
    labels_df[file + str(i)] = feature.tolist()
    if lost_count != 0:
        # NOTE(review): the name recorded here ('feature_<i>') differs from the
        # column created above ('<file name><i>') - confirm which one the
        # later `df.drop(nan_features, ...)` is supposed to match.
        nan_features.add('feature_' + str(i))

# Write the table once, after all columns are attached, instead of re-pickling
# the whole frame on every loop iteration.
save_csv(labels_df, path_to_save)

ape_features_mode-structure_CAMN.npy
(70, 5)
bc_features_mode-structure_CAMN.npy
(70, 1)
bf_features_mode-structure_CAMN.npy
(70, 43)
co_features_mode-structure_CAMN.npy
(70, 3)
4 rows are lost in the feature: co_features_mode-structure_CAMN.npy
density_features_mode-structure_CAMN.npy
(70, 3)
ee_features_mode-structure_CAMN.npy
(70, 1)
end_features_mode-structure.npy
(70, 5)
gii_features_rcut-20_mode-structure.npy
(70, 1)
70 rows are lost in the feature: gii_features_rcut-20_mode-structure.npy
md_features_mode-structure_CAMN.npy
(70, 120)
mpe_features_mode-structure_CAMN.npy
(70, 1)
os_features_mode-structure_CAMN.npy
(70, 4)
rdf_features_cutoff-10_binsize-0.1_mode-structure_CAMN.npy
(70, 100)
sc_features_mode-structure.npy
(70, 2)
scm_features_mode-structure.npy
(70, 160)
sh_features_mode-structure.npy
(70, 9)
4 rows are lost in the feature: sh_features_mode-structure.npy
vo_features_mode-structure.npy
(70, 8)
xrd_features_pattern_length-451_mode-structure_CAMN.npy
(70, 451)
yss_feat

In [27]:
def load_csv(path):
    """Unpickle and return the object stored at ``path``.

    Despite the name, the file is read with ``pickle``, not parsed as CSV.

    Parameters
    ----------
    path : str
        Location of the pickle file. It is joined onto the current working
        directory, so relative paths are resolved from there and absolute
        paths are used as given.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    save_path = os.path.join(os.getcwd(), path)
    # NOTE(security): unpickling runs arbitrary code from the file; only load
    # pickles that come from a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(save_path, 'rb') as open_file:
        return pickle.load(open_file)


# Load the pickled table of starting structures with their features.
# NOTE(review): this is a different file from 'mpr_test/structure_with_features.pkl'
# written by the feature-attachment cell - confirm it is the intended input.
df = load_csv("groups_and_oxi_states_starting_structure/starting_structure_with_features.pkl")


In [30]:
def make_plane(labels_df):
    """Flatten sequence-valued feature columns into one scalar column each.

    Every column other than ``'label'``, ``'src_id'`` and ``'group'`` is
    treated as a feature whose cells hold equal-length 1-D sequences (scalar
    cells are accepted and treated as length-1 sequences). Each component
    becomes its own column, named with consecutive integers starting at 0 and
    continuing across features in column order. The number of feature columns
    is printed as a one-element shape tuple.

    Parameters
    ----------
    labels_df : pandas.DataFrame
        Table with ``'label'``, ``'src_id'`` and ``'group'`` columns plus the
        feature columns. It is not modified.

    Returns
    -------
    flat_df : pandas.DataFrame
        The integer-named component columns followed by the non-feature
        columns, on the same row index as ``labels_df``.
    zero_columns : list
        Labels of all columns in ``flat_df`` that hold exactly one distinct
        non-null value.

    Raises
    ------
    ValueError
        If the cells of a feature column do not stack into a 2-D array
        (e.g. nested sequences).
    """
    features = labels_df.drop(['label', 'src_id', 'group'], axis=1).columns
    print(features.shape)

    blocks = []
    last_index = 0
    for feature in features:
        rows = [np.atleast_1d(value) for value in labels_df[feature].to_numpy()]
        # One row per sample, one column per feature component.
        matrix = np.array(rows) if rows else np.empty((0, 0))
        if matrix.ndim != 2:
            raise ValueError(
                "feature column {!r} must hold 1-D sequences, got cells of shape {}".format(
                    feature, matrix.shape[1:]))
        width = matrix.shape[1]
        # Reuse the caller's row index so the final concat lines rows up by
        # position even when labels_df does not carry a default RangeIndex.
        blocks.append(pd.DataFrame(matrix,
                                   index=labels_df.index,
                                   columns=range(last_index, last_index + width)))
        last_index += width

    # Concatenate once instead of growing a frame inside the loop.
    if blocks:
        expanded = pd.concat(blocks, axis=1)
    else:
        expanded = pd.DataFrame(index=labels_df.index)

    flat_df = pd.concat([expanded, labels_df.drop(features, axis=1)], axis=1)

    # nunique() ignores NaN, so an all-NaN column (0 distinct values) is not reported.
    zero_columns = [column for column in flat_df.columns
                    if flat_df[column].nunique() == 1]
    return flat_df, zero_columns

In [31]:
# Accumulates the constant-column labels reported by make_plane.
grand_zero_columns = set()

# Drop the `starting_structure` / `structure_*` columns by name.
df = df.drop(
    columns=[
        'starting_structure',
        'structure_A',
        'structure_AM',
        'structure_CAN',
        'structure_CAMN',
        'structure_A40',
        'structure_AM40',
        'structure_CAN40',
        'structure_CAMN40',
    ]
)
print(df.shape)

# Drop the columns whose names were collected in `nan_features`.
df = df.drop(columns=list(nan_features))
print(df.shape)

# Flatten the remaining feature columns and record the constant ones.
df, zero_columns = make_plane(df)
grand_zero_columns |= set(zero_columns)

(121, 21)
(121, 17)
(14,)


In [None]:
# Remove the constant columns reported by make_plane.
df = df.drop(columns=list(grand_zero_columns))
# Discard the group_D rows. The explicit copy makes the filtered frame
# independent of the original, so adding a column below cannot raise
# SettingWithCopyWarning or write into a temporary slice.
df = df[df['group'] != 'group_D'].copy()
# Boolean target: True for groups A, B and E, False for every other group.
df['is_good'] = df['group'].isin(['group_A', 'group_B', 'group_E'])
df = df.drop(columns=['group'])
# path_to_save = 'groups_and_oxi_states_starting_structure/starting_structure_with_features.pkl'.format(i)
# print(path_to_save)
# save_csv(df, path_to_save)