### This file contains helpful code snippets used for preprocessing of metadata

### language: Groovy

### Get column names from column descriptions

In [7]:
// Column descriptions (in Polish), kept as one multi-line string. Each entry has the
// form "<name(s)>: <description>"; several names may share one description and are
// then separated by commas and/or spaces. A later cell extracts the names with a regex.
// NOTE(review): the entry "pbd_code" differs from the "pdb_code" column of the data set;
// the final assert of this notebook expects exactly that mismatch, so the text is kept verbatim.
columnsDescriptions = """
title: nazwa łącząca pdb_code (nazwę wpisu do bazy PDB), res_name (nazwę ligandu), res_id, i chain_id;
pbd_code: identyfikator pliku PDB;
res_name: nazwa ligandu, czyli opisanej w wierszu cząsteczki chemicznej (etykieta klasy);
res_id: identyfikator reszty w pliku PDB (umiejscowienie cząsteczki w białku);
chain_id: identyfikator łańcucha w pliku PDB (umiejscowienie cząsteczki w białku);

local_BAa, local_NPa, local_Ra, local_RGa, local_SRGa, local_CCSa, local_CCPa, local_ZOa, local_ZDa, local_ZD_minus_a, local_ZD_plus_a: te zmienne to miary jakości struktury zapisanej w PDB;

local_res_atom_count local_res_atom_non_h_count local_res_atom_non_h_occupancy_sum local_res_atom_non_h_electron_sum local_res_atom_non_h_electron_occupancy_sum local_res_atom_C_count local_res_atom_N_count local_res_atom_O_count local_res_atom_S_count: liczba atomów i elektronów ligandu zamodelowana w pliku PDB;

dict_atom_non_h_count dict_atom_non_h_electron_sum dict_atom_C_count dict_atom_N_count dict_atom_O_count dict_atom_S_count: liczba atomów i elektronów wyliczona na podstawie danych słownikowych (“tablicy pierwiastków”);

part_XX: kolumny zaczynające się od “part_” opisują wartości obliczone tylko na podstawie ligandu; liczba pojawiająca się po “part_” koduje próg odcięcia intensywności; atrybuty obliczone dla każdego poziomu odcięcia zostały opisane poniżej:

blob_electron_sum: szacowana liczba elektronów;
blob_volume_sum: szacowana objętość cząsteczki;
blob_parts: liczba rozdzielnych części;

shape: maska kształtu;
density: ,maska gęstości elektronowej;

O3, O4, O5, FL: niezmienniki kształtu opisane w artykule “Moment invariants as shape recognition technique for comparing protein binding sites”, Sommer et al.
I1, I2, I3, I4, I5, I6: niezmienniki kształtu opisane w artykule “3-D Surface Moment Invariants”, Xu i Li
M000: moment zerowy (licznik potrzebny do obliczeń - brak interpretacji fizycznej);

E1, E2, E3: wartości własne macierzy kowariancji obliczonej dla kształtu i gęstości; E1 to największa wartość własna, E3 najmniejsza; E3_E1 to E3 podzielone przez E1, id., sqrt oznacza pierwiastek danej wartości własnej;

norm: niezmienniki kształtu znormalizowane aby były niezmienne ze względu rotację, translację i skalę;
scaled: wartości niezmienne ze względu rotację, translację, ale nie skalę;

local_volume: objętość dla progu odcięcia 6 x FoFc_std;
local_electrons: liczba elektronów dla progu odcięcia 6 x FoFc_std;
local_mean: średnia gęstość elektronowa dla progu odcięcia 6 x FoFc_std;
local_std: odchylenie standardowe średniej gęstości elektronowej dla progu odcięcia 6 x FoFc_std;
local_min: minimalna gęstość elektronowa dla progu odcięcia 6 x FoFc_std;
local_max: maksymalna gęstość elektronowa dla progu odcięcia 6 x FoFc_std;
local_skewness: skośność dla progu odcięcia 6 x FoFc_std;
local_parts: liczba rozdzielnych części dla progu odcięcia 6 x FoFc_std;

TwoFoFc_mean, TwoFoFc_std TwoFoFc_square_std TwoFoFc_min TwoFoFc_max: to samo co dla local tylko bez progowania (TwoFoFc = 2Fo-Fc);
Fo_mean Fo_std Fo_square_std Fo_min Fo_max: to samo co dla local tylko bez progowania dla mapy Fo (Fo = observed);
FoFc_mean FoFc_std FoFc_square_std FoFc_min FoFc_max: to samo co dla local tylko bez progowania dla mapy FoFc (FoFc = Fo-Fc = error/residual map);
Fc_mean Fc_std Fc_square_std Fc_min Fc_max: to samo co dla local tylko bez progowania dla mapy Fc (Fc = calculated);

fo_col, fc_col, weight_col, grid_space, solvent_radius, solvent_opening_radius: parametry wykorzystane podczas tworzenia zbioru;
resolution: maksymalna rozdzielczość analizowanych danych;

solvent_mask_count: rozmiar maski rozpuszczalnika;
void_mask_count: rozmiar maski pustego obszaru;
modeled_mask_count: rozmiar maski ligandu;
solvent_ratio: solvent/(solvent+void+modeled);

TwoFoFc_bulk_mean, TwoFoFc_bulk_std, TwoFoFc_void_mean, TwoFoFc_void_std, TwoFoFc_modeled_mean, TwoFoFc_modeled_std: globalne statystyki dla mapy TWoFoFc;
Fo_bulk_mean, Fo_bulk_std, Fo_void_mean, Fo_void_std, Fo_modeled_mean, Fo_modeled_std: globalne statystyki dla mapy Fo;
Fc_bulk_mean, Fc_bulk_std, Fc_void_mean, Fc_void_std, Fc_modeled_mean, Fc_modeled_std: globalne statystyki dla mapy Fc;
FoFc_bulk_mean, FoFc_bulk_std, FoFc_void_mean, FoFc_void_std, FoFc_modeled_mean, FoFc_modeled_std: globalne statystyki dla mapy FoFc;
TwoFoFc_void_fit_binormal_mean1, TwoFoFc_void_fit_binormal_std1, TwoFoFc_void_fit_binormal_mean2, TwoFoFc_void_fit_binormal_std2, TwoFoFc_void_fit_binormal_scale, TwoFoFc_solvent_fit_normal_mean, TwoFoFc_solvent_fit_normal_std: dopasowanie do wybranych rozkładów danych;

part_step_FoFc_std_min: minimalny próg maskowania (dla part_00);
part_step_FoFc_std_max: maksymalny próg maskowania (dla part_10);
part_step_FoFc_std_step: krok maskowania (dł/szer./wys. piksela maski).
"""

// Last expression of the cell is null, so the cell output is "null" instead of the whole string.
null

null

In [8]:
// For every line that starts right after a newline, capture the text up to (not
// including) its first colon: that is the name part of a description entry.
matcher = columnsDescriptions =~ /(?<=\n)[^:\n]+(?=:)/
// A name part may hold several names separated by commas and/or spaces; split each
// one and gather all names into a single flat list.
colNames = matcher.findAll().collectMany { namePart -> namePart.split(/[ ,]+/) as List }

[title, pbd_code, res_name, res_id, chain_id, local_BAa, local_NPa, local_Ra, local_RGa, local_SRGa, local_CCSa, local_CCPa, local_ZOa, local_ZDa, local_ZD_minus_a, local_ZD_plus_a, local_res_atom_count, local_res_atom_non_h_count, local_res_atom_non_h_occupancy_sum, local_res_atom_non_h_electron_sum, local_res_atom_non_h_electron_occupancy_sum, local_res_atom_C_count, local_res_atom_N_count, local_res_atom_O_count, local_res_atom_S_count, dict_atom_non_h_count, dict_atom_non_h_electron_sum, dict_atom_C_count, dict_atom_N_count, dict_atom_O_count, dict_atom_S_count, part_XX, blob_electron_sum, blob_volume_sum, blob_parts, shape, density, O3, O4, O5, FL, I1, I2, I3, I4, I5, I6, M000, E1, E2, E3, norm, scaled, local_volume, local_electrons, local_mean, local_std, local_min, local_max, local_skewness, local_parts, TwoFoFc_mean, TwoFoFc_std, TwoFoFc_square_std, TwoFoFc_min, TwoFoFc_max, Fo_mean, Fo_std, Fo_square_std, Fo_min, Fo_max, FoFc_mean, FoFc_std, FoFc_square_std, FoFc_min, FoFc_max

In [2]:
// A shorter list of column descriptions in the same "<name(s)>: <description>" format;
// every entry here also appears in columnsDescriptions ("red" presumably stands for "reduced").
// NOTE(review): "pbd_code" is kept exactly as written in the descriptions.
redColsDescription="""
title: nazwa łącząca pdb_code (nazwę wpisu do bazy PDB), res_name (nazwę ligandu), res_id, i chain_id;
pbd_code: identyfikator pliku PDB;
res_name: nazwa ligandu, czyli opisanej w wierszu cząsteczki chemicznej (etykieta klasy);
res_id: identyfikator reszty w pliku PDB (umiejscowienie cząsteczki w białku);
chain_id: identyfikator łańcucha w pliku PDB (umiejscowienie cząsteczki w białku);

local_BAa, local_NPa, local_Ra, local_RGa, local_SRGa, local_CCSa, local_CCPa, local_ZOa, local_ZDa, local_ZD_minus_a, local_ZD_plus_a: te zmienne to miary jakości struktury zapisanej w PDB;

local_res_atom_count local_res_atom_non_h_count local_res_atom_non_h_occupancy_sum local_res_atom_non_h_electron_sum local_res_atom_non_h_electron_occupancy_sum local_res_atom_C_count local_res_atom_N_count local_res_atom_O_count local_res_atom_S_count: liczba atomów i elektronów ligandu zamodelowana w pliku PDB;

dict_atom_non_h_count dict_atom_non_h_electron_sum dict_atom_C_count dict_atom_N_count dict_atom_O_count dict_atom_S_count: liczba atomów i elektronów wyliczona na podstawie danych słownikowych (“tablicy pierwiastków”);

fo_col, fc_col, weight_col, grid_space, solvent_radius, solvent_opening_radius: parametry wykorzystane podczas tworzenia zbioru;

part_step_FoFc_std_min: minimalny próg maskowania (dla part_00);
part_step_FoFc_std_max: maksymalny próg maskowania (dla part_10);
part_step_FoFc_std_step: krok maskowania (dł/szer./wys. piksela maski).
"""
// Last expression of the cell is null, so the cell output is "null" instead of the whole string.
null

null

In [9]:
// Grab, from each line that begins right after a newline, everything before its
// first colon; this prefix lists the column name(s) the entry describes.
matcher = redColsDescription =~ /(?<=\n)[^:\n]+(?=:)/
// Prefixes can contain several names separated by commas and/or spaces, so each
// prefix is split and the pieces are concatenated into one flat list.
redColNames = matcher.findAll().collectMany { prefix -> prefix.split(/[ ,]+/) as List }

[title, pbd_code, res_name, res_id, chain_id, local_BAa, local_NPa, local_Ra, local_RGa, local_SRGa, local_CCSa, local_CCPa, local_ZOa, local_ZDa, local_ZD_minus_a, local_ZD_plus_a, local_res_atom_count, local_res_atom_non_h_count, local_res_atom_non_h_occupancy_sum, local_res_atom_non_h_electron_sum, local_res_atom_non_h_electron_occupancy_sum, local_res_atom_C_count, local_res_atom_N_count, local_res_atom_O_count, local_res_atom_S_count, dict_atom_non_h_count, dict_atom_non_h_electron_sum, dict_atom_C_count, dict_atom_N_count, dict_atom_O_count, dict_atom_S_count, fo_col, fc_col, weight_col, grid_space, solvent_radius, solvent_opening_radius, part_step_FoFc_std_min, part_step_FoFc_std_max, part_step_FoFc_std_step]

In [12]:
// Renders a collection of names as a single string in which every name is wrapped
// in plain double quotes and the names are separated by ", ".
// An empty collection yields an empty string.
formatColNames = { names ->
    def quotedNames = names.collect { name -> "\"${name}\"" }
    quotedNames.join(", ")
}

script1544459701166$_run_closure1@7a7e0200

In [13]:
// Show every name parsed from columnsDescriptions as a double-quoted, comma-separated list.
formatColNames(colNames)

"title", "pbd_code", "res_name", "res_id", "chain_id", "local_BAa", "local_NPa", "local_Ra", "local_RGa", "local_SRGa", "local_CCSa", "local_CCPa", "local_ZOa", "local_ZDa", "local_ZD_minus_a", "local_ZD_plus_a", "local_res_atom_count", "local_res_atom_non_h_count", "local_res_atom_non_h_occupancy_sum", "local_res_atom_non_h_electron_sum", "local_res_atom_non_h_electron_occupancy_sum", "local_res_atom_C_count", "local_res_atom_N_count", "local_res_atom_O_count", "local_res_atom_S_count", "dict_atom_non_h_count", "dict_atom_non_h_electron_sum", "dict_atom_C_count", "dict_atom_N_count", "dict_atom_O_count", "dict_atom_S_count", "part_XX", "blob_electron_sum", "blob_volume_sum", "blob_parts", "shape", "density", "O3", "O4", "O5", "FL", "I1", "I2", "I3", "I4", "I5", "I6", "M000", "E1", "E2", "E3", "norm", "scaled", "local_volume", "local_electrons", "local_mean", "local_std", "local_min", "local_max", "local_skewness", "local_parts", "TwoFoFc_mean", "TwoFoFc_std", "TwoFoFc_square_std", "Tw

In [14]:
// Show every name parsed from redColsDescription as a double-quoted, comma-separated list.
formatColNames(redColNames)

"title", "pbd_code", "res_name", "res_id", "chain_id", "local_BAa", "local_NPa", "local_Ra", "local_RGa", "local_SRGa", "local_CCSa", "local_CCPa", "local_ZOa", "local_ZDa", "local_ZD_minus_a", "local_ZD_plus_a", "local_res_atom_count", "local_res_atom_non_h_count", "local_res_atom_non_h_occupancy_sum", "local_res_atom_non_h_electron_sum", "local_res_atom_non_h_electron_occupancy_sum", "local_res_atom_C_count", "local_res_atom_N_count", "local_res_atom_O_count", "local_res_atom_S_count", "dict_atom_non_h_count", "dict_atom_non_h_electron_sum", "dict_atom_C_count", "dict_atom_N_count", "dict_atom_O_count", "dict_atom_S_count", "fo_col", "fc_col", "weight_col", "grid_space", "solvent_radius", "solvent_opening_radius", "part_step_FoFc_std_min", "part_step_FoFc_std_max", "part_step_FoFc_std_step"

### Replace pretty quotes with ordinary quotes

In [29]:
// List of values written with typographic (curly) quotes; the next cell extracts the
// values and re-emits them with plain double quotes. Despite the "cols" in the name,
// a later cell compares these values against the res_name field of each CSV line.
// NOTE(review): "H20" is spelled with the digit zero here — confirm whether "H2O" was intended.
colsToDeleteString = "“UNK”, “UNX”, “UNL”, “DUM”, “N”, “BLOB”, “ALA”, “ARG”, “ASN”, “ASP”, “CYS”, “GLN”, “GLU”, “GLY”, “HIS”, “ILE”, “LEU”, “LYS”, “MET”, “MSE”, “PHE”, “PRO”, “SEC”, “SER”, “THR”, “TRP”, “TYR”, “VAL”, “DA”, “DG”, “DT”, “DC”, “DU”, “A”, “G”, “T”, “C”, “U”, “HOH”, “H20”, “WAT”"

“UNK”, “UNX”, “UNL”, “DUM”, “N”, “BLOB”, “ALA”, “ARG”, “ASN”, “ASP”, “CYS”, “GLN”, “GLU”, “GLY”, “HIS”, “ILE”, “LEU”, “LYS”, “MET”, “MSE”, “PHE”, “PRO”, “SEC”, “SER”, “THR”, “TRP”, “TYR”, “VAL”, “DA”, “DG”, “DT”, “DC”, “DU”, “A”, “G”, “T”, “C”, “U”, “HOH”, “H20”, “WAT”

In [30]:
// Extract the text enclosed by each opening (“) / closing (”) curly-quote pair.
colsToDeleteMatcher = colsToDeleteString =~ /(?<=“)[^“]+(?=”)/
// List of the bare values; a later cell uses it to test res_name values.
colsToDelete = colsToDeleteMatcher.findAll()
// Render the values with plain double quotes. The quoting/joining is delegated to the
// formatColNames helper defined earlier in this notebook instead of being repeated here.
formatColNames(colsToDelete)

"UNK", "UNX", "UNL", "DUM", "N", "BLOB", "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP", "TYR", "VAL", "DA", "DG", "DT", "DC", "DU", "A", "G", "T", "C", "U", "HOH", "H20", "WAT"

### Get number of records with allowed res_name

In [31]:
// Handle to the CSV that a later cell reads line by line. The path is relative, so it is
// resolved against the kernel's working directory; creating the File object does not
// open or read the file.
f = new File("all_summary.csv")

all_summary.csv

In [11]:
// Counts the non-blank lines of the CSV (allCount) and, among them, the lines whose
// res_name field is one of the values in colsToDelete (disallowedCount).
// NOTE(review): if the file starts with a header row, that row is counted in allCount
// too — confirm whether it should be skipped.
disallowedCount = 0
allCount = 0
// Set lookup instead of scanning the whole list for every line.
def disallowedResNames = colsToDelete as Set
f.withReader { reader ->
    String line
    // readLine() returns null only at end of input. The comparison with null is explicit
    // because under Groovy truth an empty line is false as well, which would silently
    // stop the loop at the first empty line and truncate both counters.
    while ((line = reader.readLine()) != null) {
        // Blank lines carry no record.
        if (line.trim().isEmpty()) {
            continue
        }
        // The field at index 4 (0-based) of a ';'-separated line is taken as res_name.
        // A line with fewer fields still raises an exception, as before.
        def resName = line.split(";")[4]
        if (resName in disallowedResNames) {
            disallowedCount++
        }
        allCount++
    }
}

null

In [15]:
// Fraction of counted lines whose res_name is on the colsToDelete list.
// Fails with a division-by-zero error if no lines were counted.
disallowedCount / allCount

0.0096490441

### Find columns that are not specified in the project description: http://www.cs.put.poznan.pl/dbrzezinski/teaching/zed/zed_projekt_2018-2019_analiza.html

In [32]:
// Column names of the data set as a multi-line string: bracketed index markers followed
// by double-quoted names (presumably pasted from R console output — confirm). A later
// cell extracts the quoted names with a regex, so the padding between them is irrelevant.
dfColumnNamesString = """
[1] "blob_coverage"                               "res_coverage"                               
  [3] "title"                                       "pdb_code"                                   
  [5] "res_name"                                    "res_id"                                     
  [7] "chain_id"                                    "blob_volume_coverage"                       
  [9] "blob_volume_coverage_second"                 "res_volume_coverage"                        
 [11] "res_volume_coverage_second"                  "local_res_atom_count"                       
 [13] "local_res_atom_non_h_count"                  "local_res_atom_non_h_occupancy_sum"         
 [15] "local_res_atom_non_h_electron_sum"           "local_res_atom_non_h_electron_occupancy_sum"
 [17] "local_res_atom_C_count"                      "local_res_atom_N_count"                     
 [19] "local_res_atom_O_count"                      "local_res_atom_S_count"                     
 [21] "dict_atom_non_h_count"                       "dict_atom_non_h_electron_sum"               
 [23] "dict_atom_C_count"                           "dict_atom_N_count"                          
 [25] "dict_atom_O_count"                           "dict_atom_S_count"                          
 [27] "skeleton_data"                               "skeleton_cycle_4"                           
 [29] "skeleton_diameter"                           "skeleton_cycle_6"                           
 [31] "skeleton_cycle_7"                            "skeleton_closeness_006_008"                 
 [33] "skeleton_closeness_002_004"                  "skeleton_cycle_3"                           
 [35] "skeleton_avg_degree"                         "skeleton_closeness_004_006"                 
 [37] "skeleton_closeness_010_012"                  "skeleton_closeness_012_014"                 
 [39] "skeleton_edges"                              "skeleton_radius"                            
 [41] "skeleton_cycle_8_plus"                       "skeleton_closeness_020_030"                 
 [43] "skeleton_deg_5_plus"                         "skeleton_closeness_016_018"                 
 [45] "skeleton_closeness_008_010"                  "skeleton_closeness_018_020"                 
 [47] "skeleton_average_clustering"                 "skeleton_closeness_040_050"                 
 [49] "skeleton_closeness_014_016"                  "skeleton_center"                            
 [51] "skeleton_closeness_000_002"                  "skeleton_density"                           
 [53] "skeleton_closeness_030_040"                  "skeleton_deg_4"                             
 [55] "skeleton_deg_0"                              "skeleton_deg_1"                             
 [57] "skeleton_deg_2"                              "skeleton_deg_3"                             
 [59] "skeleton_graph_clique_number"                "skeleton_nodes"                             
 [61] "skeleton_cycles"                             "skeleton_cycle_5"                           
 [63] "skeleton_closeness_050_plus"                 "skeleton_periphery"                         
 [65] "local_volume"                                "local_electrons"                            
 [67] "local_mean"                                  "local_std"                                  
 [69] "local_min"                                   "local_max"                                  
 [71] "local_max_over_std"                          "local_skewness"                             
 [73] "local_cut_by_mainchain_volume"               "local_near_cut_count_C"                     
 [75] "local_near_cut_count_other"                  "local_near_cut_count_S"                     
 [77] "local_near_cut_count_O"                      "local_near_cut_count_N"                     
 [79] "part_00_shape_segments_count"                "part_00_density_segments_count"             
 [81] "part_00_volume"                              "part_00_electrons"                          
 [83] "part_00_mean"                                "part_00_std"                                
 [85] "part_00_max"                                 "part_00_max_over_std"                       
 [87] "part_00_skewness"                            "part_00_parts"                              
 [89] "part_00_shape_O3"                            "part_00_shape_O4"                           
 [91] "part_00_shape_O5"                            "part_00_shape_FL"                           
 [93] "part_00_shape_O3_norm"                       "part_00_shape_O4_norm"                      
 [95] "part_00_shape_O5_norm"                       "part_00_shape_FL_norm"                      
 [97] "part_00_shape_I1"                            "part_00_shape_I2"                           
 [99] "part_00_shape_I3"                            "part_00_shape_I4"                           
[101] "part_00_shape_I5"                            "part_00_shape_I6"                           
[103] "part_00_shape_I1_norm"                       "part_00_shape_I2_norm"                      
[105] "part_00_shape_I3_norm"                       "part_00_shape_I4_norm"                      
[107] "part_00_shape_I5_norm"                       "part_00_shape_I6_norm"                      
[109] "part_00_shape_M000"                          "part_00_shape_CI"                           
[111] "part_00_shape_E3_E1"                         "part_00_shape_E2_E1"                        
[113] "part_00_shape_E3_E2"                         "part_00_shape_sqrt_E1"                      
[115] "part_00_shape_sqrt_E2"                       "part_00_shape_sqrt_E3"                      
[117] "part_00_density_O3"                          "part_00_density_O4"                         
[119] "part_00_density_O5"                          "part_00_density_FL"                         
[121] "part_00_density_O3_norm"                     "part_00_density_O4_norm"                    
[123] "part_00_density_O5_norm"                     "part_00_density_FL_norm"                    
[125] "part_00_density_I1"                          "part_00_density_I2"                         
[127] "part_00_density_I3"                          "part_00_density_I4"                         
[129] "part_00_density_I5"                          "part_00_density_I6"                         
[131] "part_00_density_I1_norm"                     "part_00_density_I2_norm"                    
[133] "part_00_density_I3_norm"                     "part_00_density_I4_norm"                    
[135] "part_00_density_I5_norm"                     "part_00_density_I6_norm"                    
[137] "part_00_density_M000"                        "part_00_density_CI"                         
[139] "part_00_density_E3_E1"                       "part_00_density_E2_E1"                      
[141] "part_00_density_E3_E2"                       "part_00_density_sqrt_E1"                    
[143] "part_00_density_sqrt_E2"                     "part_00_density_sqrt_E3"                    
[145] "part_00_shape_Z_7_3"                         "part_00_shape_Z_0_0"                        
[147] "part_00_shape_Z_7_0"                         "part_00_shape_Z_7_1"                        
[149] "part_00_shape_Z_3_0"                         "part_00_shape_Z_5_2"                        
[151] "part_00_shape_Z_6_1"                         "part_00_shape_Z_3_1"                        
[153] "part_00_shape_Z_6_0"                         "part_00_shape_Z_2_1"                        
[155] "part_00_shape_Z_6_3"                         "part_00_shape_Z_2_0"                        
[157] "part_00_shape_Z_6_2"                         "part_00_shape_Z_5_0"                        
[159] "part_00_shape_Z_5_1"                         "part_00_shape_Z_4_2"                        
[161] "part_00_shape_Z_1_0"                         "part_00_shape_Z_4_1"                        
[163] "part_00_shape_Z_7_2"                         "part_00_shape_Z_4_0"                        
[165] "part_00_density_Z_7_3"                       "part_00_density_Z_0_0"                      
[167] "part_00_density_Z_7_0"                       "part_00_density_Z_7_1"                      
[169] "part_00_density_Z_3_0"                       "part_00_density_Z_5_2"                      
[171] "part_00_density_Z_6_1"                       "part_00_density_Z_3_1"                      
[173] "part_00_density_Z_6_0"                       "part_00_density_Z_2_1"                      
[175] "part_00_density_Z_6_3"                       "part_00_density_Z_2_0"                      
[177] "part_00_density_Z_6_2"                       "part_00_density_Z_5_0"                      
[179] "part_00_density_Z_5_1"                       "part_00_density_Z_4_2"                      
[181] "part_00_density_Z_1_0"                       "part_00_density_Z_4_1"                      
[183] "part_00_density_Z_7_2"                       "part_00_density_Z_4_0"                      
[185] "part_01_shape_segments_count"                "part_01_density_segments_count"             
[187] "part_01_volume"                              "part_01_electrons"                          
[189] "part_01_mean"                                "part_01_std"                                
[191] "part_01_max"                                 "part_01_max_over_std"                       
[193] "part_01_skewness"                            "part_01_parts"                              
[195] "part_01_shape_O3"                            "part_01_shape_O4"                           
[197] "part_01_shape_O5"                            "part_01_shape_FL"                           
[199] "part_01_shape_O3_norm"                       "part_01_shape_O4_norm"                      
[201] "part_01_shape_O5_norm"                       "part_01_shape_FL_norm"                      
[203] "part_01_shape_I1"                            "part_01_shape_I2"                           
[205] "part_01_shape_I3"                            "part_01_shape_I4"                           
[207] "part_01_shape_I5"                            "part_01_shape_I6"                           
[209] "part_01_shape_I1_norm"                       "part_01_shape_I2_norm"                      
[211] "part_01_shape_I3_norm"                       "part_01_shape_I4_norm"                      
[213] "part_01_shape_I5_norm"                       "part_01_shape_I6_norm"                      
[215] "part_01_shape_M000"                          "part_01_shape_CI"                           
[217] "part_01_shape_E3_E1"                         "part_01_shape_E2_E1"                        
[219] "part_01_shape_E3_E2"                         "part_01_shape_sqrt_E1"                      
[221] "part_01_shape_sqrt_E2"                       "part_01_shape_sqrt_E3"                      
[223] "part_01_density_O3"                          "part_01_density_O4"                         
[225] "part_01_density_O5"                          "part_01_density_FL"                         
[227] "part_01_density_O3_norm"                     "part_01_density_O4_norm"                    
[229] "part_01_density_O5_norm"                     "part_01_density_FL_norm"                    
[231] "part_01_density_I1"                          "part_01_density_I2"                         
[233] "part_01_density_I3"                          "part_01_density_I4"                         
[235] "part_01_density_I5"                          "part_01_density_I6"                         
[237] "part_01_density_I1_norm"                     "part_01_density_I2_norm"                    
[239] "part_01_density_I3_norm"                     "part_01_density_I4_norm"                    
[241] "part_01_density_I5_norm"                     "part_01_density_I6_norm"                    
[243] "part_01_density_M000"                        "part_01_density_CI"                         
[245] "part_01_density_E3_E1"                       "part_01_density_E2_E1"                      
[247] "part_01_density_E3_E2"                       "part_01_density_sqrt_E1"                    
[249] "part_01_density_sqrt_E2"                     "part_01_density_sqrt_E3"                    
[251] "part_01_shape_Z_7_3"                         "part_01_shape_Z_0_0"                        
[253] "part_01_shape_Z_7_0"                         "part_01_shape_Z_7_1"                        
[255] "part_01_shape_Z_3_0"                         "part_01_shape_Z_5_2"                        
[257] "part_01_shape_Z_6_1"                         "part_01_shape_Z_3_1"                        
[259] "part_01_shape_Z_6_0"                         "part_01_shape_Z_2_1"                        
[261] "part_01_shape_Z_6_3"                         "part_01_shape_Z_2_0"                        
[263] "part_01_shape_Z_6_2"                         "part_01_shape_Z_5_0"                        
[265] "part_01_shape_Z_5_1"                         "part_01_shape_Z_4_2"                        
[267] "part_01_shape_Z_1_0"                         "part_01_shape_Z_4_1"                        
[269] "part_01_shape_Z_7_2"                         "part_01_shape_Z_4_0"                        
[271] "part_01_density_Z_7_3"                       "part_01_density_Z_0_0"                      
[273] "part_01_density_Z_7_0"                       "part_01_density_Z_7_1"                      
[275] "part_01_density_Z_3_0"                       "part_01_density_Z_5_2"                      
[277] "part_01_density_Z_6_1"                       "part_01_density_Z_3_1"                      
[279] "part_01_density_Z_6_0"                       "part_01_density_Z_2_1"                      
[281] "part_01_density_Z_6_3"                       "part_01_density_Z_2_0"                      
[283] "part_01_density_Z_6_2"                       "part_01_density_Z_5_0"                      
[285] "part_01_density_Z_5_1"                       "part_01_density_Z_4_2"                      
[287] "part_01_density_Z_1_0"                       "part_01_density_Z_4_1"                      
[289] "part_01_density_Z_7_2"                       "part_01_density_Z_4_0"                      
[291] "part_02_shape_segments_count"                "part_02_density_segments_count"             
[293] "part_02_volume"                              "part_02_electrons"                          
[295] "part_02_mean"                                "part_02_std"                                
[297] "part_02_max"                                 "part_02_max_over_std"                       
[299] "part_02_skewness"                            "part_02_parts"                              
[301] "part_02_shape_O3"                            "part_02_shape_O4"                           
[303] "part_02_shape_O5"                            "part_02_shape_FL"                           
[305] "part_02_shape_O3_norm"                       "part_02_shape_O4_norm"                      
[307] "part_02_shape_O5_norm"                       "part_02_shape_FL_norm"                      
[309] "part_02_shape_I1"                            "part_02_shape_I2"                           
[311] "part_02_shape_I3"                            "part_02_shape_I4"                           
[313] "part_02_shape_I5"                            "part_02_shape_I6"                           
[315] "part_02_shape_I1_norm"                       "part_02_shape_I2_norm"                      
[317] "part_02_shape_I3_norm"                       "part_02_shape_I4_norm"                      
[319] "part_02_shape_I5_norm"                       "part_02_shape_I6_norm"                      
[321] "part_02_shape_M000"                          "part_02_shape_CI"                           
[323] "part_02_shape_E3_E1"                         "part_02_shape_E2_E1"                        
[325] "part_02_shape_E3_E2"                         "part_02_shape_sqrt_E1"                      
[327] "part_02_shape_sqrt_E2"                       "part_02_shape_sqrt_E3"                      
[329] "part_02_density_O3"                          "part_02_density_O4"                         
[331] "part_02_density_O5"                          "part_02_density_FL"                         
[333] "part_02_density_O3_norm"                     "part_02_density_O4_norm"                    
[335] "part_02_density_O5_norm"                     "part_02_density_FL_norm"                    
[337] "part_02_density_I1"                          "part_02_density_I2"                         
[339] "part_02_density_I3"                          "part_02_density_I4"                         
[341] "part_02_density_I5"                          "part_02_density_I6"                         
[343] "part_02_density_I1_norm"                     "part_02_density_I2_norm"                    
[345] "part_02_density_I3_norm"                     "part_02_density_I4_norm"                    
[347] "part_02_density_I5_norm"                     "part_02_density_I6_norm"                    
[349] "part_02_density_M000"                        "part_02_density_CI"                         
[351] "part_02_density_E3_E1"                       "part_02_density_E2_E1"                      
[353] "part_02_density_E3_E2"                       "part_02_density_sqrt_E1"                    
[355] "part_02_density_sqrt_E2"                     "part_02_density_sqrt_E3"                    
[357] "part_02_shape_Z_7_3"                         "part_02_shape_Z_0_0"                        
[359] "part_02_shape_Z_7_0"                         "part_02_shape_Z_7_1"                        
[361] "part_02_shape_Z_3_0"                         "part_02_shape_Z_5_2"                        
[363] "part_02_shape_Z_6_1"                         "part_02_shape_Z_3_1"                        
[365] "part_02_shape_Z_6_0"                         "part_02_shape_Z_2_1"                        
[367] "part_02_shape_Z_6_3"                         "part_02_shape_Z_2_0"                        
[369] "part_02_shape_Z_6_2"                         "part_02_shape_Z_5_0"                        
[371] "part_02_shape_Z_5_1"                         "part_02_shape_Z_4_2"                        
[373] "part_02_shape_Z_1_0"                         "part_02_shape_Z_4_1"                        
[375] "part_02_shape_Z_7_2"                         "part_02_shape_Z_4_0"                        
[377] "part_02_density_Z_7_3"                       "part_02_density_Z_0_0"                      
[379] "part_02_density_Z_7_0"                       "part_02_density_Z_7_1"                      
[381] "part_02_density_Z_3_0"                       "part_02_density_Z_5_2"                      
[383] "part_02_density_Z_6_1"                       "part_02_density_Z_3_1"                      
[385] "part_02_density_Z_6_0"                       "part_02_density_Z_2_1"                      
[387] "part_02_density_Z_6_3"                       "part_02_density_Z_2_0"                      
[389] "part_02_density_Z_6_2"                       "part_02_density_Z_5_0"                      
[391] "part_02_density_Z_5_1"                       "part_02_density_Z_4_2"                      
[393] "part_02_density_Z_1_0"                       "part_02_density_Z_4_1"                      
[395] "part_02_density_Z_7_2"                       "part_02_density_Z_4_0"                      
[397] "fo_col"                                      "fc_col"                                     
[399] "weight_col"                                  "grid_space"                                 
[401] "solvent_radius"                              "solvent_opening_radius"                     
[403] "resolution_max_limit"                        "resolution"                                 
[405] "FoFc_mean"                                   "FoFc_std"                                   
[407] "FoFc_square_std"                             "FoFc_min"                                   
[409] "FoFc_max"                                    "part_step_FoFc_std_min"                     
[411] "part_step_FoFc_std_max"                      "part_step_FoFc_std_step" 
"""
null

null

In [72]:
// Every column name in dfColumnNamesString is a run of word characters enclosed in
// double quotes; match just the part between the quotes.
dfColNamesMatcher = dfColumnNamesString =~ /(?<=\")\w+(?=\")/
// Materialise all matches as a list of plain names.
dfColNames = dfColNamesMatcher.findAll().toList()
// Keep the cell output short: the cell evaluates to null instead of the long list.
null

null

In [76]:
// Drop the "XX" placeholder from the described names, so that e.g. "part_XX" becomes
// the prefix "part_".
preprocessedColNames = colNames.collect { described -> described.replace("XX", "") }
// A data-set column counts as undescribed when it contains none of the described names
// as a substring.
// NOTE(review): substring matching is lenient. For example "skeleton_density" contains
// the described name "density" and every "part_..." column contains "part_", so such
// columns are never reported here even if they have no description of their own.
undescribedColNames = dfColNames.findAll { dfName ->
    preprocessedColNames.every { described -> !dfName.contains(described) }
}

[blob_coverage, res_coverage, pdb_code, blob_volume_coverage, blob_volume_coverage_second, res_volume_coverage, res_volume_coverage_second, skeleton_data, skeleton_cycle_4, skeleton_diameter, skeleton_cycle_6, skeleton_cycle_7, skeleton_closeness_006_008, skeleton_closeness_002_004, skeleton_cycle_3, skeleton_avg_degree, skeleton_closeness_004_006, skeleton_closeness_010_012, skeleton_closeness_012_014, skeleton_edges, skeleton_radius, skeleton_cycle_8_plus, skeleton_closeness_020_030, skeleton_deg_5_plus, skeleton_closeness_016_018, skeleton_closeness_008_010, skeleton_closeness_018_020, skeleton_average_clustering, skeleton_closeness_040_050, skeleton_closeness_014_016, skeleton_center, skeleton_closeness_000_002, skeleton_closeness_030_040, skeleton_deg_4, skeleton_deg_0, skeleton_deg_1, skeleton_deg_2, skeleton_deg_3, skeleton_graph_clique_number, skeleton_nodes, skeleton_cycles, skeleton_cycle_5, skeleton_closeness_050_plus, skeleton_periphery, local_cut_by_mainchain_volume, local

#### Description of all missing columns
\*coverage\* - none of these column names appear in the descriptions  
pdb_code - misspelled as "pbd_code" in the descriptions  
\*skeleton\* - none of these column names appear in the descriptions  
\*cut\* - none of these column names appear in the descriptions

#### Validate that the above statement is true

In [80]:
// Name fragments that account for the column groups listed as missing from the descriptions.
def explainedFragments = ["coverage", "skeleton", "cut"]
// Undescribed columns that contain none of those fragments.
def unexplainedColNames = undescribedColNames.findAll { name ->
    explainedFragments.every { fragment -> !name.contains(fragment) }
}
// The only one left must be "pdb_code" (written "pbd_code" in the descriptions).
assert unexplainedColNames == ["pdb_code"]

null