In [1]:
import numpy as np
import pandas as pd
import atomic_info

In [13]:
#features
#atomic_row_index={'B':2,'Al':3,'Ga':4,'In':5,'C':2,'Si':3,'Ge':4,'Sn':5,'Pb':6,'N':2,'P':3,'As':4,'Sb':5,'Bis':6,'O':2,'S':3,'Se':4,'Te':5,'F':2,'Cl':3,'Br':4,'I':5}

In [5]:
def str2float(data_frame,col_name):
    """
    Convert every entry of one column to float, in place.

    data_frame: the data frame you are dealing with (modified in place)
    col_name: the label of the column whose entries are passed through float()

    Returns nothing; the converted column replaces the original one.
    """
    def _as_float(entry):
        return float(entry)

    converted = data_frame[col_name].map(_as_float)
    data_frame[col_name] = converted

In [None]:
def read_vac_lev(filename,col_name,data_frame):
    """
    Read per-phase vacuum levels from a text file and store them as a column.

    filename: contains your vacuum level information; every non-blank line is
              whitespace separated, with the phase label as the first field
              and the vacuum level as the second field
    col_name: the column name you want to store in the data_frame
    data_frame: the loaded data_frame; it must have a 'Phase' column and is
                modified in place

    Phases that have no entry in the file receive NaN, so the new column stays
    numeric and can be used directly in arithmetic.
    Returns nothing.
    """
    vacuum_level_dic = {}
    # The context manager closes the file even if a line fails to parse.
    with open(filename,'r') as vacuum_level_info:
        for line in vacuum_level_info:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            vacuum_level_dic[fields[0]] = float(fields[1])
    # Mapping with a dict yields NaN for phases absent from the file; the
    # explicit cast keeps the dtype float even when nothing matches.
    data_frame[col_name] = data_frame['Phase'].map(vacuum_level_dic).astype(float)

In [13]:
def computed2DF(file_pbe='band_gap_info_pbe.dat',file_hse='band_gap_info_hse.dat',struc_data='struc_info.dat',vac_pbe='VAC_lev_pbe.dat',vac_hse='VAC_lev_hse.dat'):
    """
    Load the computed results into a single data frame with one row per phase.

    1. band gap and band edge information
    - "band_gap_info_pbe.dat" and "band_gap_info_hse.dat"
    - These two files are generated by "data_analysis.py" module
    - file_pbe: store PBE results (space separated); format: 'Compound','Phase','Band gap (eV)','VBM (eV)','CBM (eV)'
    - file_hse: store HSE results (space separated); format: 'Compound','Phase','Band gap (eV)','VBM (eV)','CBM (eV)'
      The HSE columns are stored as 'HSE-gap (eV)','HSE-VBM (eV)','HSE-CBM (eV)'
      and the HSE 'Compound' column is dropped.
    2. structural properties
    - struc_data: store structural info (tab separated); format: 'Phase','a1 (A)','a2 (A)','a3 (A)','alpha','beta','gamma'
    optional:
    vac_pbe: contains the vacuum level calculated by PBE; pass None or 'None' if there is no such file.
    vac_hse: contains the vacuum level calculated by HSE; pass None or 'None' if there is no such file.

    The three tables are joined on 'Phase' with an inner join, so only phases
    present in all three files are kept.  When a vacuum-level file is given it
    is read with read_vac_lev into 'Vacuum level (eV)' or
    'HSE vacuum level (eV)'.

    Returns the combined data frame, with 'Phase' as an ordinary column.
    """
    data_pbe=pd.read_csv(file_pbe,sep=' ',names=['Compound','Phase','Band gap (eV)','VBM (eV)','CBM (eV)'])
    data_hse=pd.read_csv(file_hse,sep=' ',names=['Compound','Phase','HSE-gap (eV)','HSE-VBM (eV)','HSE-CBM (eV)'])
    data_struc=pd.read_csv(struc_data,sep='\t',names=['Phase','a1 (A)','a2 (A)','a3 (A)','alpha','beta','gamma'])
    # 'Compound' already comes from the PBE table; keeping the HSE copy would
    # duplicate the column after the join.
    tables = [
        data_pbe.set_index('Phase'),
        data_hse.drop('Compound',axis=1).set_index('Phase'),
        data_struc.set_index('Phase'),
    ]
    data_frame = pd.concat(tables, axis=1, join='inner').reset_index()
    vacuum_sources = (
        (vac_pbe,'Vacuum level (eV)'),
        (vac_hse,'HSE vacuum level (eV)'),
    )
    for vac_file, col_name in vacuum_sources:
        # Accept a real None as well as the historical 'None' string sentinel.
        if vac_file is not None and vac_file != 'None':
            read_vac_lev(vac_file,col_name,data_frame)
    return data_frame

In [11]:
#computed2DF()

In [19]:
def feature_gene_ternary(data_frame):
    """
    Add the derived feature columns for ternary compounds to data_frame, in place.

    data_frame: frame with at least 'Compound' and 'Phase' columns.  If
                'Vacuum level (eV)' is present, 'VBM (eV)' and 'CBM (eV)' are
                required too; likewise 'HSE vacuum level (eV)' requires
                'HSE-VBM (eV)' and 'HSE-CBM (eV)'.

    Columns added:
    - 'VBM Absolute (eV)', 'CBM Absolute (eV)' and the 'HSE-' counterparts:
      band edge minus the matching vacuum level (only when that vacuum level
      column exists)
    - 'Cation', 'Anion_1', 'Anion_2': items 0, 1 and 2 of
      atomic_info.count_elem(compound)
    - 'Phase index': the last character of 'Phase' converted to int
    - '<role>_negativity', '<role>_mass', '<role>_atomic_radius',
      '<role>_row_index', '<role>_elec_aff', '<role>_ionization' for each of
      the three roles, looked up in the tables of atomic_info.element
    - 'Average mass': one third of each of the three masses, summed
    - the three electronegativity difference columns

    An element missing from a lookup table raises KeyError.
    Returns nothing.
    """
    columns = list(data_frame)
    # Each functional is guarded by its own vacuum-level column so that a frame
    # carrying only one of the two does not fail with a KeyError.
    if 'Vacuum level (eV)' in columns:
        data_frame['VBM Absolute (eV)']=data_frame['VBM (eV)'] - data_frame['Vacuum level (eV)']
        data_frame['CBM Absolute (eV)']=data_frame['CBM (eV)'] - data_frame['Vacuum level (eV)']
    if 'HSE vacuum level (eV)' in columns:
        data_frame['HSE-VBM Absolute (eV)']=data_frame['HSE-VBM (eV)'] - data_frame['HSE vacuum level (eV)']
        data_frame['HSE-CBM Absolute (eV)']=data_frame['HSE-CBM (eV)'] - data_frame['HSE vacuum level (eV)']
    #
    roles = ['Cation','Anion_1','Anion_2']
    # Parse each formula once and reuse the result for all three roles.
    # NOTE(review): assumes count_elem lists the cation first - confirm in atomic_info.
    parsed = data_frame['Compound'].map(atomic_info.count_elem)
    for position, role in enumerate(roles):
        data_frame[role]=parsed.map(lambda elems, position=position: elems[position])
    #
    # NOTE(review): only the final character is used, so a phase label ending
    # in a multi-digit number would keep just its last digit.
    data_frame['Phase index']=data_frame['Phase'].map(lambda x:int(x[-1]))

    def _add_element_property(suffix, table_name):
        # One '<role>_<suffix>' column per role, read from the named table on
        # atomic_info.element; indexing (not .get) keeps the KeyError for
        # unknown elements.
        for role in roles:
            data_frame[role+'_'+suffix]=data_frame[role].map(
                lambda x: getattr(atomic_info.element, table_name)[x])

    #
    _add_element_property('negativity', 'atomic_elec_neg')
    _add_element_property('mass', 'atomic_mass')
    _add_element_property('atomic_radius', 'atomic_radius')
    # All three roles read the row index from atomic_info.element, like every
    # other per-element table.
    _add_element_property('row_index', 'atomic_row_index')
    #
    data_frame['Average mass']=data_frame['Cation_mass']/3+data_frame['Anion_1_mass']/3+data_frame['Anion_2_mass']/3
    #
    _add_element_property('elec_aff', 'atomic_elec_affinity')
    _add_element_property('ionization', 'atomic_ionization_energy')
    #
    data_frame['Anion_1 - Cation negativity']=data_frame['Anion_1_negativity']-data_frame['Cation_negativity']
    data_frame['Anion_2 - Cation negativity']=data_frame['Anion_2_negativity']-data_frame['Cation_negativity']
    data_frame['Anion_1 + Anion_2 - Cation negativity']=data_frame['Anion_1_negativity']+data_frame['Anion_2_negativity']-data_frame['Cation_negativity']

In [7]:
#atomic_info.count_elem('GeS')

In [18]:
def feature_target_seperation(data_frame,target=['Band gap (eV)','VBM Absolute (eV)','CBM Absolute (eV)','HSE-gap (eV)','HSE-VBM Absolute (eV)','HSE-CBM Absolute (eV)'],feature=['Phase index', 'Cation_negativity', 'Anion_1_negativity','Anion_2_negativity', 'Cation_mass', 'Anion_1_mass', 'Anion_2_mass','Cation_atomic_radius', 'Anion_1_atomic_radius','Anion_2_atomic_radius', 'Cation_row_index', 'Anion_1_row_index','Anion_2_row_index', 'Average mass', 'Cation_elec_aff','Anion_1_elec_aff', 'Anion_2_elec_aff', 'Cation_ionization','Anion_1_ionization', 'Anion_2_ionization','Anion_1 - Cation negativity', 'Anion_2 - Cation negativity','Anion_1 + Anion_2 - Cation negativity']):
    """
    Split a data frame into a target frame and a feature frame.

    data_frame: frame holding 'Compound', 'Phase' and every column named in
                target and feature; it is not modified
    target: list of column labels that go into the target frame
    feature: list of column labels that go into the feature frame

    Returns (target_df, feature_df), both indexed by ('Compound','Phase').
    """
    # Index once; each list selection below produces its own new frame.
    keyed = data_frame.set_index(['Compound','Phase'])
    return keyed[target], keyed[feature]

In [19]:
def normalize(data_frame, col_names, normal_type='rescaling'):
    """
    function: Normalize the listed columns of the data frame, in place

    data_frame: the data frame you are dealing with
    col_names: iterable with the labels of the columns to normalize
    normal_type: 'rescaling' (or 'r') maps each column onto [0, 1] with
                 (x - min) / (max - min); 'standardization' (or 's') computes
                 (x - mean) / std, using the pandas sample standard deviation

    A column whose values are all equal has zero spread; it becomes all zeros
    instead of triggering a division by zero.

    Raises ValueError for an unknown normal_type, before any column is touched.
    Returns nothing.
    """
    if normal_type in ('rescaling', 'r'):
        rescale = True
    elif normal_type in ('standardization', 's'):
        rescale = False
    else:
        raise ValueError("unknown normal_type %r; expected 'rescaling'/'r' or "
                         "'standardization'/'s'" % (normal_type,))
    for col_name in col_names:
        column = data_frame[col_name]
        if rescale:
            offset = column.min()
            spread = column.max() - offset
        else:
            offset = column.mean()
            # Standardization divides by the standard deviation, not the variance.
            spread = column.std()
        # Multiplying by 1.0 forces a float result for integer columns.
        shifted = (column - offset) * 1.0
        # With zero spread every shifted value is already 0, so skip the division.
        data_frame[col_name] = shifted if spread == 0 else shifted / spread