In [None]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
#mpl.rc('figure', max_open_warning = 0)
#%matplotlib inline
#%config InlineBackend.figure_format='retina'

from pandas_profiling import ProfileReport

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
class style:
    """ANSI escape sequences used to emphasise text in printed output."""

    BOLD = '\033[1m'  # start bold text
    END = '\033[0m'   # reset formatting

In [None]:
# Locate the descriptor table relative to the current working directory and load it.
PATH = os.getcwd()
descriptor_in_path = os.path.join(PATH, '../input/descriptor.csv')
df_descriptor = pd.read_csv(descriptor_in_path)
del descriptor_in_path  # only needed for the read above

# Overview of what was loaded: dimensions, column labels, first rows.
print('Descriptor input DataFrame shape:\n\n {}\n'.format(df_descriptor.shape))
print('------------------------------------------------------------')

print('\nDescriptor input data columns:\n\n {}\n'.format(df_descriptor.columns))
print('------------------------------------------------------------')

print('\nDescriptor input dataframe head:\n\n {}\n'.format(df_descriptor.head()))
print('------------------------------------------------------------')

## Renaming descriptor columns

In [None]:
# Raw column label -> short name used in the rest of the notebook.
rename_dict = {
    'name': 'mof',
    'Di': 'LCD',
    'Df': 'PLD',
    'ASA(m2/gram)_1.9': 'GSA',
    'AV_Volume_fraction_1.9': 'AVF',
    'AV(cm3/gram)_1.9': 'GPV',
    'density(gram_cm3)': 'Density',
}
df_descriptor = df_descriptor.rename(rename_dict, axis='columns')
del rename_dict

print('\nCurated descriptor columns:\n\n {}\n'.format(df_descriptor.columns))
print('------------------------------------------------------------')

# Data type of every column in the renamed frame.
print(df_descriptor.dtypes)

## Curating descriptor data

In [None]:
# Row filter: materials with PLD > 3.8 A and a non-zero (positive) void fraction.
# One combined mask selects the same rows as applying the two filters in sequence.
is_selected = (df_descriptor['PLD'] > 3.8) & (df_descriptor['AVF'] > 0.0)
df_descriptor_gross1_atomic = df_descriptor[is_selected]

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before anything is written (e.g. on a fresh checkout).
PATH = os.getcwd() # Getting current directory
output_dir = os.path.join(PATH, '../output')
os.makedirs(output_dir, exist_ok=True)

# Names of the retained materials, written as their own file.
descriptor_mof_name = df_descriptor_gross1_atomic['mof'].astype(str)
curated_mof_name = os.path.join(PATH, '../output/curated-mof.csv')
descriptor_mof_name.to_csv(curated_mof_name, index=False)

# Feature columns kept for modelling.
columns = ['PLD', 'LCD', 'GSA', 'AVF', 'GPV', 'Density', 'total_degree_unsaturation', 'degree_unsaturation', 
           'metallic_percentage', 'O_to_Metal_ration', 'N_to_O_ratio', 'H' ,'Ni', 'Co', 'Cu', 'Zn', 'Pb', 'Mn',
           'Cd', 'C', 'O', 'N', 'S', 'Cl', 'Br', 'F', 'I']

# Kept under a second name because `columns` is deleted at the end of this cell.
shap_columns = columns

# Restrict to the feature columns and force a float dtype.
df_descriptor_gross1_atomic = df_descriptor_gross1_atomic[columns].astype(float)
curated_mof_prop = os.path.join(PATH, '../output/curated-mof-prop.csv')

df_descriptor_gross1_atomic.to_csv(curated_mof_prop, index=False)

print(f'\nCurated gross1_atomic descriptor data:\n\n {df_descriptor_gross1_atomic}\n')
print('\n------------------------------------------------------------\n')

print(f'\nData type of each column. Note that it should be float\n\n {df_descriptor_gross1_atomic.dtypes}\n')
print('\n------------------------------------------------------------\n')

# Drop the intermediates that are not needed further down.
del df_descriptor
del columns
del descriptor_mof_name
del curated_mof_name
del curated_mof_prop
del is_selected
del output_dir

## Taking a look at the target data

In [None]:
# Target table for the propane/propylene pair; the commented line is the
# ethane/ethylene alternative.
target_in_path = os.path.join(PATH, '../input/C3H8-C3H6.csv')
#target_in_path = os.path.join(PATH, '../input/C2H6-C2H4.csv')
df_target = pd.read_csv(target_in_path)
del target_in_path  # only needed for the read above

# Overview of what was loaded: dimensions, column labels, first rows.
print('Target property input DataFrame shape:\n\n {}\n'.format(df_target.shape))
print('------------------------------------------------------------')

print('\nTarget property input data columns:\n\n {}\n'.format(df_target.columns))
print('------------------------------------------------------------')

print('\nTarget property input dataframe head:\n\n {}\n'.format(df_target.head()))
print('------------------------------------------------------------')

## Renaming Target property columns

In [None]:
# Raw column label -> short name, for the propane/propylene (C3H8/C3H6) table.
rename_dict = {
    'MOF_no': 'mof',
    'propane_avg(mol/kg)': 'propane_uptake(mol/kg)',
    'propylene_avg(mol/kg)': 'propylene_uptake(mol/kg)',
    'C3H8/C3H6 Selectivity (1Bar)': 'propane_propylene_selectivity',
    'Df': 'PLD',
    'AV_Volume_fraction_1.9': 'AVF',
}

# Alternative mapping for the ethane/ethylene (C2H6/C2H4) table:
#
# rename_dict = {'MOF_no': 'mof', 'ethane_avg(mol/kg)': 'ethane_uptake(mol/kg)',
#               'ethylene_avg(mol/kg)': 'ethylene_uptake(mol/kg)',
#               'C2H6/C2H4 Selectivity (1Bar)': 'ethane_ethylene_selectivity', 'Df': 'PLD',
#               'AV_Volume_fraction_1.9': 'AVF'}

df_target = df_target.rename(rename_dict, axis='columns')
del rename_dict

print('\nCurated target columns:\n\n {}\n'.format(df_target.columns))
print('------------------------------------------------------------')

## Curating Target dataset

In [None]:
# Row filter: materials with PLD > 3.8 A and AVF > 0.
# One combined mask selects the same rows as applying the two filters in sequence.
is_selected = (df_target['PLD'] > 3.8) & (df_target['AVF'] > 0.0)
df_target_gross1_atomic = df_target[is_selected]

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before anything is written.
output_dir = os.path.join(PATH, '../output')
os.makedirs(output_dir, exist_ok=True)

# Names of the retained materials, written as their own file.
target_mof_name = df_target_gross1_atomic['mof'].astype(str)
target_mof_name_path = os.path.join(PATH, '../output/target-mof-name.csv')
target_mof_name.to_csv(target_mof_name_path, index=False)

# Target columns kept for modelling (the commented list is the ethane/ethylene variant).
columns = ['propane_uptake(mol/kg)', 'propane_propylene_selectivity', 'TSN', 'propylene_uptake(mol/kg)']

#columns = ['ethane_uptake(mol/kg)', 'ethane_ethylene_selectivity', 'TSN', 'ethylene_uptake(mol/kg)']


# Restrict to the target columns and force a float dtype.
df_target_gross1_atomic = df_target_gross1_atomic[columns].astype(float)

# Persist the curated targets; without this write the path below would be
# computed and then thrown away with no file produced.
target_mof_prop_path = os.path.join(PATH, '../output/target-mof-prop.csv')
df_target_gross1_atomic.to_csv(target_mof_prop_path, index=False)

print(f'\nCurated target data:\n\n {df_target_gross1_atomic}\n')
print('\n------------------------------------------------------------\n')

print(f'\nData type of each column. Note that it should be float\n\n {df_target_gross1_atomic.dtypes}\n')
print('\n------------------------------------------------------------\n')

# Drop the intermediates that are not needed further down.
del df_target
del columns
del target_mof_name
del target_mof_name_path
del target_mof_prop_path
del is_selected
del output_dir

In [None]:
# Optional profiling report -- disabled.
#
# This used to be wrapped in a triple-quoted string whose closing delimiter had
# four quote characters, which left an unterminated string and made the cell a
# SyntaxError. It is kept here as ordinary comments instead.
#
# NOTE(review): `df_join` is not defined anywhere in the visible notebook, and
# the `to_file` target is an absolute, machine-specific path; both need fixing
# before this is re-enabled.
#
# profile = ProfileReport(df_join.copy(),title='C3H8-C3H6', html={'style':{'full_width':True}})
# # profile.to_widgets()
# #profile.to_notebook_iframe()
# C3H8_report = os.path.join(PATH, '../output/C3H8-C3H6-report.csv')
#
# profile.to_file("/home/varad/Pictures/best_model_selection_updated/1_excluding_oms/1_Propane_RACs_excluding.html")

In [None]:
# Hand the curated tables over under the names used by the splitting code,
# then drop the old names.
# NOTE(review): rows of X and Y are paired by position only; this presumably
# relies on both input files listing the same materials in the same order -- confirm.
X_crude, Y_crude = df_descriptor_gross1_atomic, df_target_gross1_atomic
del df_descriptor_gross1_atomic, df_target_gross1_atomic

print('\nShape of X_crude: {}'.format(X_crude.shape))
print('\nShape of Y_crude: {}'.format(Y_crude.shape))

In [None]:
# First split: a 0.32 fraction of the samples is set aside as the validation set.
X, X_val_crude, Y, Y_val_crude = train_test_split(
    X_crude,
    Y_crude,
    test_size=0.32,
    random_state=42,
)

## RFE for Property

In [None]:
# Seed passed as `random_state` to the second train/test split and to the estimators below.
RNG_SEED = 42

In [None]:
# Second split: a 0.294 fraction of the remaining samples becomes the test set.
X_train_crude, X_test_crude, Y_train_crude, Y_test_crude = train_test_split(
    X,
    Y,
    test_size=0.294,
    random_state=RNG_SEED,
)

In [None]:
# Number of samples in the test split.
len(X_test_crude)

In [None]:
# Number of samples in the validation split.
len(X_val_crude)

In [None]:
# Every transformer is fitted on the training split only and then applied
# unchanged to the validation and test splits.

# Standardised copies of the raw features.
scaler = StandardScaler().fit(X_train_crude)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train_crude, X_val_crude, X_test_crude)
)

# Min-max normalised copies of the raw (unscaled) features.
norm = MinMaxScaler().fit(X_train_crude)
X_train_norm, X_val_norm, X_test_norm = (
    norm.transform(split) for split in (X_train_crude, X_val_crude, X_test_crude)
)

# Min-max normalised copies of the standardised features.
norm_scaled = MinMaxScaler().fit(X_train_scaled)
X_train_scaled_norm, X_val_scaled_norm, X_test_scaled_norm = (
    norm_scaled.transform(split) for split in (X_train_scaled, X_val_scaled, X_test_scaled)
)

In [None]:
# The scaled-then-normalised matrices are the ones used for modelling.
X_train, X_val, X_test = X_train_scaled_norm, X_val_scaled_norm, X_test_scaled_norm

In [None]:
# Target Y is neither scaled nor normalized.
#
# Column positions in the target frames:
#   0 -> propane / ethane uptake (mol/kg)
#   1 -> selectivity
#   2 -> TSN
#   3 -> propylene / ethylene uptake (mol/kg)

print('------------------------------------------------------------')
print(style.BOLD + 'Define property here :' + style.END)
print('------------------------------------------------------------')

# Single switch for the property being modelled, so that the train, test and
# validation targets can never end up pointing at different columns.
TARGET_COLUMN_INDEX = 1

Y_target_train = Y_train_crude.iloc[:, TARGET_COLUMN_INDEX]
Y_target_test  = Y_test_crude.iloc[:, TARGET_COLUMN_INDEX]
Y_target_val   = Y_val_crude.iloc[:, TARGET_COLUMN_INDEX]

## RFE (Recursive Feature Elimination)
1. RFE is used to recursively eliminate the least important features. To implement RFE we need an estimator. Official documentation can be found [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html).
2. The RFE syntax used in this notebook is:
```
object_variable = RFE(estimator(para_grid of estimator), n_features_to_select = index), for example
sel = RFE(GradientBoostingRegressor(n_estimators=100, random_state=RNG_SEED), n_features_to_select = index)
```
3. The most important features will be calculated by RFE using following line:
```
sel.fit(X_train, Y_target_train)
```

In [None]:
# Additional names for the full-width feature matrices, used by the RFE step.
X_train_scaled_normalized_original, X_test_scaled_normalized_original, X_val_scaled_normalized_original = (
    X_train,
    X_test,
    X_val,
)

In [None]:
# Column labels of the full feature table (before any splitting).
X_crude.columns

In [None]:
# Column labels of the feature table left after the validation split.
X.columns

In [None]:

def run_randomForest_test(X_train, X_val, Y_train, Y_val):
    """Fit a random-forest regressor and print its train/validation scores.

    The model is fitted on ``X_train``/``Y_train``. R^2 is printed for the
    training set, and R^2 plus mean absolute error for the validation set.

    Parameters
    ----------
    X_train, Y_train
        Features and target used to fit the model.
    X_val, Y_val
        Features and target used for the validation scores.

    Returns
    -------
    None
        The scores are only printed.
    """
    rfr = RandomForestRegressor(
        random_state=RNG_SEED,
        bootstrap=True,
        max_depth=10,
        # 'auto' was removed in scikit-learn 1.3 and now raises. For a
        # regressor it meant "consider all features", which 1.0 expresses
        # on both old and new releases.
        max_features=1.0,
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=50,
        n_jobs=10,
    )
    rfr.fit(X_train, Y_train)
    Y_pred_train = rfr.predict(X_train)
    Y_pred_val  = rfr.predict(X_val)
    print("\nR^2 score on train set: %.3f\n" % r2_score(Y_train, Y_pred_train))
    print("\nR^2 score on validation set: %.3f\n" % r2_score(Y_val, Y_pred_val))
    print("\nMAE score on validation set: %.3f\n" % mean_absolute_error(Y_val, Y_pred_val))


In [None]:

# Number of features RFE should keep. Defined once so the selector and the
# message printed below cannot disagree.
n_features_to_select = 24

# Recursive feature elimination driven by a gradient-boosting regressor.
sel = RFE(GradientBoostingRegressor(random_state=RNG_SEED), n_features_to_select=n_features_to_select)

# After fitting, `sel` holds the mask of the features that were kept.
sel.fit(X_train_scaled_normalized_original, Y_target_train)

# Reduce the training and validation matrices to the kept features so the
# selection can be judged on the validation set.
X_train_rfe = sel.transform(X_train_scaled_normalized_original)
X_val_rfe = sel.transform(X_val_scaled_normalized_original)

print(f'No of Selected Feature are : {n_features_to_select}')

# R^2 / MAE of a random forest trained on the reduced feature set.
run_randomForest_test(X_train_rfe, X_val_rfe, Y_target_train, Y_target_val)

# Names of the kept features, looked up through the boolean support mask.
features = X.columns[sel.get_support()]

print(f'Selected Features are : \n {features}\n')
print('------------------------------------------------------------')
print()
