# Capturing greenhouse gases with data

## Feature Engineering and Preprocessing

### by Zachary Brown

Now that I've completed the initial data analysis, it's time to process the data so it will work with our models. I'll start by installing the necessary libraries for this step.

In [1]:
!pip install scikit-learn==1.2.1



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme('notebook')
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression

Next I'll begin loading the dataframe and checking the columns still present.

In [3]:
data = pd.read_csv('../data/interim/eda.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,filename,unit_cell_volume,Density,accessible_surface_area,volumetric_surface_area,gravimetric_surface_area,inaccessible_surface_area,inac_grav_surf_area,inac_vol_surf_area,...,D_func-alpha-2-all,D_func-alpha-3-all,order_f-lig,bool_f-lig,order_mc,bool_mc,order_func,bool_func,order_lc,bool_lc
0,0,DB0-m2_o1_o10_f0_pcu.sym.66.cif,901.788,1.23322,87.4832,970.108,786.644,0.0,0.0,0.0,...,41.88578,23.764297,43831,True,18963,True,21096,True,4072,True
1,1,DB0-m3_o23_o23_f0_pcu.sym.74.cif,7545.84,0.537679,1566.33,2075.75,3860.57,0.0,0.0,0.0,...,4.4,17.857187,100001,False,44145,True,100001,False,42138,True
2,2,DB0-m2_o8_o25_f0_pcu.sym.91.cif,4172.23,0.371648,771.93,1850.16,4978.27,0.0,0.0,0.0,...,-7.433333,-5.745183,100001,False,100001,False,100001,False,100001,False
3,3,DB0-m29_o82_o46_f0_pts.sym.1.cif,1715.11,0.786327,378.905,2209.23,2809.55,0.0,0.0,0.0,...,-6.0,-12.0,100001,False,100001,False,100001,False,100001,False
4,4,DB0-m29_o99_o470_f0_pts.sym.128.cif,2552.97,0.754924,419.589,1643.53,2177.08,0.164038,0.642539,0.851131,...,-10.339258,-17.664703,100001,False,100001,False,84717,True,100001,False


Ok, so first I want to drop unnecessary columns. That will include 'Unnamed: 0' since it's the old index, and anything with the prefixes order_ and bool_ because they're not explained in the source material so it's unclear what they represent. I'm also going to drop 'Crystalnet' because it is almost completely redundant with 'likely topology' which is the topology feature I'll be dummying.  

In [5]:
drops = []
for col in data.columns:
    if 'order_' in col:
        drops.append(col)
    elif 'bool_' in col:
        drops.append(col)

In [6]:
drops

['order_geo',
 'bool_geo',
 'order_f-lig',
 'bool_f-lig',
 'order_mc',
 'bool_mc',
 'order_func',
 'bool_func',
 'order_lc',
 'bool_lc']

In [7]:
drops.append('Unnamed: 0')
drops.append('Crystalnet')

In [8]:
drops

['order_geo',
 'bool_geo',
 'order_f-lig',
 'bool_f-lig',
 'order_mc',
 'bool_mc',
 'order_func',
 'bool_func',
 'order_lc',
 'bool_lc',
 'Unnamed: 0',
 'Crystalnet']

In [9]:
data.drop(columns=drops, inplace=True)

In [10]:
data.head()

Unnamed: 0,filename,unit_cell_volume,Density,accessible_surface_area,volumetric_surface_area,gravimetric_surface_area,inaccessible_surface_area,inac_grav_surf_area,inac_vol_surf_area,accessible_volume_per_uc,...,D_func-Z-3-all,D_func-T-1-all,D_func-T-2-all,D_func-T-3-all,D_func-S-1-all,D_func-S-2-all,D_func-S-3-all,D_func-alpha-1-all,D_func-alpha-2-all,D_func-alpha-3-all
0,DB0-m2_o1_o10_f0_pcu.sym.66.cif,901.788,1.23322,87.4832,970.108,786.644,0.0,0.0,0.0,26.0256,...,20.0,-2.0,1.0,0.333333,0.333333,1.64,0.666667,10.8,41.88578,23.764297
1,DB0-m3_o23_o23_f0_pcu.sym.74.cif,7545.84,0.537679,1566.33,2075.75,3860.57,0.0,0.0,0.0,2364.41,...,36.0,-1.333333,-2.666667,-2.666667,0.146667,0.293333,1.12,2.2,4.4,17.857187
2,DB0-m2_o8_o25_f0_pcu.sym.91.cif,4172.23,0.371648,771.93,1850.16,4978.27,0.0,0.0,0.0,2102.05,...,13.222222,-0.888889,-2.0,-0.888889,-0.022222,-0.042222,0.568889,-3.733333,-7.433333,-5.745183
3,DB0-m29_o82_o46_f0_pts.sym.1.cif,1715.11,0.786327,378.905,2209.23,2809.55,0.0,0.0,0.0,281.586,...,4.0,-2.0,-2.0,-4.0,-0.04,-0.04,-0.08,-6.0,-6.0,-12.0
4,DB0-m29_o99_o470_f0_pts.sym.128.cif,2552.97,0.754924,419.589,1643.53,2177.08,0.164038,0.642539,0.851131,268.47,...,12.395833,-2.416667,-2.0,-3.625,-0.056667,0.226667,0.2,-8.5,-10.339258,-17.664703


In [11]:
data.set_index('filename', inplace=True)

Ok, now the columns have been pared down a bit, now it's time to identify which columns need to be converted from categorical to dummies.

In [12]:
for col in data.columns:
    if data[col].dtypes == 'object':
        print(col)

likely topology


In [13]:
pd.get_dummies(data, columns = ['likely topology'], drop_first = True)

Unnamed: 0_level_0,unit_cell_volume,Density,accessible_surface_area,volumetric_surface_area,gravimetric_surface_area,inaccessible_surface_area,inac_grav_surf_area,inac_vol_surf_area,accessible_volume_per_uc,volume_fraction,...,likely topology_xux,likely topology_ylf,likely topology_yug,likely topology_zec,likely topology_znp,likely topology_zsn,likely topology_zul,likely topology_zxc,likely topology_zyg,likely topology_zyl
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DB0-m2_o1_o10_f0_pcu.sym.66.cif,901.788,1.233220,87.4832,970.108,786.644,0.000000,0.000000,0.000000,26.0256,0.02886,...,0,0,0,0,0,0,0,0,0,0
DB0-m3_o23_o23_f0_pcu.sym.74.cif,7545.840,0.537679,1566.3300,2075.750,3860.570,0.000000,0.000000,0.000000,2364.4100,0.31334,...,0,0,0,0,0,0,0,0,0,0
DB0-m2_o8_o25_f0_pcu.sym.91.cif,4172.230,0.371648,771.9300,1850.160,4978.270,0.000000,0.000000,0.000000,2102.0500,0.50382,...,0,0,0,0,0,0,0,0,0,0
DB0-m29_o82_o46_f0_pts.sym.1.cif,1715.110,0.786327,378.9050,2209.230,2809.550,0.000000,0.000000,0.000000,281.5860,0.16418,...,0,0,0,0,0,0,0,0,0,0
DB0-m29_o99_o470_f0_pts.sym.128.cif,2552.970,0.754924,419.5890,1643.530,2177.080,0.164038,0.642539,0.851131,268.4700,0.10516,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB0-m2_o12_o16_f0_pcu.sym.10.cif,1358.680,0.754709,290.7150,2139.690,2835.120,0.000000,0.000000,0.000000,189.8350,0.13972,...,0,0,0,0,0,0,0,0,0,0
DB0-m3_o160_o480_f0_fsc.sym.50.cif,1243.540,0.972493,216.3000,1739.390,1788.590,0.000000,0.000000,0.000000,154.4720,0.12422,...,0,0,0,0,0,0,0,0,0,0
DB0-m3_o7_o15_f0_pcu.sym.26.cif,3245.820,0.460190,607.9870,1873.140,4070.370,0.000000,0.000000,0.000000,1383.1700,0.42614,...,0,0,0,0,0,0,0,0,0,0
DB0-m2_o9_o11_f0_nbo.sym.43.cif,5025.910,0.784130,842.4600,1676.240,2137.700,0.000000,0.000000,0.000000,799.8230,0.15914,...,0,0,0,0,0,0,0,0,0,0


In [14]:
data.drop(columns = 'likely topology', inplace = True)

Ok, none of the other columns appear to be categorical, so now I should be in the clear to separate my X and y variables, then train/test split them.

In [15]:
y = data[['v/v_working_capacity']]
X = data.drop(columns='v/v_working_capacity')

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 15)

In [17]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(204750, 849) (51188, 849) (204750, 1) (51188, 1)


I need to convert those y sets into 1 dimensional arrays.

In [18]:
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [19]:
y_train.shape, y_test.shape

((204750,), (51188,))

Next I need to impute any missing values in the X datasets. I'll train a simple imputer on the X_train set using the most frequent value, then transform both X_train and X_test using that same imputer.

In [20]:
from sklearn.impute import SimpleImputer

mode_imputer = SimpleImputer(strategy='most_frequent')

for col in X_train.columns:
    if X_train[col].isna().sum() != 0:
        X_train[col] = mode_imputer.fit_transform(X_train[col].values.reshape(-1,1))
        X_test[col] = mode_imputer.transform(X_test[col].values.reshape(-1,1))
        
X_train_imp = X_train.copy()
X_test_imp = X_test.copy()

In [21]:
X_train.isna().sum().sum()

0

Now that there are no missing values I'll scale both X sets using a standard scaler, training on the X_train, and transforming both X sets based on that fit.

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train_imp)
scaler.transform(X_train_imp)
scaler.transform(X_test_imp)

array([[-0.55932985,  1.39451244, -0.72404569, ...,  0.54180977,
         0.33179527,  0.30376641],
       [-0.54616764,  0.65108426, -0.65606188, ..., -0.22254302,
        -0.35812643, -0.2786721 ],
       [-0.38631625,  0.69999289, -0.62904959, ...,  0.54180977,
         0.33179527,  0.30376641],
       ...,
       [ 1.32318317, -0.97755093,  1.97276498, ..., -0.81848793,
         0.06957653, -0.27833261],
       [ 2.35608813, -1.40420988,  2.20401977, ..., -0.80148421,
        -0.05007456,  0.2868402 ],
       [-0.36049336, -0.55279797, -0.32881476, ..., -0.46254337,
        -0.0792748 , -0.18933221]])

Next I need to reduce the number of features to the square root of the number of entries I have in the dataset. I'll use the select K best feature from scikit-learn using f-regression as the metric to identify the features worth keeping.

In [23]:
print(np.sqrt(204750))

452.49309386995066


In [24]:
selector = SelectKBest(f_regression, k=450)
transformed = selector.fit_transform(X_train_imp, y_train)

In [25]:
features = selector.get_support(indices=True)

In [26]:
X_train_selected = X_train_imp.iloc[:,features]
X_test_selected = X_test_imp.iloc[:,features]
X_train_selected.shape, X_test_selected.shape

((204750, 450), (51188, 450))

That's the last step for this portion of the project. I'll export these split datasets and move on to the modeling next.

In [27]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [28]:
X_train_selected.to_csv('../data/processed/X_train.csv')
X_test_selected.to_csv('../data/processed/X_test.csv')
y_train.to_csv('../data/processed/y_train.csv')
y_test.to_csv('../data/processed/y_test.csv')