# Feature selection

In [1]:
from feature_selector import FeatureSelector
import pandas as pd

* Import the CSV files into dataframes
* Before loading, make sure the orange category-label row has been removed from each CSV
* Also move the original features in front of the new features
* Also remove the other targets from each set

In [2]:
# Read each per-property CSV from the SplitDB folder and drop the metadata
# columns listed in `meta` so they are not treated as features.
# Forward slashes are used as the path separator: they resolve on Windows,
# macOS and Linux, whereas the previous '\\' separators only worked on Windows.
meta = ['Reference DOI', 'Composition ID']
coercivity = pd.read_csv('SplitDB/Coercivity7-26.csv').drop(columns=meta)
core_loss = pd.read_csv('SplitDB/CoreLoss7-26.csv').drop(columns=meta)
curie_temp = pd.read_csv('SplitDB/CurieTemperature7-26.csv').drop(columns=meta)
electrical_resistivity = pd.read_csv('SplitDB/ElectricalResistivity7-26.csv').drop(columns=meta)
grain_size = pd.read_csv('SplitDB/GrainSize7-26.csv').drop(columns=meta)
magnetic_saturation = pd.read_csv('SplitDB/MagneticSaturation7-26.csv').drop(columns=meta)
magnetostriction = pd.read_csv('SplitDB/Magnetostriction7-26.csv').drop(columns=meta)
permeability = pd.read_csv('SplitDB/Permeability7-26.csv').drop(columns=meta)

# Dataframes, for reference 
* coercivity 
* core_loss 
* curie_temp 
* electrical_resistivity 
* grain_size 
* magnetic_saturation 
* magnetostriction 
* permeability

In [4]:
# Training labels: pull the target column out of each dataframe.
coercivity_labels = coercivity.loc[:, 'Coercivity']
core_loss_labels = core_loss.loc[:, 'Core Loss']
curie_temp_labels = curie_temp.loc[:, 'Curie Temp']
electrical_resistivity_labels = electrical_resistivity.loc[:, 'Electrical Resistivity']
grain_size_labels = grain_size.loc[:, 'Grain Diameter']
magnetic_saturation_labels = magnetic_saturation.loc[:, 'Magnetic Saturation']
magnetostriction_labels = magnetostriction.loc[:, 'Magnetostriction']
permeability_labels = permeability.loc[:, 'Permeability']

In [5]:
# Training features: everything in each dataframe except its target column.
coercivity_features = coercivity.drop('Coercivity', axis='columns')
core_loss_features = core_loss.drop('Core Loss', axis='columns')
curie_temp_features = curie_temp.drop('Curie Temp', axis='columns')
electrical_resistivity_features = electrical_resistivity.drop('Electrical Resistivity', axis='columns')
grain_size_features = grain_size.drop('Grain Diameter', axis='columns')
magnetic_saturation_features = magnetic_saturation.drop('Magnetic Saturation', axis='columns')
magnetostriction_features = magnetostriction.drop('Magnetostriction', axis='columns')
permeability_features = permeability.drop('Permeability', axis='columns')

In [6]:
# One FeatureSelector per target, pairing that target's features with its labels.
fs_coercivity = FeatureSelector(
    data=coercivity_features, labels=coercivity_labels)
fs_core_loss = FeatureSelector(
    data=core_loss_features, labels=core_loss_labels)
fs_curie_temp = FeatureSelector(
    data=curie_temp_features, labels=curie_temp_labels)
fs_electrical_resistivity = FeatureSelector(
    data=electrical_resistivity_features, labels=electrical_resistivity_labels)
fs_grain_size = FeatureSelector(
    data=grain_size_features, labels=grain_size_labels)
fs_magnetic_saturation = FeatureSelector(
    data=magnetic_saturation_features, labels=magnetic_saturation_labels)
fs_magnetostriction = FeatureSelector(
    data=magnetostriction_features, labels=magnetostriction_labels)
fs_permeability = FeatureSelector(
    data=permeability_features, labels=permeability_labels)

In [7]:
# Run every identification method on the coercivity features
# (missing values, single unique value, collinearity, feature importance).
fs_coercivity.identify_all(
    selection_params={
        'missing_threshold': 0.5,
        'correlation_threshold': 0.95,
        'task': 'regression',
        'eval_metric': 'l1',
        'cumulative_importance': 0.95,
    }
)

4 features with greater than 0.50 missing values.

0 features with a single unique value.

15 features with a correlation magnitude greater than 0.95.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[120]	valid_0's l1: 190.099
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[131]	valid_0's l1: 143.998
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[717]	valid_0's l1: 308.1
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[979]	valid_0's l1: 256.209
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 251.318
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[876]	valid_0's l1: 309.318
Training until validation scores 

In [9]:
# Run every identification method on the Curie temperature features.
# This target uses 0.99 for both the correlation and cumulative-importance cut-offs.
fs_curie_temp.identify_all(
    selection_params={
        'missing_threshold': 0.5,
        'correlation_threshold': 0.99,
        'task': 'regression',
        'eval_metric': 'l1',
        'cumulative_importance': 0.99,
    }
)

4 features with greater than 0.50 missing values.

0 features with a single unique value.

20 features with a correlation magnitude greater than 0.99.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[12]	valid_0's l1: 24.9997
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[998]	valid_0's l1: 14.8009
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[996]	valid_0's l1: 33.2468
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 18.6517
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[20]	valid_0's l1: 42.5464
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[998]	valid_0's l1: 24.0921
Trainin

In [10]:
#fs_electrical_resistivity.identify_all(selection_params = {'missing_threshold': 0.5, 'correlation_threshold': 0.95, 
#                                    'task': 'regression', 'eval_metric': 'l1', 
#                                     'cumulative_importance': 0.95})

In [11]:
# Run every identification method on the grain size features.
fs_grain_size.identify_all(
    selection_params={
        'missing_threshold': 0.5,
        'correlation_threshold': 0.95,
        'task': 'regression',
        'eval_metric': 'l1',
        'cumulative_importance': 0.95,
    }
)

6 features with greater than 0.50 missing values.

0 features with a single unique value.

13 features with a correlation magnitude greater than 0.95.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[954]	valid_0's l1: 3.7626
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[973]	valid_0's l1: 2.87316
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[990]	valid_0's l1: 2.86748
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[469]	valid_0's l1: 2.91933
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[127]	valid_0's l1: 4.25741
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[152]	valid_0's l1: 2.40558
Training until valid

In [12]:
# Run every identification method on the magnetic saturation features.
# Correlation cut-off is 0.95; cumulative importance is kept up to 0.99.
fs_magnetic_saturation.identify_all(
    selection_params={
        'missing_threshold': 0.5,
        'correlation_threshold': 0.95,
        'task': 'regression',
        'eval_metric': 'l1',
        'cumulative_importance': 0.99,
    }
)

2 features with greater than 0.50 missing values.

0 features with a single unique value.

28 features with a correlation magnitude greater than 0.95.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[390]	valid_0's l1: 0.0803961
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[427]	valid_0's l1: 0.0696476
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[972]	valid_0's l1: 0.0798001
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[977]	valid_0's l1: 0.0776766
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[259]	valid_0's l1: 0.0670665
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[474]	valid_0's l1: 0.0517401
Training until valid

In [13]:
# Run every identification method on the magnetostriction features.
# This target uses 0.99 for both the correlation and cumulative-importance cut-offs.
fs_magnetostriction.identify_all(
    selection_params={
        'missing_threshold': 0.5,
        'correlation_threshold': 0.99,
        'task': 'regression',
        'eval_metric': 'l1',
        'cumulative_importance': 0.99,
    }
)

6 features with greater than 0.50 missing values.

0 features with a single unique value.

19 features with a correlation magnitude greater than 0.99.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[86]	valid_0's l1: 2.09474
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.36537
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[466]	valid_0's l1: 1.17018
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[446]	valid_0's l1: 1.76963
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[991]	valid_0's l1: 2.07973
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[117]	valid_0's l1: 1.30538
Training until validation scores

In [14]:
# Run every identification method on the permeability features.
fs_permeability.identify_all(
    selection_params={
        'missing_threshold': 0.5,
        'correlation_threshold': 0.95,
        'task': 'regression',
        'eval_metric': 'l1',
        'cumulative_importance': 0.95,
    }
)

5 features with greater than 0.50 missing values.

0 features with a single unique value.

18 features with a correlation magnitude greater than 0.95.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[605]	valid_0's l1: 9854.2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[369]	valid_0's l1: 11431.4
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 11409
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[404]	valid_0's l1: 9822.38
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[999]	valid_0's l1: 8552.66
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[267]	valid_0's l1: 10959
Training until validation scores don

In [15]:
# Drop every feature flagged by any identification method, one-hot features
# included (keep_one_hot=False). Core loss and electrical resistivity are
# left disabled, as in the original cell.
coercivity_removed_all = fs_coercivity.remove(
    methods='all', keep_one_hot=False)
#core_loss_removed_all = fs_core_loss.remove(methods = 'all', keep_one_hot = False)
curie_temp_removed_all = fs_curie_temp.remove(
    methods='all', keep_one_hot=False)
#electrical_resistivity_removed_all = fs_electrical_resistivity.remove(methods = 'all', keep_one_hot = False)
grain_size_removed_all = fs_grain_size.remove(
    methods='all', keep_one_hot=False)
magnetic_saturation_removed_all = fs_magnetic_saturation.remove(
    methods='all', keep_one_hot=False)
magnetostriction_removed_all = fs_magnetostriction.remove(
    methods='all', keep_one_hot=False)
permeability_removed_all = fs_permeability.remove(
    methods='all', keep_one_hot=False)

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 40 features including one-hot features.
['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 39 features including one-hot features.
['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 38 features including one-hot features.
['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 43 features including one-hot features.
['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 31 features including one-hot features.
['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 42 features including one-hot features.


In [16]:
# Names of the columns that remain after feature removal, per target.
# `.columns` yields the same labels as iterating the dataframe, without first
# building a full `.iloc[:, :]` slice of the data just to read its header.
coercivity_best = list(coercivity_removed_all.columns)
#core_loss_best = list(core_loss_removed_all.columns)
curie_temp_best = list(curie_temp_removed_all.columns)
#electrical_resistivity_best = list(electrical_resistivity_removed_all.columns)
grain_size_best = list(grain_size_removed_all.columns)
magnetic_saturation_best = list(magnetic_saturation_removed_all.columns)
magnetostriction_best = list(magnetostriction_removed_all.columns)
permeability_best = list(permeability_removed_all.columns)

In [17]:
# Save the kept coercivity feature names, one per line.
with open('kept_coercivity.txt', 'w') as file_handler:
    file_handler.writelines("{}\n".format(item) for item in coercivity_best)

In [18]:
#with open('kept_core_loss.txt', 'w') as file_handler:
#    for item in core_loss_best:
#         file_handler.write("{}\n".format(item))

In [19]:
# Save the kept Curie temperature feature names, one per line.
with open('kept_curie_temp.txt', 'w') as file_handler:
    file_handler.writelines("{}\n".format(item) for item in curie_temp_best)

In [20]:
#with open('kept_electrical_resistivity.txt', 'w') as file_handler:
#    for item in electrical_resistivity_best:
#        file_handler.write("{}\n".format(item))

In [21]:
# Save the kept grain size feature names, one per line.
with open('kept_grain_size.txt', 'w') as file_handler:
    file_handler.writelines("{}\n".format(item) for item in grain_size_best)

In [22]:
# Save the kept magnetic saturation feature names, one per line.
with open('kept_magnetic_saturation.txt', 'w') as file_handler:
    file_handler.writelines("{}\n".format(item) for item in magnetic_saturation_best)

In [23]:
# Save the kept magnetostriction feature names, one per line.
with open('kept_magnetostriction.txt', 'w') as file_handler:
    file_handler.writelines("{}\n".format(item) for item in magnetostriction_best)

In [24]:
# Save the kept permeability feature names, one per line.
# NOTE(review): the output filename is spelled 'kept_permability.txt'
# (missing an 'e'); left unchanged in case other code reads that exact name.
with open('kept_permability.txt', 'w') as file_handler:
    file_handler.writelines("{}\n".format(item) for item in permeability_best)