In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from numpy import mean, nanargmin
from numpy import std
from matplotlib import pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import cv
import shap

Using TensorFlow backend.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_merged_sorted = pd.read_excel("Nans_removed_dataset.xlsx", index_col =  [0, 1])
companies_list = df_merged_sorted.index.get_level_values('Company Name').unique().tolist()
print(companies_list)

['1-800-Flowers.Com Inc', '11 88 0 Solutions AG', '111 Inc', '17 Education & Technology Group Inc', '22nd Century Group Inc', '2G Energy AG', '2U Inc', '3M Co', '3P Learning Ltd', '5N Plus Inc', '888 Holdings PLC', 'A G Barr PLC', 'A O Smith Corp', 'A and W Revenue Royalties Income Fund', 'A2 Milk Company Ltd', 'A2B Australia Ltd', 'AAC Clyde Space AB', 'AAK AB (publ)', 'AAPICO Hitech PCL', 'AAR Corp', 'ABC Technologies Holdings Inc', 'ABC-Mart Inc', 'ABM Industries Inc', 'ACC Ltd', 'ACCO Brands Corp', 'ACS Actividades de Construccion y Servicios SA', 'ADAMA Ltd', 'ADT Inc', 'AECC Aviation Power Co Ltd', 'AECI Ltd', 'AECOM', 'AG Anadolu Grubu Holding AS', 'AGC Inc', 'AGCO Corp', 'AIA Engineering Ltd', 'AIC Mines Ltd', 'AK Alrosa PAO', 'AKR Corporindo Tbk PT', 'ALAFCO Avaiation Lease and Finance Co KSCP', 'ALD SA', 'ALS Ltd', 'AMA Group Ltd', 'AMETEK Inc', 'AMG Advanced Metallurgical Group NV', 'ANA Holdings Inc', 'ANTA Sports Products Ltd', 'AO World PLC', 'AP Moeller - Maersk A/S', 'A

In [3]:
# Check the number of missing values before filling
df_merged_sorted.isna().sum()
columns_list = list(df_merged_sorted.drop(columns=['Sum','Label',
                                    'GICS Sector Name','Identifier (RIC)']))
df_list = []

In [4]:
for company in companies_list:

    subset_df = df_merged_sorted.loc[company].bfill(axis = "rows")
    df_list.append(subset_df)

vertical_concat = pd.concat(df_list, axis=0)
vertical_concat.isna().sum() 

GICS Sector Name                                         0
Identifier (RIC)                                         0
Health&SafetyPolicy                                      2
PolicyEmployeeHealth&Safety                            307
PolicySupplyChainHealth&Safety                        1495
TrainingandDevelopmentPolicy                             2
PolicySkillsTraining                                   470
PolicyCareerDevelopment                                579
PolicyDiversityandOpportunity                          636
TargetsDiversityandOpportunity                        1760
EmployeesHealth&SafetyTeam                            1142
Health&SafetyTraining                                  636
SupplyChainHealth&SafetyTraining                      1753
SupplyChainHealth&SafetyImprovements                  1827
EmployeesHealth&SafetyOHSAS18001                      1058
FlexibleWorkingHours                                  1547
DayCareServices                                       17

In [None]:
vertical_concat.head()


In [5]:
vertical_concat_features = vertical_concat.iloc[:,2:56]
# N_neighbors is a preprocessing hyperparameter
impute_knn = KNNImputer(n_neighbors=9)
knn_imputed_df = impute_knn.fit_transform(vertical_concat_features)

In [6]:
#%% TRAINING
# Drop the row that has NaN values (last row)
X = np.array(vertical_concat_features)
y = np.array(vertical_concat["Label"])
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                   test_size=0.2, random_state=42)


In [None]:
#%%
results = list()
strategies = [str(i) for i in [1,3,5,7,9,15,18,21]]

for s in strategies:
	# create the modeling pipeline
    pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', RandomForestClassifier())])
    # define model evaluation
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(pipeline, X, y, scoring='precision', cv=cv, n_jobs=-1, error_score='raise')
    results.append(scores)
    print('Mean Precision: %.3f (%.3f)' % (mean(scores), std(scores)))

pyplot.boxplot(results, labels=strategies, showmeans=True)
pyplot.show()


In [None]:
#%%
from xgboost import XGBClassifier

xgboost_model = XGBClassifier()
xgboost_model.fit(X_train, y_train)

y_pred = xgboost_model.predict(X_test)

predictions = [round(value) for value in y_pred]

precision = precision_score(y_test, predictions)


In [None]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train,y_train)
predictions = tree_clf.predict(X_test)

rnd_clf = RandomForestClassifier(n_estimators=500)
rnd_clf.fit(X_train,y_train)
rf_predictions = rnd_clf.predict(X_test)

print("Random Forest Accuracy: ", accuracy_score(y_test, rf_predictions))
print("Random Forest Precision: ", precision_score(y_test, rf_predictions))
print("Random Forest Recall: ", recall_score(y_test, rf_predictions))
confusion_matrix(y_test,rf_predictions)

print(accuracy_score(y_test, predictions))
print(precision_score(y_test, predictions))
print(recall_score(y_test, predictions))
confusion_matrix(y_test,predictions)

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 42) 
X_res, y_res = sm.fit_resample(X_train, y_train)

print('Original dataset shape %s'%Counter(y_train))
print('Resampled dataset shape %s'%Counter(y_res))

# Test the newly populated dataset 
rf_clf_resampled = RandomForestClassifier()
rf_clf_resampled.fit(X_res,y_res)
rf_predictions_after_resampled = rf_clf_resampled.predict(X_test)

print(confusion_matrix(y_test, rf_predictions_after_resampled))
print(accuracy_score(y_test, rf_predictions_after_resampled))
print(precision_score(y_test, rf_predictions_after_resampled))
print(recall_score(y_test, rf_predictions_after_resampled))


In [7]:
from sklearn.preprocessing import LabelEncoder
from keras import metrics

#%%
def create_baseline():
	# create model
	model = Sequential()
	model.add(Dense(54, input_dim=54, activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[metrics.AUC(),
                                                                   metrics.Precision(),
                                                                   metrics.Recall()])
	return model

encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=2)

#%%
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)

2022-07-01 23:34:32.741613: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-01 23:34:32.744306: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 4. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/100
 - 18s - loss: nan - auc_1: 1.3433e-08 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00
Epoch 2/100
 - 14s - loss: nan - auc_1: 0.0000e+00 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00
Epoch 3/100


KeyboardInterrupt: 