# training_model.py
# Script that loads the training data, fits a model, evaluates it, and runs a model-interpretation step.
# Import necessary packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from imbalanced import ensemble_model, simple_model
# from Interpret import Interpret, white_box
from Interpret import Interpret
from data_loader import preprocess_dataframe, split_and_normalize
# from imbalanced import simple_model, ensemble_model
from utilities import prediction_evaluation
from active_learning import create_and_implement_strategy
# Experiment switches, used as truthy/falsy flags.
# SIMPLE_IMBALANCE is referenced only by the commented-out imbalance step in
# main(): truthy would pick simple_model, falsy would pick ensemble_model.
SIMPLE_IMBALANCE = 0
# INTERPRET chooses the interpretation step at the end of main():
# truthy -> shap_interpret(), falsy -> feature_importance().
INTERPRET = 1
def main():
    """Run the training pipeline end to end.

    Steps, in order: read the feature and label CSVs from ``data/``, pass them
    through ``preprocess_dataframe``, obtain train/test sets from
    ``split_and_normalize`` (using the "random" variant), fit a random forest
    wrapped in ``Interpret``, report the evaluation of its predictions, and
    run the interpretation step selected by the module-level ``INTERPRET`` flag.
    """
    # --- Load raw inputs: features first, then target labels ---------------
    raw_features = pd.read_csv("data/training_set_features.csv")
    raw_labels = pd.read_csv("data/training_set_labels.csv")

    # TODO: future work -- encode through a Preprocessor and inspect categories:
    #   temp = Preprocessor(raw_features, raw_labels).encode()
    #   print(temp.named_transformers_["object"].categories_)

    # --- Preprocess ---------------------------------------------------------
    data, labels = preprocess_dataframe(raw_features, raw_labels)

    # One-off step (disabled): create a global train/test split saved in pickle.
    #   create_dataset_splits(data, labels)

    # Active-learning experiments (disabled); each strategy was run and the
    # resulting learning curves plotted:
    #   strategy1_examples = create_and_implement_strategy("QueryInstanceUncertainty", data, labels, queries)
    #   strategy2_examples = create_and_implement_strategy("QueryInstanceRandom", data, labels, queries)
    #   strategy3_examples = create_and_implement_strategy("QueryInstanceQBC", data, labels, 1000)
    #   plot_learning_curves(strategy1_examples, strategy2_examples, strategy3_examples)

    # --- Train/test split ---------------------------------------------------
    # Variant handed to split_and_normalize. Other values used previously:
    #   "None"        -> plain train/test split
    #   "uncertainty" -> active-learning uncertainty-sampling dataset
    #   "qbc"         -> active-learning QBC-sampling dataset
    dataset_variant = "random"  # active-learning random-sampling dataset
    x_train, x_test, y_train, y_test = split_and_normalize(
        data, labels, dataset_variant
    )

    # Optional step from the `imbalanced` module (disabled, import is commented out):
    #   if SIMPLE_IMBALANCE:
    #       x_train, y_train = simple_model(x_train, y_train)
    #   else:
    #       x_train, y_train = ensemble_model(x_train, y_train)

    # --- White-box interpretation (disabled) --------------------------------
    # Columns that the white-box run removed before fitting; the list is only
    # consumed by the commented-out lines below.
    white_box_dropped_columns = [
        'h1n1_concern',
        'income_poverty',
        'household_children',
        'household_adults',
        'behavioral_antiviral_meds',
        'race',
        'child_under_6_months',
        'chronic_med_condition',
        'education',
        'behavioral_wash_hands',
        'behavioral_outside_home',
        'behavioral_touch_face',
        'behavioral_face_mask',
        'behavioral_avoidance',
        'rent_or_own',
        'sex',
        'employment_status',
        'hhs_geo_region',
        'behavioral_large_gatherings',
    ]
    #   x_train = x_train.drop(white_box_dropped_columns, axis=1)
    #   x_test = x_test.drop(white_box_dropped_columns, axis=1)
    #   white_box(x_train, y_train, x_test, y_test)
    #   return

    # --- Black-box model: fit, predict, evaluate ----------------------------
    forest = RandomForestClassifier(n_estimators=750, random_state=1, max_depth=5)
    black_box = Interpret(forest)
    # Alternative estimator tried earlier: Interpret(LinearSVC())
    black_box.fit(x_train, y_train)
    prediction = black_box.predict(x_test, y_test)
    prediction_evaluation(prediction=prediction, y_test=y_test)

    # --- Interpretation -----------------------------------------------------
    if INTERPRET:
        black_box.shap_interpret()
    else:
        black_box.feature_importance()
    # Further interpretation methods (disabled):
    #   black_box.lime(num_features=len(x_train.columns))
    #   black_box.surrogate(LogisticRegression())
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not start a training run.
if __name__ == "__main__":
    main()
# EOF