In [2]:
import time
import warnings
from pathlib import Path
import numpy as np
import pandas as pd

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import xgboost as xgb
    
pd.options.display.float_format = '{:20,.5f}'.format

PIO = {"participants", "interventions", "outcomes"}

In [3]:
def hotcode(df):

    cat_cols = set(df.select_dtypes(include=['category']).columns)
    print(f"Generating one-hot coding for columns {', '.join(list(cat_cols))}")

    dummies = pd.get_dummies(df[cat_cols].copy(), columns=cat_cols, dtype=bool)

    df = downcast(pd.concat([df.drop(cat_cols, axis=1), dummies], axis=1))
    print('hotcode complete.')
          
#     dropcols = df.loc[:,(df.sum() == 0)].columns
                   
#     df = df.drop(dropcols, axis=1)

    return df


def downcast(df):
    unsigned = df.select_dtypes(include=['uint8', 'uint16', 'int8', 'int16', 'int32', 'int64']).columns
    for col in unsigned:
        df[col] = df[col].apply(pd.to_numeric, downcast='signed')

    return df
          
          
def print_attrs(obj, only_attrs=False, only_names=False):
    
    attrs = [s for s in dir(obj) if not s[:2] == '__']
    
    for attr in attrs:
        if not callable(getattr(obj, attr)):
            print(f'\n*** ATTR: xgtrain.{attr} ***')
            if not only_names: print(getattr(obj, attr))

        elif not only_attrs:
            print(f'\n*** METHOD: xgtrain.{attr} ***')
            try:
                if not only_names: print(getattr(obj, attr)())
            except:
                print('failed')
          
def write_lines(path, collection):
    with (path).open('w') as f:
        namestr = '\n'.join(list(collection))
        f.write(namestr)

# Load data

Data preparation; clear the memory after running.

In [41]:
selected_label = 'interventions'
df = pd.read_parquet('data/split/train3_100.parquet')

labels = df.copy()[PIO]
features = hotcode(
    df.copy().drop(PIO, axis=1)
)

num_pos = labels[selected_label].sum()
num_neg = len(labels[selected_label]) - num_pos
weight = num_neg / num_pos

weights = ((labels[selected_label] * weight) + 1); weights = weights/max(weights)

# features.insert(0, selected_label, df[selected_label])
# features.insert(1, 'class_weight', weights)

NameError: name 'pd' is not defined

In [5]:
# dmatrix = xgb.DMatrix(features.reset_index(drop=True), label=labels[selected_label], weight=weights)                          

In [6]:
dir_name = Path(f'data\\xgb_test\\{selected_label}')
dir_name.mkdir(exist_ok=True,parents=True)

write_lines(dir_name / 'feature_names.txt', dmatrix.feature_names)
write_lines(dir_name / 'feature_types.txt', dmatrix.feature_types)
# write_lines(dir_name / 'handle.txt', dmatrix.handle)

# Model script

In [19]:
print(list(globals().keys())[25:])



In [27]:
xg_train = dmatrix
del warnings, dump_svmlight_file, hotcode, downcast, df, labels, features, weight, weights, dmatrix

In [7]:
dmatrix.feature_names

['stopword',
 'punctuation',
 'is_upper',
 'is_lower',
 'cap_first',
 'is_int',
 'is_dec',
 'first_sent',
 'last_sent',
 'first_word',
 'last_word',
 'FIRST_WORD_LAG1',
 'FIRST_WORD_LAG2',
 'LAST_WORD_LAG-2',
 'LAST_WORD_LAG-1',
 'doc_loc',
 'sent_id',
 'sent_loc',
 'dist_to_parent',
 'PMFT_1',
 'PMFT_2',
 'PMFT_3',
 'PMFT_4',
 'PMFT_5',
 'PMFT_6',
 'PMFT_7',
 'PMFT_8',
 'PMFT_9',
 'PMFT_10',
 'PMFT_11',
 'PMFT_12',
 'PMFT_13',
 'PMFT_14',
 'PMFT_15',
 'PMFT_16',
 'PMFT_17',
 'PMFT_18',
 'PMFT_19',
 'PMFT_20',
 'PMFT_21',
 'PMFT_22',
 'PMFT_23',
 'PMFT_24',
 'PMFT_25',
 'PMFT_26',
 'PMFT_27',
 'PMFT_28',
 'PMFT_29',
 'PMFT_30',
 'PMFT_31',
 'PMFT_32',
 'PMFT_33',
 'PMFT_34',
 'PMFT_35',
 'PMFT_36',
 'PMFT_37',
 'PMFT_38',
 'PMFT_39',
 'PMFT_40',
 'PMFT_41',
 'PMFT_42',
 'PMFT_43',
 'PMFT_44',
 'PMFT_45',
 'PMFT_46',
 'PMFT_47',
 'PMFT_48',
 'PMFT_49',
 'PMFT_50',
 'PMFT_51',
 'PMFT_52',
 'PMFT_53',
 'PMFT_54',
 'PMFT_55',
 'PMFT_56',
 'PMFT_57',
 'PMFT_58',
 'PMFT_59',
 'PMFT_60',
 'PM

< can clear mem >

In [23]:
xg_train.handle

c_void_p(1792626301920)

In [34]:
dir_name = Path(f'data\\xgb_test\\{selected_label}')
dir_name.mkdir(exist_ok=True,parents=True)

print('data shape:', xg_train.num_row(), xg_train.num_col())

params = {
    'verbosity': 3,
#     'eta': ?, 
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'tree_method': 'gpu_hist',
    'evals': xg_train,
    'num_rounds': 250,
    'max_depth': 6,  # default 6
    'num_parallel_tree': 1, # default 1
    'scale_pos_weight': 7.0 # set in loop
}

params['nthread'] = 2 # sys.argv with flag

data shape: 23452 2442


In [40]:
clf = xgb.XGBClassifier(**params)

clf = xgb.train(params, xg_train)
print(f"Classifier loaded. Initalizing training...\n")
# clf.fit(params, xgtrain), # xgtrain
#         eval_set=[xgtrain, xgtest], #xgtrain, xgtest
#         eval_metric='logloss',
#         verbose=True)

#             clf.fit(X_train, y_train[label],
#                     eval_set=[(X_train, y_train[label]), (X_test, y_test[label])], #insert kcv here
#                     eval_metric='logloss',
#                     verbose=True)

[05:27:03] DEBUG: C:/Users/Administrator/workspace/xgboost-win64_release_1.0.0/src/tree/updater_gpu_hist.cu:1167: [GPU Hist]: Configure
Parameters: { evals, num_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




TypeError: expected string, got ndarray

In [None]:
hotcode(features)

In [None]:
labels

In [30]:
xgtrain

NameError: name 'xgtrain' is not defined

In [None]:
try:
    features = hotcode(features)
except ValueError:
    print('No categorical values found in data')
    
features

Saving DMatrix into a XGBoost binary file will make loading faster:

In [None]:
data = pd.DataFrame(np.arange(12).reshape((4,3)), columns=['a', 'b', 'c'])
label = pd.DataFrame(np.random.randint(2, size=4))
dtrain = xgboost.DMatrix(data, label=label)
dtrain

In [None]:
drop.axis(1)
labels

In [None]:
labels["interventions"]

dtrain = xgboost.DMatrix('train.svm.txt')
dtrain.save_binary('train.buffer')

# bin

In [50]:
# from pathlib import Path
# import xgboost as xgb
   
# xgtrain = xgb.DMatrix("train.buffer#dtrain.cache")
# xgtrain.feature_names = dmatrix.feature_names
# xgtrain.feature_types = dmatrix.feature_types
# xgtrain.handle = dmatrix.handle
# xgtrain._feature_names = dmatrix._feature_names
# xgtrain._feature_types = dmatrix._feature_types

# del dmatrix

[05:07:42] 23452x2442 matrix with 57269784 entries loaded from train.buffer#dtrain.cache


In [None]:
# features = pd.read_parquet('train3_100.parquet') # X
# labels = pd.read_parquet('data/labels_sent.parquet') # y

# # print(features.index)

# df = labels.loc[(features.index.unique('doc'), slice(None), slice(None))]

# # df = labels.loc[features.index]

# df.to_parquet('data/labels_sent_100.parquet')
# # df.to_parquet()
# print(df)

In [None]:
# ## Validation Split(s) if desired

#     k_folds = 5

#     train_idx, val_idx = train_test_split(df.index.unique('doc'),
#                                           train_size=1 - (1 / k_folds))  # default 80:20

#     print(f'Performing a {len(train_idx)}:{len(val_idx)} train/test split.')

#     train_set = df.loc[(train_idx, slice(None)), :]
#     val_set = df.loc[(val_idx, slice(None)), :]

# #     X_train_df, y_train_df = train_set[features['all']], train_set[PIO]
# #     X_test_df, y_test_df = val_set[features['all']], val_set[PIO]

#     start_time = time.time()
#     try:
#         X_train, X_test = hotcode(X_train_df), hotcode(X_test_df)
#     except ValueError:
#         print('No categorical values found in data')

#     print(f'\nOne-hot coding took {time.time() - start_time:.2f}s.\n')

In [None]:




# for feature_set, feats in features.items():
#         if feature_set != 'all': print('done'); break

#         y_pred = words.loc[y_test.index].to_frame().join(y_test)  # index=y_test.index)#, columns=labels)

#         start_time = time.time()

#         for label in labels:

#             print(f"Training a GBC model to predict label '{label}'.")
#             print('Starting XGBoost.')

#             params['scale_pos_weight'] = (len(y_train) - sum(y_train[label])) / sum(y_train[label])
#             clf = xgboost.XGBClassifier(**params)

#             clf.fit(X_train, y_train[label],
#                     eval_set=[(X_train, y_train[label]), (X_test, y_test[label])], #insert kcv here
#                     eval_metric='logloss',
#                     verbose=True)

#             evals_result = clf.evals_result()

#             clf = clf.fit(X_train, y_train[label])
#             score = clf.score(X_test, y_test[label])

#             targets = np.delete(clf.classes_, 0)

#             y_pred[f'{label}_pred'] = clf.predict(X_test)

#             report = pd.DataFrame(
#                 classification_report(y_test[label].values.flatten(), y_pred[f'{label}_pred'].values.flatten(),
#                                       labels=targets, digits=3, output_dict=True)
#             )

#             print(report)
#             time.sleep(3)

#             print(f"Saving data for run with features '{feature_set}' and target '{label}' in '{dir_name}'")

#             model_name = f'_{label}_{feature_set}'

#             save_model = True
#             if save_model:
#                 Path(f'models/{dir_name}').mkdir(exist_ok=True, parents=True)
#                 clf.save_model(f'models/{dir_name}/{model_name}.xgbm')

#             exp_folder = EXPERIMENTS / dir_name / model_name
#             exp_folder.mkdir(exist_ok=True, parents=True)

#             report.to_csv(exp_folder / 'class_report.csv')
#             y_pred.to_csv(exp_folder / 'predictions.csv')
#             pd.to_pickle(evals_result, exp_folder / 'evals_result.pickle')

#             with open(exp_folder / '.params', 'w') as f:
#                 f.write('\n'.join([f'{k}, {v}' for k, v in params.items()]))

#             pd.DataFrame(clf.predict_proba(X_test), index=y_pred.index, columns=[f'not {label}', label]) \
#               .to_csv(exp_folder / 'predict_proba.csv')

#             print(f'\n{time.time() - start_time:.2f}')

#     print('Done')


# if __name__ == '__main__':
#     main()