In [1]:
# Star import from the project's `imports` module.
# NOTE(review): presumably this supplies pd, np, plt, the sklearn helpers and the
# project classes used below — confirm against imports.py and prefer explicit imports.
from imports import *
# Read the project configuration from ./config.yml.
config = load_config("./config.yml")

In [2]:

# Display every column whenever a frame is rendered.
pd.set_option('display.max_columns', None)

# Raw data: pipe-delimited file read without a header row.
df = pd.read_csv('./data/data.csv', header=None, sep='|')

# Configuration holding the column names grouped by datatype.
config = load_config("./config.yml")


In [3]:
# Preprocessing pipeline: the return value of every step is rebound to `df`.
# NOTE(review): whether `pre` keeps its own internal copy of the frame between
# steps is not visible here, so the call order must stay exactly as written.
pre = preprocessing(df, config)
df = pre.rename_cols()
df = pre.fillna()
df = pre.remove_outliers()
df = pre.drop_duplicates()
df = pre.convert_dtypes()
df = pre.labelencode()


In [4]:
# Feature engineering: the return value of every step is rebound to `df`.
# The instance is bound to `fe` instead of `feat_engg`: rebinding the class name to
# its own instance makes this cell fail on a re-run (the second run would try to
# call the instance instead of the class).
fe = feat_engg(df, config)
df = fe.split_datetime_col()
# NOTE(review): the original comment here was cut off ("time difference between
# the ...") — confirm which columns the difference is taken between.
df = fe.cal_time_diff()
df = fe.categorify_columns()
df = fe.count_encode_columns()

In [5]:
# Feature selection: one selector call for continuous columns, one for categorical.
feat_sel = feature_selection(df, config)
cont_feature = feat_sel.cont_feature_oneway_anova()
cat_feature = feat_sel.cat_feature_mutual_info()

# Continuous picks first, categorical picks after.
final_features = [*cont_feature, *cat_feature]

# Target vector and the reduced feature matrix.
y = df['target']
X = df[final_features]

In [6]:
# Build the training helper on the selected features, split the data and fit the
# baseline model, which is kept as `lr`.
# NOTE(review): the output under this cell shows equal counts for both target
# classes, presumably printed by one of these calls after resampling — confirm.
train = train_model(df, final_features, config)
X_train, X_test, y_train, y_test = train.split_data()
lr = train.base_model()

0    39668
1    39668
Name: target, dtype: int64




In [None]:
# Evaluate the baseline model on the held-out split.
y_pred = lr.predict(X_test)
# ROC AUC is a ranking metric, so it is computed from the positive-class probability
# (as the undersampling cell further down already does) rather than from hard labels.
# NOTE(review): assumes `lr` exposes predict_proba — confirm for train.base_model().
y_score = lr.predict_proba(X_test)[:, 1]

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred))
print("ROC AUC Score: ", roc_auc_score(y_test, y_score))

# plot_confusion_matrix was removed from scikit-learn (1.2);
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it.
from sklearn.metrics import ConfusionMatrixDisplay
# Plot confusion matrix with figsize
fig, ax = plt.subplots(figsize=(10, 10))
ConfusionMatrixDisplay.from_estimator(lr, X_test, y_test, cmap=plt.cm.Blues, ax=ax)
plt.show()

In [None]:
# Run the random-forest hyperparameter search and keep its result as `best_params`
# (later unpacked as keyword arguments into RandomForestClassifier), then train
# the forest, kept as `rf`.
# NOTE(review): whether train_random_forest() reuses the tuned parameters
# internally is not visible here — confirm.
best_params = train.hyperparameter_tuning_randomforest()
rf = train.train_random_forest()

In [None]:
# Evaluate the random forest on the held-out split.
y_pred = rf.predict(X_test)
# ROC AUC is a ranking metric, so it is computed from the positive-class probability
# rather than from hard labels.
# NOTE(review): assumes `rf` exposes predict_proba — confirm for train.train_random_forest().
y_score = rf.predict_proba(X_test)[:, 1]

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred))
print("ROC AUC Score: ", roc_auc_score(y_test, y_score))

# plot_confusion_matrix was removed from scikit-learn (1.2);
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it.
from sklearn.metrics import ConfusionMatrixDisplay
# Plot confusion matrix with figsize
fig, ax = plt.subplots(figsize=(10, 10))
ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, cmap=plt.cm.Blues, ax=ax)
plt.show()

In [None]:
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_score,
)

# Two splitters are created; only `cv` (10 folds x 3 repeats) is used in this cell.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Forest configured with the tuned parameters plus out-of-bag scoring.
rf_stra = RandomForestClassifier(oob_score=True, **best_params)

# Precision for every fold of the repeated stratified CV, on all workers.
scores = cross_val_score(
    rf_stra,
    df[final_features],
    df['target'],
    scoring='precision',
    cv=cv,
    n_jobs=-1,
)

In [None]:
# Display the per-fold precision values returned by cross_val_score.
scores

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold stratified splitter and a fresh forest with the tuned parameters.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_stra = RandomForestClassifier(oob_score=True, **best_params)

# Several metrics per fold; the result is a dict of arrays keyed 'test_<metric>'.
cross = cross_validate(
    rf_stra,
    df[final_features],
    df['target'],
    scoring=['f1', 'roc_auc', 'precision', 'recall', 'accuracy'],
    cv=skf,
    n_jobs=-1,
)

In [None]:
# List the entries available in the cross_validate result dict.
cross.keys()

In [None]:
# Mean of each cross-validated metric across the folds.
print("F1 Score: ", cross['test_f1'].mean())
print("ROC AUC Score: ", cross['test_roc_auc'].mean())
print("Precision: ", cross['test_precision'].mean())
# The "Recall" line previously reported cross['test_accuracy']; each label now
# reads from its own metric, and accuracy is reported separately.
print("Recall: ", cross['test_recall'].mean())
print("Accuracy: ", cross['test_accuracy'].mean())

In [None]:
# Mean precision over all CV folds. The call parentheses were missing, so the cell
# displayed the bound method object instead of a number.
scores.mean()

In [None]:
# Mean of the per-fold precision values.
np.mean(scores)

In [None]:
# Display the per-fold precision values again.
scores

In [None]:
# NOTE(review): identical to the cell directly above — candidate for deletion.
scores

In [None]:
# `scores` is the array returned by cross_val_score and has no .keys(); the dict
# with named entries is `cross`, the cross_validate result.
# NOTE(review): assumes the intent was to inspect the cross_validate output — confirm.
cross.keys()

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TODO: SMOTE resampling for the class imbalance is not applied in this cell.



In [None]:
# Model training: fit the baseline model through the `train` helper.
# NOTE(review): which data split base_model() trains on is internal to the
# helper and not visible here — confirm.

base_model = train.base_model()


In [None]:
# Hyperparameter tuning for the random forest; the return value is discarded in
# this cell.
train.hyperparameter_tuning_randomforest()
# Train the random forest and keep it as `rf_model`.
# NOTE(review): presumably uses the parameters found by the tuning call — confirm.
rf_model = train.train_random_forest()

In [None]:
# Logistic regression (SAG solver) fitted on the current training split.
lr = LogisticRegression(solver='sag', max_iter=1000, random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# ROC AUC is a ranking metric: score it with the positive-class probability
# (column 1 of predict_proba), not with hard 0/1 labels.
y_score = lr.predict_proba(X_test)[:, 1]

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred))
print("ROC AUC Score: ", roc_auc_score(y_test, y_score))

In [None]:
# Rebuild the feature matrix and target, split 80/20, then oversample with SMOTE.
# Only the training portion is resampled; the test split is left as drawn.
y = df['target']
X = df[final_features]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)


# TODO: import classification_report and roc_auc_score from sklearn.metrics



In [None]:
# Undersampling: shrink the majority class of the TRAINING split to the size of
# the minority class. The test split is left untouched.
X = df[final_features]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-attach the target to the training features. This frame gets its own name:
# the original rebound `X` to it, leaving a global `X` that contained the target
# column and only the training rows.
train_data = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
not_fraud = train_data[train_data.target == 0]
fraud = train_data[train_data.target == 1]
not_fraud_downsampled = resample(not_fraud,
                                replace = False, # sample without replacement
                                n_samples = len(fraud), # match minority n
                                random_state = 27) # reproducible results

In [None]:
# Stack the down-sampled target==0 rows on top of the target==1 rows.
downsampled = pd.concat([not_fraud_downsampled, fraud])


In [None]:
# Fit a logistic regression on the balanced (undersampled) training frame and
# evaluate it on the test split.
y_train = downsampled.target
X_train = downsampled.drop('target', axis=1)

undersampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)

undersampled_pred = undersampled.predict(X_test)
# accuracy_score was a bare expression in the middle of the cell, so its value
# was computed and silently dropped; print it instead.
print("Accuracy:", accuracy_score(y_test, undersampled_pred))
print(classification_report(y_test, undersampled_pred))


In [None]:
# Accuracy and confusion matrix of the undersampled model's test predictions.
print(accuracy_score(y_test, undersampled_pred))
print(confusion_matrix(y_test, undersampled_pred))

In [None]:
# Precision / recall of the undersampled model's hard predictions.
print("Precision:", precision_score(y_test, undersampled_pred))
print("Recall:",recall_score(y_test, undersampled_pred))

# Positive-class probabilities (column 1) drive both the AUC and the ROC curve.
undersampled_probs = undersampled.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, undersampled_probs)
print('AUC: %.2f' % auc)

# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, undersampled_probs)

# The figure now carries axis labels, a title and a legend so it can be read
# on its own; the plotted data is unchanged.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No skill')
ax.plot(fpr, tpr, marker='.', label='Undersampled model (AUC = %.2f)' % auc)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()


In [None]:
# Class counts of the down-sampled training frame.
downsampled.target.value_counts()

In [None]:
# Plot ROC curve for the logistic-regression model `lr`.
# The curve is built from predicted probabilities: the hard 0/1 labels in `y_pred`
# that were used before give roc_curve only one operating point to draw.
# NOTE(review): assumes `lr` is the model `y_pred` came from and that X_test/y_test
# are the matching split — confirm when running cells out of order.
y_score = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

In [None]:
## Undersample 


In [None]:
from sklearn.model_selection import StratifiedKFold

y = df['target']
X = df[final_features]
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Walk the folds. The four split variables are overwritten on every pass, so
# after the loop they hold the last fold only.
for train_index, test_index in kfold.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]


In [None]:
# Join the current training features and training target column-wise.
rt = pd.concat([X_train, y_train], axis=1)

In [None]:
# Stratified k-fold demo on a synthetic, heavily imbalanced (99:1) dataset.
# Note: this cell rebinds the globals X and y to the synthetic arrays.
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=1000, n_classes=2, weights=[0.99, 0.01], flip_y=0, random_state=1)
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

# Report the class composition of every train/test fold.
for train_ix, test_ix in kfold.split(X, y):
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    train_0 = int((train_y == 0).sum())
    train_1 = int((train_y == 1).sum())
    test_0 = int((test_y == 0).sum())
    test_1 = int((test_y == 1).sum())
    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

In [None]:
# Wrap the current `y` in a one-column DataFrame named 'target'.
# NOTE(review): when cells run top to bottom, `y` here is the synthetic label
# array from the make_classification cell, not df['target'] — confirm intent.
y =pd.DataFrame(y, columns=['target'])

In [None]:
# Class counts of the current `y`.
y.value_counts()

In [None]:
# Percentage 660 / 666.
# NOTE(review): both numbers are unexplained literals — derive them from the data
# or document where they come from.
(660/666)*100

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Target and the full configured feature set (not the selected subset).
y = df['target']
X = df[config['all_features']]

# 80/20 split, then SMOTE on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, cross_val_score


# Logistic regression fitted on the current (SMOTE-resampled) training split;
# fit() returns the estimator itself, so `smote` is the fitted model.
smote = LogisticRegression(solver='liblinear')
smote.fit(X_train, y_train)

smote_pred = smote.predict(X_test)

# Accuracy, F1 and recall on the test split, printed in that order.
for metric in (accuracy_score, f1_score, recall_score):
    print(metric(y_test, smote_pred))


In [None]:
# NOTE(review): in the original notebook this cell is an exact copy of the one
# before it — candidate for deletion.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, f1_score,
    roc_auc_score, roc_curve, recall_score, precision_score,
)
from sklearn.model_selection import cross_val_score, GridSearchCV


# Fit on the current training split, then predict the test split.
smote = LogisticRegression(solver='liblinear')
smote = smote.fit(X_train, y_train)
smote_pred = smote.predict(X_test)

# Printed in order: accuracy, F1, recall.
smote_accuracy = accuracy_score(y_test, smote_pred)
print(smote_accuracy)
smote_f1 = f1_score(y_test, smote_pred)
print(smote_f1)
smote_recall = recall_score(y_test, smote_pred)
print(smote_recall)


In [None]:

from functools import partial

# Cast the candidate categorical columns to int64 before scoring them.
df[config['cat_cols_for_feature_selection']] = df[config['cat_cols_for_feature_selection']].astype('int64')

# mutual_info_classif is randomised; pin its seed so the selection is repeatable.
# NOTE(review): these look like integer-encoded categoricals — consider also
# passing discrete_features=True; confirm against the preprocessing code.
fs = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=7)
X = df[config['cat_cols_for_feature_selection']]
y = df['target']
x_best = fs.fit_transform(X, y)

# The message used to say "top 5" although k is 7; it is now derived from fs.k.
print('original feature names:', X.columns)
print('selected top %d features:' % fs.k, X.columns[fs.get_support()])



In [None]:

# Rank the continuous candidate columns with the ANOVA F-test and keep the best k.
fs = SelectKBest(score_func=f_classif, k=7)
X = df[config['cont_cols_for_feature_selection']]
y = df['target']
fs.fit_transform(X, y)

# The message used to say "top 5" although k is 7; it is now derived from fs.k.
print('original feature names:', X.columns)
print('selected top %d features:' % fs.k, X.columns[fs.get_support()])

In [None]:
# Column labels of the working frame.
df.columns

In [None]:
# NOTE(review): identical to the cell directly above — candidate for deletion.
df.columns

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestClassifier

# Exhaustive search over every 4- and 5-feature subset, scored by 2-fold ROC AUC.
esf = ExhaustiveFeatureSelector(RandomForestClassifier(), min_features=4, max_features=5, scoring='roc_auc', print_progress=True, cv=2)

# `df_engg_` is not defined anywhere in this notebook (NameError on a fresh
# kernel); the features are taken from `df`, the same frame the target comes from,
# so rows stay aligned.
# NOTE(review): assumes `df` is the feature-engineered frame `df_engg_` referred to — confirm.
X = df[config['all_features']]
y = df['target']

In [None]:
# Fit the exhaustive selector and map the winning column positions back to names.
# Names are read from X, the frame actually passed to fit(), instead of the
# undefined `df_engg_`.
esf = esf.fit(X, y)
select_features = X.columns[list(esf.best_idx_)]
print(select_features)

In [None]:
from category_encoders import LeaveOneOutEncoder

# Target and the columns listed under 'categorify_cols' in the config.
y = df['target']
X = df[config['categorify_cols']]

# Leave-one-out encoding fitted and applied on the same frame.
loe = LeaveOneOutEncoder()
X_loe = loe.fit_transform(X, y)

# Preview of the encoded columns.
X_loe.head()