In [10]:
!pip install numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [11]:
import numpy as np
from collections import Counter
from sklearn.datasets import make_classification
X,Y = make_classification(n_samples=100000, n_classes=2, weights=[0.995,0.005], flip_y=0)

In [12]:
print(np.bincount(Y))
print(np.mean(Y))
print(Counter(Y))

[99500   500]
0.005
Counter({np.int64(0): 99500, np.int64(1): 500})


In [13]:
print(np.shape(X))
print(np.shape(Y))

(100000, 20)
(100000,)


In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
print(np.shape(x_train))
print(np.shape(x_test))
print(np.shape(y_train))
print(np.shape(y_test))

(80000, 20)
(20000, 20)
(80000,)
(20000,)


In [15]:
classes, counts = np.unique(y_train, return_counts = True)

In [16]:
#minority = 1
A = x_train[y_train == 1] #creating minority matrix named A.
datapoints_to_add = counts[0]-counts[1]


In [17]:
def euclidean_distance(X):
    D = np.zeros((X.shape[0],X.shape[0]))
    for i in range(X.shape[0]):
        for j in range(X.shape[0]):
            D[i,j] = np.sqrt(np.sum((X[i]-X[j])**2))
    return D

def knn(X, k):
    D = euclidean_distance(X)
    np.fill_diagonal(D, np.inf)
    return np.argpartition(D, kth=k-1, axis=1)[:,:k]

def smote(A, n_new, k=5, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    n, d = A.shape
    if n<2:
        raise ValueError("Need at least 2 minority samples for SMOTE")
    if k<1 or k >= n:
        raise ValueError("k must be in range [1,n-1].")
    
    nbrs = knn(A, k=k)
    base_index = rng.integers(low=0, high=n, size=n_new)
    ngbr_choice = rng.integers(low=0, high=k, size=n_new)
    ngbr_index = nbrs[base_index, ngbr_choice]
    u = rng.random(n_new).astype(A.dtype)

    P = A[base_index]
    Q = A[ngbr_index]
    Z = P + (u[:,None] * (Q-P))
    return Z

In [18]:
A_syn = smote(A, datapoints_to_add)
y_syn = np.full(shape=(A_syn.shape[0],), fill_value=1, dtype=y_train.dtype)

xp_train = np.vstack([x_train, A_syn])
yp_train = np.concatenate([y_train, y_syn])

In [19]:
#Task 3
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [20]:
imbmodel = LogisticRegression()
imbmodel.fit(x_train, y_train)
y_pred_imb = imbmodel.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_imb)
print(accuracy)
print(classification_report(y_test, y_pred_imb))


0.99575
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19892
           1       0.87      0.25      0.39       108

    accuracy                           1.00     20000
   macro avg       0.93      0.62      0.69     20000
weighted avg       1.00      1.00      0.99     20000



In [21]:
bmodel = LogisticRegression()
bmodel.fit(xp_train, yp_train)
y_pred_bal = bmodel.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_bal)
print(accuracy)
print(classification_report(y_test, y_pred_bal))

0.9029
              precision    recall  f1-score   support

           0       1.00      0.90      0.95     19892
           1       0.05      0.92      0.09       108

    accuracy                           0.90     20000
   macro avg       0.52      0.91      0.52     20000
weighted avg       0.99      0.90      0.94     20000



In [22]:
#analysis
#minority class = 1
from sklearn.metrics import precision_recall_fscore_support
p,r,f,s = precision_recall_fscore_support(y_test, y_pred_bal)
pp,rp,fp,sp = precision_recall_fscore_support(y_test, y_pred_imb)

print(p[1], r[1], f[1], s[1])
print(pp[1], rp[1], fp[1], sp[1])

print()
print("Recall for minority class in balancned set", r[1])
print("Recall for minority class in imbalancned set", rp[1])


0.04872047244094488 0.9166666666666666 0.09252336448598131 108
0.8709677419354839 0.25 0.38848920863309355 108

Recall for minority class in balancned set 0.9166666666666666
Recall for minority class in imbalancned set 0.25


In [23]:
print("For fraud detection system, we cannot ignore actual positive that is predicted negative, therefore False Negatives should be consider in the matrix, which means RECALL is important metric for same.")
print("Smote typically improves recall by exposing the model to more minority examples which are synthetically created")
print("Therefore, the SMOTE-balanced model is generally preferable when recall is the priority.")

For fraud detection system, we cannot ignore actual positive that is predicted negative, therefore False Negatives should be consider in the matrix, which means RECALL is important metric for same.
Smote typically improves recall by exposing the model to more minority examples which are synthetically created
Therefore, the SMOTE-balanced model is generally preferable when recall is the priority.
