# **Imbalanced Dataset**
A dataset with an unequal class distribution.
E.g., suppose we have two classes, "head" and "tail". If there are 600 samples of "head" but only 60 samples of "tail", the dataset is imbalanced.

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Read the credit-card transactions CSV into a DataFrame.
csv_path = 'credit_data.csv'
card = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to get a feel for the columns.
card.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [6]:
# Preview the last five rows to check how the file ends.
card.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
71391,54278,1.039072,-0.213656,0.431251,0.918611,-0.621456,-0.667224,0.134553,-0.181427,0.400214,...,-0.167499,-0.51009,-0.079011,0.444677,0.428995,0.256921,-0.039011,0.031128,106.85,0.0
71392,54279,-5.08429,-4.002881,1.329865,-0.573442,-0.690674,0.019033,0.844001,-0.466913,0.381563,...,-0.962575,0.808686,2.249283,0.165161,1.093631,-0.2355,-0.055029,-0.27236,314.31,0.0
71393,54280,1.441869,-0.411978,-0.357381,-0.98139,-0.244335,-0.292055,-0.242842,-0.09125,-1.24176,...,-0.832744,-2.041536,0.134713,-0.883851,0.100577,0.680045,-0.083669,-0.013649,11.97,0.0
71394,54280,-0.441533,0.363429,1.434158,-2.1165,-0.582532,-1.354317,0.886887,-0.335278,0.949012,...,-0.028278,0.146103,-0.10125,0.781919,0.258258,-0.843708,0.155215,0.121668,57.01,0.0
71395,54281,1.236141,0.256586,0.175963,0.504282,-0.1976,-0.569588,-0.058326,-0.005874,-0.125664,...,-0.263713,-0.821591,0.081703,-0.056701,0.212892,,,,,


In [8]:
# Count the missing values in every column.
card.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [10]:
# Count how many transactions carry each class label.
class_counts = card['Class'].value_counts()
class_counts

Class
0.0    71218
1.0      177
Name: count, dtype: int64

This is a highly imbalanced dataset: only 177 of the 71,395 labelled transactions (about 0.25%) are fraudulent.

In [11]:
#0 --> legit transaction
#1 --> fraudulent transaction

In [13]:
# Split the transactions by label: Class 0 is legit, Class 1 is fraudulent.
# A row whose Class is missing matches neither mask and lands in neither frame.
is_legit = card['Class'] == 0
is_fraud = card['Class'] == 1
legit = card.loc[is_legit]
fraud = card.loc[is_fraud]

In [14]:
# Report (rows, columns) for the legit and fraud subsets, one per line.
print(legit.shape, fraud.shape, sep='\n')

(71218, 31)
(177, 31)


# **Undersampling**
Build a sample dataset with a similar number of legit and fraud transactions by reducing the number of legit transactions to roughly the number of fraud transactions, i.e. sampling the 71218 legit rows down to around 177.

In [15]:
# Randomly draw 200 legit rows so the legit class is roughly the size of
# the fraud class. .sample() picks rows at random; the fixed random_state
# makes the draw repeatable, so re-running the notebook on a fresh kernel
# selects the same rows and reproduces the downstream results.
legit_sample = legit.sample(n=200, random_state=42)

In [17]:
# Confirm the (rows, columns) of the sampled legit frame.
sample_shape = legit_sample.shape
print(sample_shape)

(200, 31)


In [18]:
# Concatenate the two DataFrames row-wise: sampled legit rows first,
# followed by all fraud rows.
frames = [legit_sample, fraud]
new_dataset = pd.concat(frames, axis='index')

In [19]:
# Preview the first five rows of the combined dataset (the sampled legit rows come first).
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
27088,34380,-1.596365,-0.600011,-0.657045,-1.651893,-1.541419,-0.123707,2.517731,0.164091,0.258792,...,0.483879,0.585999,1.100304,-0.032292,0.199587,-0.832829,0.119214,0.206759,560.84,0.0
48092,43492,1.083102,-1.201881,0.991803,-0.382602,-1.74468,-0.440754,-0.952107,-0.035968,-0.133107,...,0.394883,0.898303,-0.230588,0.455831,0.409979,-0.0621,0.023898,0.050671,152.25,0.0
23867,32959,-0.592406,-0.429611,1.723092,-2.916705,-0.6101,0.64942,-0.744899,0.468296,-2.054493,...,-0.080745,0.123116,-0.404481,-1.222663,0.405581,-0.078297,0.311266,0.099517,6.0,0.0
28981,35264,-1.372181,-0.529822,2.204877,0.634738,-1.280958,1.26447,1.051763,0.433367,0.282827,...,0.486855,1.05428,0.488414,0.204594,0.267482,-0.284383,0.042102,0.134463,347.0,0.0
35295,38052,1.114742,0.061822,1.533625,1.459828,-1.147879,-0.486329,-0.471939,-0.007701,0.687983,...,-0.014189,0.216001,0.047215,0.945932,0.380639,-0.445427,0.080366,0.046095,9.99,0.0


In [20]:
# Preview the last five rows of the combined dataset (the fraud rows come last).
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
68633,53076,1.296231,0.417447,0.193963,0.901644,0.130531,-0.371634,0.158126,-0.202669,-0.079512,...,-0.112114,-0.220002,-0.121022,-0.440454,0.67154,-0.413518,0.032838,0.0206,1.18,1.0
69498,53451,0.385108,1.21762,-1.953872,2.087076,-1.144225,-0.576888,-2.582865,0.64323,-1.191233,...,0.594623,0.372144,-0.310456,-0.624065,0.840216,-0.159452,0.599482,0.288916,8.0,1.0
69980,53658,-1.739341,1.344521,-0.534379,3.195291,-0.416196,-1.261961,-2.340991,0.713004,-1.416265,...,0.38318,-0.213952,-0.33664,0.237076,0.246003,-0.044228,0.510729,0.220952,0.0,1.0
70141,53727,-1.649279,1.263974,-1.050826,2.237991,-2.527889,-0.88994,-2.355254,0.854659,-1.281243,...,0.679176,0.731907,0.333045,0.392505,-0.274197,0.802349,0.390809,0.112146,112.45,1.0
70589,53937,-2.042608,1.573578,-2.372652,-0.572676,-2.097353,-0.174142,-3.03952,-1.634233,-0.594809,...,-0.723326,0.501222,-0.696892,-0.600514,0.127547,-0.786072,0.606097,0.171697,261.87,1.0


In [21]:
# Check the class balance of the undersampled dataset.
balanced_counts = new_dataset['Class'].value_counts()
balanced_counts

Class
0.0    200
1.0    177
Name: count, dtype: int64