In [2]:
import requests
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
import random
import math
from scipy.stats import norm

In [3]:
# Download the credit-card CSV and cache it next to the notebook.
url="https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response=requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
response.raise_for_status()
with open("Creditcard_data.csv","wb") as f:
    f.write(response.content)


In [4]:
# Read the cached CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="Creditcard_data.csv")
data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
# Partition the rows by label: Class == 0 goes to `legit`, Class == 1 to `fraud`.
legit = data.loc[data["Class"] == 0]
fraud = data.loc[data["Class"] == 1]
# Print both shapes, one per line, to show the class imbalance.
print(legit.shape, fraud.shape, sep="\n")

(763, 31)
(9, 31)


In [7]:
# Rebalance the classes with an over-sampling step (SMOTE) followed by an
# under-sampling step (RandomUnderSampler). Both steps draw random numbers, so
# each gets a fixed random_state; without it `balanced_df` (and every sample
# and accuracy derived from it below) would change on each run.
pipeline = Pipeline(steps=[
    ('o', SMOTE(random_state=42)),
    ('u', RandomUnderSampler(random_state=42)),
])
# Features are every column except the label; the label is the 'Class' column.
X = data.drop('Class', axis=1)
y = data.Class
X_balanced, y_balanced = pipeline.fit_resample(X, y)
# Re-attach the label as the last column so later cells can slice it with iloc[:, -1].
balanced_df = pd.concat([X_balanced, y_balanced], axis=1)
balanced_df.shape

(1526, 31)

In [8]:
# simple random sampling
# Sample size n = z^2 * p * (1 - p) / e^2 with z = 1.96, p = 0.5 and e = 0.05,
# truncated to an integer by int().
n = int(1.96 * 1.96 * 0.5 * 0.5 / 0.05 ** 2)
# Draw n rows without replacement; the fixed seed makes the draw repeatable.
simple_df = balanced_df.sample(random_state=42, n=n)
print(simple_df)

      Time        V1        V2        V3        V4        V5        V6  \
1439   534  0.293313  0.379500  0.939714  0.466517  0.226453 -0.994029   
76      49  0.921544 -0.067084  0.077461  0.953638  0.067412  0.016152   
1010   525 -1.540704  0.137080  1.322331 -0.451736  0.931796 -0.440525   
660    503  0.074738 -3.032009 -0.429919 -0.442721 -1.784710 -0.066668   
1132   492 -1.140421 -0.174707  1.884871  0.372218  1.262496 -0.604966   
...    ...       ...       ...       ...       ...       ...       ...   
1481   171  0.033593  0.492420  0.490663  0.134686  0.854481  0.258075   
756    574 -1.062129 -0.618574  0.615388 -3.335834  0.746649 -0.540531   
1074   258 -0.853791  1.095560 -0.349471  1.624193  0.300117 -0.400738   
867    429 -2.252251  1.096115 -0.825268  3.447937  0.052095 -1.037659   
485    358  1.106251  0.398625  0.860421  2.388862 -0.365843 -0.192568   

            V7        V8        V9  ...       V21       V22       V23  \
1439  0.440511 -0.186603 -0.054627  ..

In [9]:
# systematic sampling

# Keep every `interval`-th row by position, starting from the first row.
interval = 2
systematic_df = balanced_df.iloc[slice(None, None, interval)]
systematic_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.620000,0
2,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.500000,0
4,2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.081080,3.670000,0
6,7,-0.644269,1.417964,1.074380,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.800000,0
8,9,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.385050,-0.069733,0.094199,0.246219,0.083076,3.680000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1516,476,0.977200,0.408869,0.341263,0.554272,-0.078098,-0.755045,0.142133,-0.127440,-0.018117,...,-0.249406,-0.716732,0.133194,-0.061250,-0.160305,0.089895,0.037964,0.078137,1.218936,1
1518,488,-0.555211,1.170796,-0.666475,2.370172,-0.441277,-1.249825,-1.242167,0.603224,-1.399624,...,0.121485,-0.421701,-0.173417,0.331483,0.131437,0.136769,0.121697,-0.057530,0.634898,1
1520,132,0.873670,0.415068,0.350700,0.506545,0.013968,-0.643732,0.154399,-0.092365,-0.015898,...,-0.236337,-0.687225,0.135482,-0.210258,-0.301997,0.088478,0.058968,0.095199,2.141410,1
1522,528,-1.979626,-2.438982,2.452973,1.121359,2.432806,0.565065,-2.051815,0.769896,0.937313,...,0.415499,1.177547,0.286984,-0.832844,-0.207402,0.835344,-0.124228,-0.170176,1.490237,1


In [10]:
# stratified sampling

from sklearn.model_selection import train_test_split

# Per-stratum sample size from n = z^2 * p * (1 - p) / e^2
# with z = 1.96, p = 0.5 and e = 0.05, truncated to an integer.
n = int((1.96*1.96 * 0.5*0.5)/((0.05)**2))

# One stratum per value of the 'Class' label.
strata = balanced_df.groupby('Class')

# A single seeded generator shared across strata makes the draw reproducible
# while still giving each stratum its own random positions.
strata_rng = np.random.RandomState(42)

# Sample n rows from each stratum (capped at the stratum size so a small
# stratum does not raise).
stratified_df = strata.apply(
    lambda x: x.sample(min(n, len(x)), random_state=strata_rng)
)

stratified_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,656,499,-0.860626,-0.109137,2.112474,-1.400567,0.180269,1.329656,0.431001,0.317761,0.593762,...,-0.197635,-0.436535,-0.194840,-1.236873,0.218668,0.902383,-0.210195,-0.190458,92.820000,0
0,305,222,-0.352704,0.975517,1.603365,-0.282475,0.480561,-0.222647,0.850194,-0.265018,-0.283460,...,-0.139086,-0.062304,-0.231531,-0.412572,-0.180927,0.201127,0.178832,-0.034650,7.810000,0
0,519,387,1.248022,0.367821,0.058192,0.933295,0.443929,0.226403,0.115654,-0.059580,-0.155528,...,-0.075303,-0.098561,-0.137178,-0.940903,0.645500,-0.334234,0.044271,0.015850,10.000000,0
0,585,441,-0.565329,-0.061420,2.197934,-1.958795,-0.048529,0.748808,0.296588,-0.149943,1.209929,...,0.104993,0.885525,-0.548767,-0.711467,0.267147,0.242160,-0.306742,-0.410484,50.000000,0
0,761,580,1.267030,-0.071114,0.037680,0.512683,0.242392,0.705212,-0.226582,0.109483,0.657565,...,-0.164468,-0.177225,-0.222918,-1.245505,0.678360,0.525059,0.002920,-0.003333,12.360000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,922,47,1.216997,0.299694,0.220704,0.545815,-0.112009,-0.476216,-0.012788,-0.029694,-0.139577,...,-0.250421,-0.716020,0.111971,-0.069033,0.186615,0.113459,-0.014713,0.021170,2.690000,1
1,1160,426,-1.944048,1.538685,-0.718496,2.983060,-0.126230,-1.286752,-1.628630,0.979826,-2.071053,...,0.397299,0.049120,-0.410084,0.337537,0.136714,0.009811,0.163898,-0.138618,0.265998,1
1,1159,551,0.701393,0.373254,0.671954,0.561352,-0.020765,-1.025110,0.294003,-0.196630,-0.025395,...,-0.196958,-0.540171,0.029514,0.353883,0.264387,-0.045147,-0.043125,-0.008941,1.216190,1
1,1496,514,-2.213831,-1.116318,1.151150,0.415986,1.080568,-0.573317,0.289656,0.064726,-0.083759,...,0.126824,0.036496,0.334602,-0.257990,-0.111124,0.099861,-0.311494,-0.230850,193.290698,1


In [11]:
# cluster sampling
import numpy as np

from sklearn.cluster import KMeans

# Group the rows into 10 clusters. n_init is set explicitly so the result does
# not depend on the library version's default for it.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42).fit(balanced_df)
cluster_assignments = kmeans.labels_

# Select the clusters you want to include in the sample
selected_clusters = [0, 2, 4, 5, 8]

# Give the labels the same index as balanced_df: boolean-mask indexing aligns
# on index labels, so a fresh RangeIndex would only work by coincidence.
cluster_series = pd.Series(cluster_assignments, index=balanced_df.index)

# Create the new DataFrame containing only the rows from the selected clusters
df_cluster_sample = balanced_df[cluster_series.isin(selected_clusters)]

# Display the resulting DataFrame
df_cluster_sample

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.660000,0
19,16,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.445290,-0.446196,...,-0.295583,-0.571955,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.710000,0
59,41,0.986063,-0.202965,-0.492768,0.407691,0.305660,-0.230529,0.585028,-0.208225,-0.247503,...,-0.305874,-1.216555,-0.077602,-0.741341,0.286881,0.200347,-0.075203,0.027271,169.050000,0
63,42,-0.522666,1.009923,0.276470,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,...,0.172401,1.011543,0.069666,0.157820,-1.109224,-0.302369,0.318170,0.316910,243.660000,0
84,55,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,...,-0.047365,0.853360,-0.971600,-0.114862,0.408300,-0.304576,0.547785,-0.456297,200.010000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518,488,-0.555211,1.170796,-0.666475,2.370172,-0.441277,-1.249825,-1.242167,0.603224,-1.399624,...,0.121485,-0.421701,-0.173417,0.331483,0.131437,0.136769,0.121697,-0.057530,0.634898,1
1519,546,-1.092166,0.120050,0.997179,-0.365991,0.644821,-0.459135,0.231370,0.064922,0.021520,...,-0.202696,-0.327654,-0.177840,-0.112218,-0.215055,0.208840,-0.275459,-0.294138,1.062564,1
1521,498,1.040373,0.398931,0.333529,0.584982,-0.141085,-0.825405,0.131355,-0.146100,-0.010782,...,-0.257840,-0.740136,0.131948,0.029815,-0.074408,0.090908,0.024418,0.067509,1.234939,1
1522,528,-1.979626,-2.438982,2.452973,1.121359,2.432806,0.565065,-2.051815,0.769896,0.937313,...,0.415499,1.177547,0.286984,-0.832844,-0.207402,0.835344,-0.124228,-0.170176,1.490237,1


In [12]:
# convenience sampling
# Take the rows that are easiest to reach: the first 380 and the last 380 by position.
convenience_sample = pd.concat([balanced_df.iloc[:380], balanced_df.iloc[-380:]])
convenience_sample

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.620000,0
1,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.660000,0
2,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.500000,0
3,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.990000,0
4,2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.081080,3.670000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,498,1.040373,0.398931,0.333529,0.584982,-0.141085,-0.825405,0.131355,-0.146100,-0.010782,...,-0.257840,-0.740136,0.131948,0.029815,-0.074408,0.090908,0.024418,0.067509,1.234939,1
1522,528,-1.979626,-2.438982,2.452973,1.121359,2.432806,0.565065,-2.051815,0.769896,0.937313,...,0.415499,1.177547,0.286984,-0.832844,-0.207402,0.835344,-0.124228,-0.170176,1.490237,1
1523,172,0.153678,-0.631935,0.914660,0.673160,0.841148,0.137678,-0.739409,0.313798,0.139445,...,-0.014971,-0.042232,0.165206,-0.508059,0.041483,0.364958,-0.046589,-0.045693,2.303011,1
1524,311,-0.763365,-0.678203,1.265025,0.528576,1.484291,0.389789,-0.674678,0.401301,0.313468,...,0.093649,0.254501,0.210254,-1.159454,-0.917095,0.392396,0.087958,0.068279,1.195779,1


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Features are every column except the last; the last column is the label.
X = simple_df.iloc[:, :-1].values
y = simple_df.iloc[:, -1].values

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features using StandardScaler (fit on the training split only, so no
# statistics from the test split leak into training)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define classification models to evaluate. The tree-based models are
# randomised, so they get a fixed random_state to make the reported
# accuracies reproducible across runs.
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
    ("K Nearest Neighbors Classifier", KNeighborsClassifier()),
    ("Random Forest Classifier", RandomForestClassifier(random_state=42))
]

# Evaluate models using accuracy score
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} accuracy: {accuracy}")

Logistic Regression accuracy: 0.8181818181818182
Decision Tree Classifier accuracy: 1.0
K Nearest Neighbors Classifier accuracy: 0.7792207792207793
Random Forest Classifier accuracy: 0.987012987012987


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [None]:
# Features are every column except the last; the last column is the label.
X = systematic_df.iloc[:, :-1].values
y = systematic_df.iloc[:, -1].values

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features using StandardScaler (fit on the training split only, so no
# statistics from the test split leak into training)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define classification models to evaluate. The tree-based models are
# randomised, so they get a fixed random_state to make the reported
# accuracies reproducible across runs.
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
    ("K Nearest Neighbors Classifier", KNeighborsClassifier()),
    ("Random Forest Classifier", RandomForestClassifier(random_state=42))
]

# Evaluate models using accuracy score
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} accuracy: {accuracy}")