In [2]:
import numpy as np
import seaborn as sns

In [5]:
df = sns.load_dataset('mpg')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [6]:
df.shape

(398, 9)

## Simple Random Sampling

In [9]:
random_sample = df.sample(10)
random_sample

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
106,12.0,8,350.0,180.0,4499,12.5,73,usa,oldsmobile vista cruiser
14,24.0,4,113.0,95.0,2372,15.0,70,japan,toyota corona mark ii
61,21.0,4,122.0,86.0,2226,16.5,72,usa,ford pinto runabout
63,14.0,8,400.0,175.0,4385,12.0,72,usa,pontiac catalina
211,16.5,6,168.0,120.0,3820,16.7,76,europe,mercedes-benz 280s
21,24.0,4,107.0,90.0,2430,14.5,70,europe,audi 100 ls
309,41.5,4,98.0,76.0,2144,14.7,80,europe,vw rabbit
365,20.2,6,200.0,88.0,3060,17.1,81,usa,ford granada gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
67,11.0,8,429.0,208.0,4633,11.0,72,usa,mercury marquis


## Systematic Sample

In [10]:
np.arange(0,len(df),step = 20)

array([  0,  20,  40,  60,  80, 100, 120, 140, 160, 180, 200, 220, 240,
       260, 280, 300, 320, 340, 360, 380])

In [11]:
df.iloc[np.arange(0,len(df),step = 30)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
30,28.0,4,140.0,90.0,2264,15.5,71,usa,chevrolet vega 2300
60,20.0,4,140.0,90.0,2408,19.5,72,usa,chevrolet vega
90,12.0,8,429.0,198.0,4952,11.5,73,usa,mercury marquis brougham
120,19.0,4,121.0,112.0,2868,15.5,73,europe,volvo 144ea
150,26.0,4,108.0,93.0,2391,15.5,74,japan,subaru
180,25.0,4,121.0,115.0,2671,13.5,75,europe,saab 99le
210,19.0,6,156.0,108.0,2930,15.5,76,japan,toyota mark ii
240,30.5,4,97.0,78.0,2190,14.1,77,europe,volkswagen dasher
270,21.1,4,134.0,95.0,2515,14.8,78,japan,toyota celica gt liftback


## Stratified Sample

In [12]:
df["origin"].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [13]:
df.groupby("origin",group_keys=False).apply(lambda x: x.sample(3))

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
304,37.3,4,91.0,69.0,2130,14.7,79,europe,fiat strada custom
122,24.0,4,121.0,110.0,2660,14.0,73,europe,saab 99le
180,25.0,4,121.0,115.0,2671,13.5,75,europe,saab 99le
318,29.8,4,134.0,90.0,2711,15.5,80,japan,toyota corona liftback
111,18.0,3,70.0,90.0,2124,13.5,73,japan,maxda rx3
84,27.0,4,97.0,88.0,2100,16.5,72,japan,toyota corolla 1600 (sw)
291,19.2,8,267.0,125.0,3605,15.0,79,usa,chevrolet malibu classic (sw)
12,15.0,8,400.0,150.0,3761,9.5,70,usa,chevrolet monte carlo
80,22.0,4,122.0,86.0,2395,16.0,72,usa,ford pinto (sw)


In [14]:
df["origin"].value_counts()

usa       249
japan      79
europe     70
Name: origin, dtype: int64

In [15]:
prop_stratified_sampling = df.groupby("origin",group_keys=False).apply(lambda x: x.sample(frac=0.25))

In [16]:
prop_stratified_sampling

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
218,36.0,4,79.0,58.0,1825,18.6,77,europe,renault 5 gtl
143,26.0,4,97.0,78.0,2300,14.5,74,europe,opel manta
21,24.0,4,107.0,90.0,2430,14.5,70,europe,audi 100 ls
233,29.0,4,97.0,78.0,1940,14.5,77,europe,volkswagen rabbit custom
117,29.0,4,68.0,49.0,1867,19.5,73,europe,fiat 128
...,...,...,...,...,...,...,...,...,...
379,36.0,4,98.0,70.0,2125,17.3,82,usa,mercury lynx l
365,20.2,6,200.0,88.0,3060,17.1,81,usa,ford granada gl
15,22.0,6,198.0,95.0,2833,15.5,70,usa,plymouth duster
230,15.5,8,350.0,170.0,4165,11.4,77,usa,chevrolet monte carlo landau


In [17]:
prop_stratified_sampling["origin"].value_counts()

usa       62
japan     20
europe    18
Name: origin, dtype: int64

## Cluster Smaple

In [18]:
clusters = np.random.choice(df["model_year"], size=2, replace = False)

In [19]:
clusters

array([72, 81])

In [20]:
cluster_sample = df[df["model_year"].isin(clusters)]

In [21]:
cluster_sample

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
57,24.0,4,113.0,95.0,2278,15.5,72,japan,toyota corona hardtop
58,25.0,4,97.5,80.0,2126,17.0,72,usa,dodge colt hardtop
59,23.0,4,97.0,54.0,2254,23.5,72,europe,volkswagen type 3
60,20.0,4,140.0,90.0,2408,19.5,72,usa,chevrolet vega
61,21.0,4,122.0,86.0,2226,16.5,72,usa,ford pinto runabout
62,13.0,8,350.0,165.0,4274,12.0,72,usa,chevrolet impala
63,14.0,8,400.0,175.0,4385,12.0,72,usa,pontiac catalina
64,15.0,8,318.0,150.0,4135,13.5,72,usa,plymouth fury iii
65,14.0,8,351.0,153.0,4129,13.0,72,usa,ford galaxie 500
66,17.0,8,304.0,150.0,3672,11.5,72,usa,amc ambassador sst
