# MAT 690 Example Dataset

Wei Li

In [1]:
import pandas as pd
import numpy as np
import torch
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

In [None]:
# %pip install watermark
%load_ext watermark
%watermark -a "Wei Li" -u -t -d -v -p numpy,torch,soundfile,librosa


This is data on a job training program (the treatment) that was intended to raise
future earnings (the outcome). The outcome is income in 1978 (`re78`); incomes are recorded in dollars. The data combines observations from the original experiment (445 observations) with additional control observations from the PSID, totalling $2675$ observations ($185$ treated and $2490$ controls).


$$
\begin{array}{ll}
\hline \text { Variable } & \text { Description } \\
\hline \text { age } & \text { Age in years } \\
\text { educ } & \text { Years of education } \\
\text { black } & 1=\text { Black; } 0 \text { otherwise } \\
\text { hisp } & 1=\text { Hispanic; } 0 \text { otherwise } \\
\text { married } & 1=\text { married; } 0 \text { otherwise } \\
\text { nodegr } & 1=\text { no degree; } 0 \text { otherwise } \\
\text { re74 } & 1974 \text { income}\\
\text { re75 } & 1975 \text { income}  \\
\text { re78 } & 1978 \text { income} \\
\text { T } & 1=\text { received treatment; } 0 \text { otherwise } \\
\hline
\end{array}
$$

In [2]:
# Load the merged PSID/LDW file and discard the u74 and u75 columns
raw_data_all = pd.read_csv('psid_LDW_merged.csv').drop(columns=["u74", "u75"])

In [3]:
# preview the first five rows
raw_data_all.head(n=5)

Unnamed: 0,T,age,educ,black,hisp,married,nodegr,re74,re75,re78
0,1,37,11,1,0,1,1,0.0,0.0,9930.046
1,1,22,9,0,1,0,1,0.0,0.0,3595.894
2,1,30,12,1,0,0,0,0.0,0.0,24909.45
3,1,27,11,1,0,0,1,0.0,0.0,7506.146
4,1,33,8,1,0,0,1,0.0,0.0,289.7899


In [4]:
# summary statistics for every column
print(raw_data_all.describe())
print()

# counts of 0/1 for the treatment indicator T and for the indicator re78 > 0
np.bincount(raw_data_all['T']), np.bincount(raw_data_all["re78"].gt(0))
# (array([2490,  185]), array([ 331, 2344]))
# 185 / 2675: about 7% enrolled in the training program
# 331 / 2675: about 12% had zero income in 1978

                 T          age         educ        black         hisp  \
count  2675.000000  2675.000000  2675.000000  2675.000000  2675.000000   
mean      0.069159    34.225794    11.994393     0.291589     0.034393   
std       0.253772    10.499842     3.053556     0.454579     0.182269   
min       0.000000    17.000000     0.000000     0.000000     0.000000   
25%       0.000000    25.000000    10.000000     0.000000     0.000000   
50%       0.000000    32.000000    12.000000     0.000000     0.000000   
75%       0.000000    43.500000    14.000000     1.000000     0.000000   
max       1.000000    55.000000    17.000000     1.000000     1.000000   

           married       nodegr           re74           re75           re78  
count  2675.000000  2675.000000    2675.000000    2675.000000    2675.000000  
mean      0.819439     0.333084   18230.003096   17850.893766   20502.375641  
std       0.384726     0.471404   13722.251526   13877.777180   15632.519749  
min       0.00000

### Task 1: predict re78 using all other variables

In [5]:
# Separate the predictors (every column except re78) from the response (re78)
y = raw_data_all['re78']
X = raw_data_all.drop(columns='re78')

# Hold out 30% of the rows as a test set using train_test_split()
seed = 2022
test_size = 0.3
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    stratify=raw_data_all['T'],
)
# stratify keeps the proportion of treated units the same in the train and test sets

In [6]:
# number of control (0) and treated (1) rows in each split
tuple(np.bincount(split['T']) for split in (x_train, x_test))

(array([1743,  129]), array([747,  56]))

In [7]:
# Most algorithms (tree-based methods excepted) need the predictors on a common scale.
# The scaler learns its means and standard deviations from the training rows only
# and then applies that same transformation to both splits.

stdscaler = StandardScaler().fit(x_train)
x_train_std = stdscaler.transform(x_train)
x_test_std = stdscaler.transform(x_test)

In [8]:
# define our own Dataset class

class MyDataset(Dataset):
    """In-memory dataset that pairs a feature array with a target array.

    Parameters
    ----------
    x : array-like
        Features; anything ``np.asarray`` accepts. The first axis indexes samples.
    y : array-like
        Targets, one entry per sample along the first axis.

    Raises
    ------
    ValueError
        If ``x`` or ``y`` is a scalar, or if they hold different numbers of samples.
    """

    # constructor
    # In this case it contains the data
    def __init__(self, x, y):
        # torch.tensor copies its input, so the dataset never aliases the caller's arrays;
        # both tensors are stored as 32-bit floats.
        xs = torch.tensor(np.asarray(x), dtype=torch.float)
        ys = torch.tensor(np.asarray(y), dtype=torch.float)

        # Fail early: without this check a length mismatch would only surface
        # later as an index error (or silently unused targets) during iteration.
        if xs.ndim == 0 or ys.ndim == 0:
            raise ValueError("x and y must have a leading sample dimension")
        if xs.shape[0] != ys.shape[0]:
            raise ValueError(
                f"x and y must have the same number of samples, "
                f"got {xs.shape[0]} and {ys.shape[0]}"
            )

        self.xs = xs
        self.ys = ys

    #len()
    # returns the length of the dataset
    def __len__(self):
        """Return the number of samples."""
        return self.xs.shape[0]

    #[]
    # returns the item at index i
    # this tells how a single data point is loaded
    def __getitem__(self, i):
        """Return the ``(features, target)`` pair stored at index ``i``."""
        return self.xs[i], self.ys[i]

In [9]:
# Wrap the standardized features and their targets in MyDataset objects
# (train_data[0,] would return the first (features, target) pair)

train_data, test_data = (
    MyDataset(x=x_train_std, y=y_train),
    MyDataset(x=x_test_std, y=y_test),
)

In [10]:
# Create instances of dataloader

torch.manual_seed(1)  # makes the shuffled order of the training batches reproducible
batch_size = 4
# the training data is reshuffled at every epoch
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
# evaluation gains nothing from shuffling; a fixed order keeps the batches
# aligned with the row order of the test set
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

# for epoch in range(2):
#     print(f'epoch {epoch+1}')
#     for idx, batch in enumerate(train_loader):
#         print(f'batch {idx}:', 'x:', batch[0], '\n y:', batch[1])

### Task 2: predict if re78>0 using all other variables

In [11]:
# Predictors: every column except re78.
# Response: the binary indicator 1{re78 > 0} (not re78 itself).
y = raw_data_all['re78'].gt(0).astype(int)
X = raw_data_all.drop(columns='re78')

# Hold out 30% of the rows as a test set using train_test_split().
# Stratifying on the (response, T) pair balances both the outcome and the
# treatment indicator across the splits; stratify=y would balance the outcome only.
seed = 2022
test_size = 0.3
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    stratify=pd.concat([y, X['T']], axis='columns'),
)

In [12]:
# 0/1 counts of the response and of the treatment indicator in each split
tuple(np.bincount(labels) for labels in (y_train, y_test, x_train['T'], x_test['T']))

(array([ 232, 1640]),
 array([ 99, 704]),
 array([1742,  130]),
 array([748,  55]))

The rest of the procedure is the same as in Task 1: standardize the features, wrap the data in `MyDataset`, and create the data loaders.