In [112]:
%matplotlib inline

import pandas
import matplotlib.pyplot as plt
import numpy as np

# Scikit imports
from sklearn.datasets import make_classification, make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score

# Overfitting notebook

Notebook for overfitting simulation and notes.

## Discrete labels

### `make_classification`

- clusters of points normally distributed with $\sigma=1$ about vertices of an `n_informative`-dimensional hypercube with sides of length 2*`class_sep`


### Create synthetic data

In [365]:
# features
n_informative = 4
n_redundant = 2
n_total_feats = 20

# samples
n_train = 200
n_test = 200

# class parameters
n_classes = 2
weights = None # defaults to 50/50 split

# output params
shuffle = True
random_state = 0
class_sep = 1
n_clusters_per_class = 4

X, y = make_classification(
                            n_samples = n_train + n_test,
                            n_features = n_total_feats,
                            n_informative = n_informative,
                            n_redundant = n_redundant,
                            n_classes = n_classes,
                            weights = weights,
                            shuffle = shuffle,
                            random_state = random_state,
                            class_sep = class_sep,
                            n_clusters_per_class = n_clusters_per_class)

end_idx_0 = n_train // 2
start_idx_1 = (n_train // 2) + (n_test // 2)
end_idx_1 = n_train + (n_test // 2)
train_data = np.append(X[:end_idx_0, :], X[start_idx_1:end_idx_1, :], 0)
train_data = np.c_[train_data, np.append(y[:end_idx_0], y[start_idx_1:end_idx_1])]

test_data = np.append(X[end_idx_0:start_idx_1, :], X[end_idx_1:, :], 0)
test_data = np.c_[test_data, np.append(y[end_idx_0:start_idx_1], y[end_idx_1:])]

train_X = train_data[:, :-1]
train_y = train_data[:, -1]

test_X = test_data[:, :-1]
test_y = test_data[:, -1]

In [366]:
print(sum(train_y))
print(sum(test_y))

102.0
96.0


In [375]:
degrees = np.arange(1,5)


train_acc = []
test_proba = []
test_acc = []
pipelines = []
for deg in degrees:
    poly_features = PolynomialFeatures(degree=deg, 
                                       include_bias=False,
                                       interaction_only=True)
    lr_model = LogisticRegression(solver='lbfgs', 
                                  #penalty='none', 
                                  C=10000000,
                                  random_state=random_state,
                                  n_jobs=-1)
    lr_pipeline = Pipeline([("polynomial_features", poly_features),
                           ("logistic_regression", lr_model)])
    
    lr_pipeline.fit(train_X.copy(), train_y.copy())
    
    pipelines.append(lr_pipeline)
    
    train_acc.append(accuracy_score(train_y, lr_pipeline.predict(train_X)))
    test_proba.append(np.mean(lr_pipeline.predict_proba(test_X)[:, 0]))
    test_acc.append(accuracy_score(test_y, lr_pipeline.predict(test_X)))
    print(deg)
    
print(test_acc)

1
2
3
4
[0.62, 0.67, 0.575, 0.565]


In [363]:
train_acc

[0.75, 1.0, 1.0]

In [342]:
test_proba

[0.5034791175509974,
 0.5156326748810355,
 0.5255363858212497,
 0.5431345289257606,
 0.5251316294044792,
 0.5458332000115349,
 0.5396144508000041,
 0.522354837583173,
 0.5309405091323608]

[0.55, 0.6, 0.545, 0.505, 0.52, 0.51, 0.545, 0.52, 0.485]

In [344]:
np.max(train_X)

4.517117387804012

In [345]:
no_label_X = np.random.multivariate_normal(np.random.rand((n_total_feats)), 
                                           cov=np.eye(n_total_feats),
                                           size=n_test)

In [None]:
test_proba = []
test_pred = []
for pipe in pipelines:
    pred_prob = pipe.predict_proba(no_label_X)
    #print(pred_prob.shape)
    #print(pred_prob[:5,:])
    test_proba.append(np.mean(pred_prob[:, 1]))
    test_pred.append(np.sum(pipe.predict(no_label_X)))

In [349]:
test_proba

[0.43318725779486056,
 0.42784594267590864,
 0.5799342239135585,
 0.589066750563704,
 0.5723469159133173,
 0.5436274211233478,
 0.50859931179229,
 0.5219533978035353,
 0.532691651530601]

In [350]:
test_pred

[73.0, 85.0, 114.0, 118.0, 119.0, 113.0, 106.0, 111.0, 112.0]

In [353]:
for pipe in pipelines:
    print(pipe['polynomial_features'].n_output_features_)

20
210
1350
6195
21699
60459
137979
263949
431909


### Notes

- generate data from two separate distributions
- plots as $p$ increases
- train on 50/50 split
- plots of underfit, "fit," and overfit predictions as the distribution of out-of-sample data goes from 0 to 1 in terms of labels
- use `make_classification`
- use pipeline of polynomial features
- TODO do the out-of-sample people come from a different data distribution?