In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Opt in to the pandas "future.no_silent_downcasting" behaviour for every cell below.
pd.set_option('future.no_silent_downcasting', True)

## 1. Preparing datasets

### Real dataset

In [9]:
# Load the banana-quality data and encode the target column: Good -> 1, Bad -> 0.
# Built as a single chain (no inplace mutation) so re-running the cell is idempotent.
banana_df = (
    pd.read_csv('./data/banana_quality.csv')
    .replace({'Quality': {'Good': 1, 'Bad': 0}})
    # 'future.no_silent_downcasting' is enabled in this notebook, so replace()
    # does not convert the former string column to a numeric dtype on its own;
    # infer_objects() performs that conversion explicitly.
    .infer_objects()
)
banana_df.head()

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,-1.924968,0.468078,3.077832,-1.472177,0.294799,2.43557,0.27129,1
1,-2.409751,0.48687,0.346921,-2.495099,-0.892213,2.067549,0.307325,1
2,-0.357607,1.483176,1.568452,-2.645145,-0.647267,3.090643,1.427322,1
3,-0.868524,1.566201,1.889605,-1.273761,-1.006278,1.873001,0.477862,1
4,0.651825,1.319199,-0.022459,-1.209709,-1.430692,1.078345,2.812442,1


### Artificial dataset

In [46]:
# Settings for the synthetic data set generated in the next cell.
n_observations = 1_000  # number of rows to simulate
b = 1                   # coefficient assigned to each of the 5 informative features
k = 20                  # number of additional features whose coefficient is 0

In [50]:
# Synthetic binary-classification data: the first 5 features carry coefficient b,
# the remaining k features carry coefficient 0 (they do not influence the label).
n_features = 5 + k

betas = np.array([b] * 5 + [0] * k)

# Seeded generator so that re-running the notebook reproduces the same data set.
rng = np.random.default_rng(42)

# Every feature is an independent standard-normal draw; generate the whole matrix at once.
X = rng.standard_normal((n_observations, n_features))

# Logistic model: P(y = 1 | x) = 1 / (1 + exp(-betas . x)), evaluated for all rows together.
p_positive = 1 / (1 + np.exp(-(X @ betas)))

# One Bernoulli draw per row; cast to float so y keeps the float dtype it had
# when it was pre-allocated with np.zeros.
y = rng.binomial(n=1, p=p_positive).astype(float)

## 2. Classification models

In [61]:
def calculate_classification_error(predictions, y_test):
    """Return the fraction of predictions that differ from the true labels.

    Parameters
    ----------
    predictions : sequence
        Predicted labels, one per test observation.
    y_test : sequence
        True labels, in the same order as ``predictions``.

    Returns
    -------
    float
        Number of mismatching pairs divided by ``len(y_test)``.

    Raises
    ------
    ValueError
        If the two inputs have different lengths. Pairing them with ``zip``
        would otherwise drop the surplus elements silently and report a
        wrong error rate.
    ZeroDivisionError
        If ``y_test`` is empty.
    """
    if len(predictions) != len(y_test):
        raise ValueError(
            f"predictions and y_test must have the same length, "
            f"got {len(predictions)} and {len(y_test)}"
        )

    misclassified = sum(
        1 for y_pred, y_true in zip(predictions, y_test) if y_pred != y_true
    )
    return misclassified / len(y_test)


### Logistic Regression

In [52]:
# Logistic regression with an L2 penalty. C is not passed, so the library default is used.
# NOTE(review): the trailing "C = 1000" remark is not applied by this line — confirm
# whether a weaker regularisation was intended.
model = LogisticRegression(penalty='l2') # C = 1000

* Train-test split

In [54]:
# Hold out 20% of (X, y) for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [60]:
# Fit on the training split and predict the held-out split in one chain
# (scikit-learn's fit() returns the fitted estimator itself).
predictions = model.fit(X_train, y_train).predict(X_test)

# Share of misclassified test observations; shown as the cell output.
calculate_classification_error(predictions, y_test)

0.245

* Cross validation