In [26]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [14]:
# Synthetic binary-classification problem (swap in your own X, y if you have them):
# 100 samples, 200 features of which 10 are informative.
X, y = make_classification(
    n_samples=100,
    n_features=200,
    n_informative=10,
    n_classes=2,
    random_state=42,
)

# Binarize the features: entries >= 2 become 1, everything else becomes 0.
# Casting the boolean mask back to X's dtype keeps the float 0.0 / 1.0 values.
indices = X >= 2
X = indices.astype(X.dtype)

data = pd.DataFrame(X)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [25]:
# Candidate classifiers, keyed by the label printed in the report below.
models = {
    # Lasso-style (L1) logistic regression; the 'saga' solver works with the L1 penalty.
    "Logistic Regression": LogisticRegression(
        penalty='l1',
        solver='saga',
        max_iter=1000,
        random_state=0,
    ),
    "Gaussian Naive Bayes": GaussianNB(),
}

# 10 shuffled folds with a fixed seed, shared by every model.
cv = KFold(n_splits=10, shuffle=True, random_state=0)

for key, estimator in models.items():
    print(f"---------------\nmodel: {key}")

    # Cross-validated accuracy of this estimator on the shared folds.
    scores = cross_val_score(estimator, X, y, cv=cv, scoring='accuracy')

    # Report the per-fold accuracies followed by their mean.
    print("Accuracy scores for each fold:", scores)
    print("Average accuracy score:", scores.mean())

---------------
model: Logistic Regression
Accuracy scores for each fold: [0.5 0.5 0.7 0.7 0.5 0.8 0.4 0.4 0.7 0.4]
Average accuracy score: 0.56
---------------
model: Gaussian Naive Bayes
Accuracy scores for each fold: [0.6 0.8 0.6 0.7 0.9 0.8 0.4 0.6 0.4 0.5]
Average accuracy score: 0.6300000000000001


In [19]:
# Number of repeated random train/test splits; a fresh model is fitted on each.
k = 20

# One flattened coefficient vector per repetition.
coefficients = []

# Fit an L1-penalised (Lasso) logistic regression on k different random splits.
# The loop variable is called `seed` rather than `_`: it is actually used (as the
# random_state of both the split and the solver), and assigning to `_` inside
# IPython/Jupyter shadows the "last output" variable of the same name.
for seed in range(k):
    # 80% training / 20% held-out. The held-out part is not used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # 'saga' solver works for the L1 penalty.
    model = LogisticRegression(penalty='l1', solver='saga', max_iter=1000, random_state=seed)

    # `clf` is kept as a handle on the fitted model for interactive inspection,
    # e.g. scoring or predicting on (X_test, y_test).
    clf = model.fit(X_train, y_train)

    # Record this repetition's coefficients as a 1-D vector.
    coefficients.append(model.coef_.flatten())

# Rows are repetitions; column labels are taken from `data`.
df = pd.DataFrame(index=range(k), columns=data.columns, data=coefficients)
# Keep only the features whose coefficient is non-zero in at least one repetition.
nonzero_cols = np.any(df.to_numpy() != 0, axis=0)
df = df.iloc[:, nonzero_cols]
df

Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000000000001
Accuracy scores for each fold: [0.6  0.65 0.6  0.55 0.7 ]
Average accuracy score: 0.6200000

Unnamed: 0,9,12,13,15,17,20,23,24,26,30,...,146,158,163,165,168,174,182,194,197,198
0,-0.049285,-0.392516,0.200994,0.0,0.0,0.0,-1.241031,1.895696,0.0,0.0,...,0.0,0.0,0.330888,0.0,-0.306569,0.0,0.0,0.0,0.0,0.0
1,0.0,-0.614295,0.0,0.0,0.0,0.0,-1.637231,1.004601,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.267918,0.0,0.0
2,-0.089979,-0.795482,0.499721,0.0,0.322376,0.0,-1.115489,1.486806,-0.222668,0.0,...,0.0,0.683829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,-1.466712,0.615319,0.0,-0.113296,...,0.0,0.416777,0.363668,-0.011927,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.198256,-0.049086,0.0,0.0,0.0,0.0,-0.657309,1.511042,0.0,0.0,...,0.0,0.910133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,-0.777721,1.380665,0.0,0.0,...,0.0,0.0,0.57938,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,-0.146327,-0.590204,1.149837,0.0,0.0,...,0.0,0.517177,0.484347,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,-0.003153,0.0,0.0,0.0,0.0,-0.71739,1.851319,0.0,-0.247144,...,0.0,0.633822,0.326717,0.0,0.0,0.50318,0.0,0.0,0.0,0.0
8,0.0,-0.396784,0.272607,0.0,0.145467,0.0,-2.598691,0.930071,-0.458774,0.0,...,0.0,0.0,0.110778,-0.014598,0.0,0.0,0.0,0.0,-0.401462,0.0
9,0.0,-0.482671,0.620176,0.0,0.0,-0.070911,-1.082903,1.194504,0.0,0.0,...,0.0,0.636146,0.0,0.0,0.0,0.0,0.046808,0.0,0.0,0.0
