In [3]:
# 1. Load libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import pickle

# 2. Load processed data
# NOTE(review): this is an absolute path under one user's home directory, so
# the cell only runs on that machine -- consider a project-relative path.
df = pd.read_csv('/Users/tamanna/Documents/mlproject/task1/data/processed/online_course_completion_clean.csv')

# 2a. Sample for baseline to save memory & time
# DataFrame.sample (replace=False) raises ValueError when asked for more rows
# than the frame holds, so cap the request at the number of available rows.
df = df.sample(n=min(20000, len(df)), random_state=42)

# 3. Baseline: accuracy obtained by always predicting the most frequent class
y = df['completed_course']
modes = y.mode()                 # most frequent value(s) of the target
majority = modes[0]
is_majority = y == majority      # boolean mask: rows belonging to that class
baseline_acc = is_majority.mean()
print("Majority-class baseline:", majority, "accuracy:", baseline_acc)

# 4. Features & target
target_col = 'completed_course'
y = df[target_col]
X = df.drop(columns=[target_col])

# 5. Split: 80/20 hold-out, stratified on the target so both parts keep the
#    same class proportions
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 6. Numeric & categorical columns
# Partition EVERY feature into exactly one of the two groups. Selecting only
# int64/float64/object would leave columns of any other dtype (e.g. bool,
# category, int32/float32) in neither list, and ColumnTransformer silently
# drops unlisted columns by default (remainder='drop').
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

# 7. Preprocessing: standardise numeric features, one-hot encode the rest
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    # handle_unknown='ignore': a category not seen during fit is encoded as
    # all zeros at transform time instead of raising an error.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# 8. Pipeline: preprocessing followed by a logistic-regression classifier
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(solver='liblinear', random_state=42)),
])

# 9. Train the full pipeline (preprocessing + classifier) on the training split
pipe.fit(X_train, y_train)

# 10. Predict hard labels and the probability of the second class (column 1)
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]

# 11. Evaluation
# Scores computed from hard labels share one call signature, so drive them
# from a table; insertion order fixes the order of keys in the printed dict.
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
metrics = {
    metric_name: scorer(y_test, y_pred)
    for metric_name, scorer in label_scorers.items()
}
# ROC AUC is computed from the predicted probabilities rather than the labels.
metrics['roc_auc'] = roc_auc_score(y_test, y_proba)
# .tolist() turns the ndarray into nested Python lists for a plain printout.
metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred).tolist()
print(metrics)

# 12. Save model as .pkl
from pathlib import Path  # local import: only this step needs it

model_path = '../saved_models/baseline_logreg.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure
# the target folder exists before writing (no-op when it is already there).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(pipe, f)

print(f"Saved model to {model_path}")

Majority-class baseline: 0 accuracy: 0.75045
{'accuracy': 0.81, 'precision': 0.6685552407932012, 'recall': 0.4729458917835671, 'f1': 0.5539906103286385, 'roc_auc': 0.8378228809384259, 'confusion_matrix': [[2768, 234], [526, 472]]}
Saved model to ../saved_models/baseline_logreg.pkl
