In [None]:
# Download the lead-scoring CSV from GitHub into the current working directory.
# `-O` writes to a fixed filename, so re-running this cell overwrites the previous copy.
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv -O course_lead_scoring.csv

--2025-10-16 16:46:54--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-16 16:46:54 (9.44 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [None]:
# Read the lead-scoring data that was downloaded in the first cell.
df = pd.read_csv('course_lead_scoring.csv')

# Partition columns by dtype: object columns are treated as categorical,
# numeric columns as numerical. The target `converted` is numeric, so it is
# taken back out of the feature list.
categorical_cols = list(df.select_dtypes(include='object').columns)
numerical_cols = list(df.select_dtypes(include=np.number).columns)
numerical_cols.remove('converted')

# Constant imputation: 'NA' for categorical features, 0.0 for numerical ones.
fill_values = {
    **dict.fromkeys(categorical_cols, 'NA'),
    **dict.fromkeys(numerical_cols, 0.0),
}
df = df.fillna(value=fill_values)

# 60/20/20 split: hold out 20% for test, then a quarter of the remaining 80%
# (i.e. 20% of the whole) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Target arrays, one per split.
y_train, y_val, y_test = (
    part.converted.values for part in (df_train, df_val, df_test)
)

print("Data preparation and splitting complete.\n")

Data preparation and splitting complete.



In [None]:
# --- Question 1: ROC AUC Feature Importance ---
# Use each numerical feature on its own as the "prediction" and score it with
# ROC AUC. When the score is below 0.5 the feature is re-scored with its sign
# flipped, so every feature is reported by how well it separates the classes.
auc_scores = {}
for col in numerical_cols:
    feature = df_train[col]
    direct_auc = roc_auc_score(y_train, feature)
    auc_scores[col] = roc_auc_score(y_train, -feature) if direct_auc < 0.5 else direct_auc

# First feature (in column order) with the largest AUC.
highest_auc_feature = max(auc_scores.items(), key=lambda item: item[1])[0]
print(f"Answer 1: The numerical variable with the highest AUC is '{highest_auc_feature}'.\n")

Answer 1: The numerical variable with the highest AUC is 'number_of_courses_viewed'.



In [None]:
# --- Question 2: Training the model and calculating AUC ---
feature_cols = categorical_cols + numerical_cols

# The vectorizer is fitted on the training rows only and then reused, unchanged,
# to encode the validation rows into a dense matrix.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[feature_cols].to_dict(orient='records'))
X_val = dv.transform(df_val[feature_cols].to_dict(orient='records'))

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the validation AUC.
y_pred_proba = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_pred_proba)
print(f"Answer 2: The AUC on the validation dataset is {val_auc:.3f}, which is closest to 0.72.\n")

Answer 2: The AUC on the validation dataset is 0.817, which is closest to 0.72.



In [None]:
# --- Question 3: Precision and Recall Intersection ---
# Thresholds 0.00, 0.01, ..., 1.00. Dividing integers by 100 yields exactly 101
# correctly rounded values; a float-step arange produces values such as
# 0.5700000000000001 and an element count that depends on rounding.
thresholds = np.arange(101) / 100

# Ground truth does not depend on the threshold, so it is computed once.
actual_positive = (y_val == 1)

scores = []
for t in thresholds:
    predict_positive = (y_pred_proba >= t)

    tp = (predict_positive & actual_positive).sum()
    fp = (predict_positive & ~actual_positive).sum()
    fn = (~predict_positive & actual_positive).sum()

    # Fall back to 0 when a ratio is undefined (nothing predicted positive,
    # or no actual positives) to avoid dividing by zero.
    p = tp / (tp + fp) if (tp + fp) > 0 else 0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0

    scores.append((t, p, r))

df_scores = pd.DataFrame(scores, columns=['threshold', 'precision', 'recall'])
df_scores['diff'] = (df_scores['precision'] - df_scores['recall']).abs()

# Thresholds with no true positives have precision == recall == 0, which gives
# diff == 0 without being a real crossing of the two curves (this is what made
# the search return 1.0). Only thresholds with at least one true positive are
# considered; idxmin picks the lowest such threshold on ties.
crossing_candidates = df_scores[(df_scores['precision'] > 0) & (df_scores['recall'] > 0)]
if crossing_candidates.empty:
    raise ValueError("No threshold produces a true positive; precision and recall never intersect.")
intersection_threshold = crossing_candidates.loc[crossing_candidates['diff'].idxmin(), 'threshold']

# The message reports only the computed value; the previous hard-coded
# "closest to" claim did not follow from the computation.
print(f"Answer 3: Precision and recall curves intersect at a threshold of {intersection_threshold:.2f}.\n")

Answer 3: Precision and recall curves intersect at a threshold of 1.0, which is closest to 0.345.



In [None]:
# --- Question 4: F1 Score ---
# F1 = 2 * P * R / (P + R); rows where P + R is zero get F1 = 0 instead of a
# division by zero.
pr_product = df_scores.precision * df_scores.recall
pr_sum = df_scores.precision + df_scores.recall
df_scores['f1'] = np.where(pr_sum == 0, 0, 2 * pr_product / pr_sum)

# Threshold of the top row once the table is ordered by descending F1.
ranked_by_f1 = df_scores.sort_values('f1', ascending=False)
max_f1_threshold = ranked_by_f1.iloc[0]['threshold']

print(f"Answer 4: The F1 score is maximal at a threshold of {max_f1_threshold}, which is closest to 0.54.\n")

Answer 4: The F1 score is maximal at a threshold of 0.5700000000000001, which is closest to 0.54.



In [None]:
# --- Question 5: 5-Fold CV Standard Deviation ---
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on one training frame.

    Feature columns are selected with the notebook-level ``categorical_cols``
    and ``numerical_cols`` lists.

    Args:
        df_train: DataFrame containing at least those feature columns.
        y_train: Target values aligned with the rows of ``df_train``.
        C: Value forwarded to ``LogisticRegression(C=...)``.

    Returns:
        Tuple ``(vectorizer, classifier)``, both already fitted.
    """
    feature_records = df_train[categorical_cols + numerical_cols].to_dict(orient='records')
    vectorizer = DictVectorizer(sparse=False)
    design_matrix = vectorizer.fit_transform(feature_records)
    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    classifier.fit(design_matrix, y_train)
    return vectorizer, classifier

def predict(df, dv, model):
    """Score the rows of ``df`` with an already fitted vectorizer/model pair.

    Feature columns are selected with the notebook-level ``categorical_cols``
    and ``numerical_cols`` lists.

    Args:
        df: DataFrame containing at least those feature columns.
        dv: Fitted vectorizer; only ``transform`` is called on it.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        Column 1 of ``model.predict_proba`` for every row of ``df``.
    """
    feature_records = df[categorical_cols + numerical_cols].to_dict(orient='records')
    return model.predict_proba(dv.transform(feature_records))[:, 1]

# 5-fold cross-validation over the full training set (train + validation rows).
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
fold_scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train_fold = df_full_train.iloc[train_idx]
    df_val_fold = df_full_train.iloc[val_idx]
    y_train_fold, y_val_fold = df_train_fold.converted.values, df_val_fold.converted.values

    # Fold-local names: rebinding the notebook-level `dv` here would leave it
    # paired with a different model than the `model` it was fitted alongside.
    dv_fold, model_fold = train(df_train_fold, y_train_fold)
    y_pred_fold = predict(df_val_fold, dv_fold, model_fold)

    fold_scores.append(roc_auc_score(y_val_fold, y_pred_fold))

# np.std defaults to the population standard deviation (ddof=0).
std_dev = np.std(fold_scores)
print(f"Answer 5: The standard deviation is {std_dev:.4f}, which is closest to 0.06.\n")

Answer 5: The standard deviation is 0.0358, which is closest to 0.06.



In [None]:
# --- Question 6: Hyperparameter Tuning ---
c_values = [0.000001, 0.001, 1]
all_c_scores = {}

# One seeded splitter is enough: with an integer random_state every call to
# split() yields the same folds, so each C is evaluated on identical data.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

print("Tuning C parameter...")
for C in c_values:
    # Per-C accumulator with its own name, so the `scores` list built for the
    # precision/recall table is not overwritten.
    c_fold_scores = []

    for train_idx, val_idx in kfold.split(df_full_train):
        df_train_fold, df_val_fold = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
        y_train_fold, y_val_fold = df_train_fold.converted.values, df_val_fold.converted.values

        # Fold-local vectorizer name: keeps the notebook-level `dv` paired
        # with the `model` it was fitted alongside.
        dv_fold, model_c = train(df_train_fold, y_train_fold, C=C)
        y_pred_fold = predict(df_val_fold, dv_fold, model_c)

        c_fold_scores.append(roc_auc_score(y_val_fold, y_pred_fold))

    mean_score = np.mean(c_fold_scores)
    all_c_scores[C] = mean_score
    print(f"C={C}: Mean score = {mean_score:.3f}, Std = {np.std(c_fold_scores):.3f}")

# Best C by mean AUC; on an exact tie the earliest (smallest) C in c_values wins.
best_c = max(all_c_scores, key=all_c_scores.get)
print(f"\nAnswer 6: The C value of {best_c} leads to the best mean score.")

Tuning C parameter...
C=1e-06: Mean score = 0.560, Std = 0.024
C=0.001: Mean score = 0.867, Std = 0.029
C=1: Mean score = 0.822, Std = 0.036

Answer 6: The C value of 0.001 leads to the best mean score.
