In [2]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Widen pandas' text rendering so wide frames are not cut off when displayed.
DISPLAY_OPTIONS = {"display.max_columns": 200, "display.width": 120}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# Single seed shared by the stochastic steps of this notebook.
RANDOM_STATE = 42

# The dataset is read directly from a raw GitHub URL, so this cell needs
# network access.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

print("Shape:", df.shape)
df.head()

Shape: (1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# Separate the target from the features, classify features as numerical or
# categorical, report missing values, then impute them in place.
target_col = "converted"
feature_cols = [c for c in df.columns if c != target_col]

# Classify by "is the column numeric?" instead of `dtype == "object"`.
# Text columns are not guaranteed to have the `object` dtype (pandas can also
# store them with a dedicated string dtype); with the old check such columns
# would silently land in `num_cols` and be filled with 0.0 below.
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in feature_cols if c not in num_cols]

# Computed BEFORE imputation so the report reflects the raw data.
missing_counts = df.isna().sum().sort_values(ascending=False)
print("Missing values per column (top 10):")
print(missing_counts.head(10))

# Imputation uses constants only ("NA" / 0.0), so nothing is learned from the
# data here. NOTE: this overwrites `df` in place; the raw values are gone after
# this cell runs.
df[cat_cols] = df[cat_cols].fillna("NA")
df[num_cols] = df[num_cols].fillna(0.0)

assert df[cat_cols].isna().sum().sum() == 0, "Still have NA in categorical"
assert df[num_cols].isna().sum().sum() == 0, "Still have NA in numerical"
print("Missing values handled.")

Missing values per column (top 10):
annual_income               181
industry                    134
lead_source                 128
employment_status           100
location                     63
number_of_courses_viewed      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64
Missing values handled.


In [6]:
# Q1 — most frequent value of `industry`.
# `mode` returns a Series holding every tied mode; the first entry is taken.
industry_modes = df["industry"].mode(dropna=False)
mode_industry = industry_modes.iloc[0]
print("Q1 - Most frequent `industry`:", mode_industry)

Q1 - Most frequent `industry`: retail


In [7]:
# Q2 — correlation between selected pairs of numerical features.
num_df = df[num_cols].copy()
corr = num_df.corr()

pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# A pair whose columns are not both present in `num_df` is recorded as NaN
# rather than raising a KeyError.
available = set(num_df.columns)
pair_corrs = {
    (a, b): (corr.loc[a, b] if a in available and b in available else np.nan)
    for a, b in pairs
}

print("Pair correlations:")
for (left, right), value in pair_corrs.items():
    shown = f"{value:.4f}" if pd.notna(value) else "N/A"
    print(f"{left} & {right}: {shown}")


def _abs_corr(item):
    """Return the magnitude of the correlation in a ``(pair, value)`` item."""
    return abs(item[1])


# Rank by absolute value (sign is irrelevant for "biggest"), skipping NaN.
valid_items = [item for item in pair_corrs.items() if pd.notna(item[1])]
best_pair, best_val = max(valid_items, key=_abs_corr)
print("\nQ2 - Biggest correlation pair:", best_pair, "with corr =", round(best_val, 4))

Pair correlations:
interaction_count & lead_score: 0.0099
number_of_courses_viewed & lead_score: -0.0049
number_of_courses_viewed & interaction_count: -0.0236
annual_income & interaction_count: 0.0270

Q2 - Biggest correlation pair: ('annual_income', 'interaction_count') with corr = 0.027


In [8]:
# Two-stage stratified split: 60% train, then the remaining 40% is halved
# into validation and test.
X = df[feature_cols].copy()
y = df[target_col].astype(int).values

# Same seed for both stages.
split_seed = {"random_state": RANDOM_STATE}

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, **split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, **split_seed
)

print("Shapes:")
print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)
# Mean of a 0/1 target = share of positive rows in the training set.
print("Target balance (train):", y_train.mean())

Shapes:
Train: (877, 8) Val: (292, 8) Test: (293, 8)
Target balance (train): 0.6191562143671607


In [9]:
# Q3 — mutual information between each categorical feature and the target,
# computed on the training split only.
X_train_cat = X_train[cat_cols].copy()

# Integer-code every categorical column; one fitted encoder is kept per column.
encoders = {c: LabelEncoder().fit(X_train_cat[c]) for c in cat_cols}
X_train_cat_enc = pd.DataFrame(
    {c: encoders[c].transform(X_train_cat[c]) for c in cat_cols},
    index=X_train_cat.index,
)

# discrete_features=True: the integer codes are category ids, not magnitudes.
mi_scores = mutual_info_classif(
    X_train_cat_enc.values,
    y_train,
    discrete_features=True,
    random_state=RANDOM_STATE,
)

mi_by_col = dict(zip(cat_cols, mi_scores))

options_q3 = ["industry", "location", "lead_source", "employment_status"]
# Scores are rounded first, so the winner below is picked on rounded values
# and ties after rounding are all reported.
filtered_mi = {k: round(mi_by_col.get(k, np.nan), 2) for k in options_q3}

print("Mutual Information (rounded to 2 decimals) for selected vars:")
for name, score in filtered_mi.items():
    print(f"{name}: {score}")

max_mi = np.nanmax(list(filtered_mi.values()))
best_vars = [name for name, score in filtered_mi.items() if score == max_mi]
print("\nQ3 - Variable(s) with biggest MI:", best_vars, "with score", max_mi)

Mutual Information (rounded to 2 decimals) for selected vars:
industry: 0.01
location: 0.0
lead_source: 0.03
employment_status: 0.01

Q3 - Variable(s) with biggest MI: ['lead_source'] with score 0.03


In [10]:
# Q4 — one-hot encode the categoricals, pass numerics through unchanged,
# fit a logistic regression and score it on the validation split.
numeric_features, categorical_features = num_cols, cat_cols

# handle_unknown="ignore": categories unseen during fit encode as all zeros
# instead of raising at predict time.
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", one_hot, categorical_features),
    ]
)

logreg_params = {
    "solver": "liblinear",
    "C": 1.0,
    "max_iter": 1000,
    "random_state": RANDOM_STATE,
}
logreg = LogisticRegression(**logreg_params)

pipe = Pipeline(steps=[("preprocess", preprocess), ("model", logreg)])
pipe.fit(X_train, y_train)

y_val_pred = pipe.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)

print("Q4 - Validation Accuracy:", round(val_acc, 2))

Q4 - Validation Accuracy: 0.68


In [11]:

# Q5 — leave-one-feature-out elimination: refit the Q4 model without each
# candidate feature and compare its validation accuracy with the baseline.
baseline_acc = accuracy_score(y_val, pipe.predict(X_val))

features_to_test = ["industry", "employment_status", "lead_score"]
diffs = {}

for feat in features_to_test:
    # Drop `feat` from whichever list it belongs to; columns not listed in the
    # ColumnTransformer are discarded (default remainder="drop").
    reduced_num = [c for c in numeric_features if c != feat]
    reduced_cat = [c for c in categorical_features if c != feat]

    preprocess_reduced = ColumnTransformer(
        transformers=[
            ("num", "passthrough", reduced_num),
            ("cat", OneHotEncoder(handle_unknown="ignore"), reduced_cat),
        ]
    )

    # Fit an unfitted copy of `logreg`, not `logreg` itself. `logreg` is the
    # final step of `pipe`, and Pipeline.fit fits its steps in place; refitting
    # it here on fewer columns would leave `pipe` with a model that no longer
    # matches its own preprocessor (and make this cell fail when re-run).
    pipe_reduced = Pipeline(
        steps=[("preprocess", preprocess_reduced), ("model", clone(logreg))]
    )
    pipe_reduced.fit(X_train, y_train)
    acc_reduced = accuracy_score(y_val, pipe_reduced.predict(X_val))
    diffs[feat] = baseline_acc - acc_reduced

print("Baseline val accuracy (all features):", round(baseline_acc, 4))
print("Accuracy differences (baseline - without feature):")
for k, v in diffs.items():
    print(f"{k}: {round(v, 4)}")

# The minimum is taken over SIGNED differences, so a negative value (accuracy
# improved after removing the feature) ranks below zero.
min_feat = min(diffs, key=lambda k: diffs[k])
print("\nQ5 - Feature with smallest difference:", min_feat, " (diff =", round(diffs[min_feat], 4), ")")


Baseline val accuracy (all features): 0.6815
Accuracy differences (baseline - without feature):
industry: -0.0068
employment_status: 0.0
lead_score: 0.0068

Q5 - Feature with smallest difference: industry  (diff = -0.0068 )


In [12]:
# Q6 — sweep the inverse regularisation strength C and keep the value with the
# best validation accuracy.
Cs = [0.01, 0.1, 1, 10, 100]
results = []

for C in Cs:
    model = LogisticRegression(
        solver="liblinear",
        C=C,
        max_iter=1000,
        random_state=RANDOM_STATE,
    )
    # The Q4 `preprocess` transformer is reused and refitted on the same
    # training data at every iteration.
    pipeC = Pipeline(steps=[("preprocess", preprocess), ("model", model)])
    pipeC.fit(X_train, y_train)
    acc = accuracy_score(y_val, pipeC.predict(X_val))
    results.append((C, acc))

print("C grid results (rounded to 3 decimals):")
for C, acc in results:
    print(f"C={C}: {round(acc, 3)}")

# Highest accuracy wins; ties are broken in favour of the smallest C.
best_acc = max(results, key=lambda pair: pair[1])[1]
best_candidates = [c_value for c_value, score in results if score == best_acc]
best_C = min(best_candidates)

print(f"\nQ6 - Best C on validation set: {best_C} (accuracy = {round(best_acc, 3)})")


C grid results (rounded to 3 decimals):
C=0.01: 0.688
C=0.1: 0.682
C=1: 0.682
C=10: 0.682
C=100: 0.682

Q6 - Best C on validation set: 0.01 (accuracy = 0.688)
