## Lead Scoring: Missing Values Handling

This notebook downloads the dataset, inspects missing values, and imputes them:
- Categorical features → 'NA'
- Numerical features → 0.0


In [7]:
import pandas as pd
import numpy as np


In [8]:
# Download data
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
!wget -q $url -O course_lead_scoring.csv

# Load
df = pd.read_csv('course_lead_scoring.csv')
df.head()


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [9]:
# Missing values overview
missing_counts = df.isna().sum().sort_values(ascending=False)
print('Columns with missing values:', missing_counts[missing_counts > 0].to_dict())
print('Total missing:', int(missing_counts.sum()))


Columns with missing values: {'annual_income': 181, 'industry': 134, 'lead_source': 128, 'employment_status': 100, 'location': 63}
Total missing: 606


In [10]:
# Impute: categorical -> 'NA', numerical -> 0.0
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Replace
if categorical_cols:
    df[categorical_cols] = df[categorical_cols].fillna('NA')
if numeric_cols:
    df[numeric_cols] = df[numeric_cols].fillna(0.0)

# Verify
missing_after = df.isna().sum().sum()
print('Total missing after imputation:', int(missing_after))

# Show sample
df.head()


Total missing after imputation: 0


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [11]:
# Mode of 'industry'
if 'industry' in df.columns:
    mode_vals = df['industry'].mode(dropna=True)
    if len(mode_vals) > 0:
        print('Mode for industry:', str(mode_vals.iloc[0]))
    else:
        print('No mode for industry (empty after dropna).')
else:
    print("Column 'industry' not found.")


Mode for industry: retail


In [16]:
# Correlation matrix (numeric features) and strongest pair
numeric_cols = df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) < 2:
    print('Not enough numeric features to compute correlations.')
else:
    corr = df[numeric_cols].corr(method='pearson')
    # Find the pair with the largest absolute correlation (exclude diagonal)
    corr_abs = corr.abs()
    np.fill_diagonal(corr_abs.values, 0.0)
    max_idx = corr_abs.stack().idxmax()
    max_val = corr_abs.loc[max_idx]
    print('Top correlated pair:', corr_abs.stack())
    #print('Correlation:', corr.loc[max_idx], 4)


Top correlated pair: number_of_courses_viewed  number_of_courses_viewed    0.000000
                          annual_income               0.009770
                          interaction_count           0.023565
                          lead_score                  0.004879
                          converted                   0.435914
annual_income             number_of_courses_viewed    0.009770
                          annual_income               0.000000
                          interaction_count           0.027036
                          lead_score                  0.015610
                          converted                   0.053131
interaction_count         number_of_courses_viewed    0.023565
                          annual_income               0.027036
                          interaction_count           0.000000
                          lead_score                  0.009888
                          converted                   0.374573
lead_score                number_o

In [17]:
# Correlation table for selected pairs
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

# ensure columns exist and are numeric for correlation
present_pairs = []
for a, b in pairs:
    if a in df.columns and b in df.columns:
        a_vals = pd.to_numeric(df[a], errors='coerce')
        b_vals = pd.to_numeric(df[b], errors='coerce')
        corr = a_vals.corr(b_vals, method='pearson')
        present_pairs.append({'feature_a': a, 'feature_b': b, 'pearson_corr': round(float(corr), 4)})
    else:
        present_pairs.append({'feature_a': a, 'feature_b': b, 'pearson_corr': None})

corr_table = pd.DataFrame(present_pairs)
corr_table


Unnamed: 0,feature_a,feature_b,pearson_corr
0,interaction_count,lead_score,0.0099
1,number_of_courses_viewed,lead_score,-0.0049
2,number_of_courses_viewed,interaction_count,-0.0236
3,annual_income,interaction_count,0.027


In [18]:
# Split data: 60/20/20 with seed=42, ensure y not in X
from sklearn.model_selection import train_test_split

# Heuristic to pick target column (adjust if needed)
candidate_targets = ['lead_status', 'converted', 'is_enrolled', 'label', 'target']
for t in candidate_targets:
    if t in df.columns:
        y_col = t
        break
else:
    raise ValueError('Target column not found. Please set y_col to the correct target column name.')

# Features without target
y = df[y_col]
X = df.drop(columns=[y_col]).copy()

# First split: train vs temp (val+test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y if y.nunique() > 1 else None
)

# Second split: val vs test (50/50 of temp)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp if y_temp.nunique() > 1 else None
)

print({
    'train': len(X_train),
    'val': len(X_val),
    'test': len(X_test),
    'target': y_col,
})


{'train': 877, 'val': 292, 'test': 293, 'target': 'converted'}


In [19]:
# Mutual information for selected categorical variables (train only)
from sklearn.metrics import mutual_info_score

candidates = ['industry', 'location', 'lead_source', 'employment_status']

mi_scores = {}
for col in candidates:
    if col in X_train.columns:
        x = X_train[col].astype(str)
        mi = mutual_info_score(x, y_train)
        mi_scores[col] = round(float(mi), 2)
    else:
        mi_scores[col] = None

print('MI scores (train):', mi_scores)
best = max((k for k in mi_scores if mi_scores[k] is not None), key=lambda k: mi_scores[k])
print('Best categorical by MI:', best)


MI scores (train): {'industry': 0.01, 'location': 0.0, 'lead_source': 0.03, 'employment_status': 0.01}
Best categorical by MI: lead_source


In [20]:
# Logistic Regression with One-Hot Encoding; report validation accuracy
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Ensure splits exist
assert 'X_train' in globals() and 'X_val' in globals() and 'y_train' in globals() and 'y_val' in globals(), 'Run the split cell first.'

categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough'
)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

clf = Pipeline(steps=[('prep', preprocess), ('lr', model)])
clf.fit(X_train, y_train)

val_pred = clf.predict(X_val)
acc = accuracy_score(y_val, val_pred)
print('Validation accuracy:', round(float(acc), 2))


Validation accuracy: 0.68


In [21]:
# Feature elimination: drop-one evaluation with same pipeline as Q4 (no rounding)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

assert 'X_train' in globals() and 'X_val' in globals() and 'y_train' in globals() and 'y_val' in globals(), 'Run the split cell first.'

# Build baseline pipeline on full features
cat_full = X_train.select_dtypes(include=['object']).columns.tolist()
prep_full = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_full)
], remainder='passthrough')

base_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
base_clf = Pipeline([('prep', prep_full), ('lr', base_model)])
base_clf.fit(X_train, y_train)
base_acc = accuracy_score(y_val, base_clf.predict(X_val))
print('Baseline accuracy (full features):', base_acc)

# Evaluate drop-one for each feature present in X_train
all_features = X_train.columns.tolist()
acc_drop = {}

for feat in all_features:
    Xtr = X_train.drop(columns=[feat])
    Xva = X_val.drop(columns=[feat])

    cat_cols = Xtr.select_dtypes(include=['object']).columns.tolist()
    prep = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ], remainder='passthrough')

    clf = Pipeline([('prep', prep), ('lr', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))])
    clf.fit(Xtr, y_train)
    acc = accuracy_score(y_val, clf.predict(Xva))
    acc_drop[feat] = base_acc - acc

# Report drops for requested features
requested = ['industry', 'employment_status', 'lead_score']
report = {f: (acc_drop[f] if f in acc_drop else None) for f in requested}
print('Accuracy drop by feature:', report)

# Identify smallest drop among requested
available = {k: v for k, v in report.items() if v is not None}
if available:
    least = min(available, key=lambda k: available[k])
    print('Smallest drop feature:', least)
else:
    print('None of the requested features were found in X_train.')


Baseline accuracy (full features): 0.6815068493150684
Accuracy drop by feature: {'industry': -0.006849315068493178, 'employment_status': 0.0, 'lead_score': 0.006849315068493067}
Smallest drop feature: industry


In [None]:
# Regularized Logistic Regression: tune C over [0.01, 0.1, 1, 10, 100]
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

assert 'X_train' in globals() and 'X_val' in globals() and 'y_train' in globals() and 'y_val' in globals(), 'Run the split cell first.'

categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough'
)

Cs = [0.01, 0.1, 1, 10, 100]
acc_by_C = {}

for C in Cs:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    clf = Pipeline(steps=[('prep', preprocess), ('lr', model)])
    clf.fit(X_train, y_train)
    val_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    acc_by_C[C] = round(float(acc), 3)

print('Validation accuracy by C:', acc_by_C)
best_C = max(acc_by_C, key=lambda k: acc_by_C[k])
print('Best C:', best_C)
