In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [3]:
df.shape

(1462, 9)

In [4]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

# ------------------------
# Data preparation
# ------------------------

In [7]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [8]:
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [9]:
df[categorical].nunique()


lead_source          5
industry             7
employment_status    4
location             7
dtype: int64

In [None]:
# Replace missing values with a constant per column group:
# the string 'NA' for categorical columns and 0.0 for numerical columns.
for cols, fill_value in ((categorical, 'NA'), (numerical, 0.0)):
    df[cols] = df[cols].fillna(fill_value)




In [11]:
# Confirm no missing
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

# ------------------------
# Question 1
# ------------------------

In [12]:
# Series.mode() returns every value tied for most frequent; the first entry
# is reported as the single answer.
industry_mode = df['industry'].mode()
industry_mode_value = industry_mode.iloc[0]

print("Q1: industry mode(s):", industry_mode.to_list())
print("Q1 answer (most frequent):", industry_mode_value)
print()

Q1: industry mode(s): ['retail']
Q1 answer (most frequent): retail



# ------------------------
# Question 2
# ------------------------

In [13]:
# Pairwise correlations restricted to the numerical feature columns.
numerical_df = df[numerical].copy()
corr = numerical_df.corr()
# Title, matrix, then one blank line.
print("Numerical features correlation matrix:", corr, "", sep="\n")

Numerical features correlation matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  



In [14]:
# Look up the correlation of each pair named in the question.
# A pair whose columns are absent from the matrix is recorded as NaN.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

pair_corrs = {
    (a, b): (corr.loc[a, b] if (a in corr.index and b in corr.columns) else np.nan)
    for a, b in pairs
}

print("Q2: specified pair correlations:")
for pair, value in pair_corrs.items():
    print(f"  {pair}: {value}")

# Rank by absolute correlation; NaN entries sort below every real value.
best_pair = max(
    pair_corrs.items(),
    key=lambda item: -1 if np.isnan(item[1]) else abs(item[1]),
)
print("Q2 answer (biggest correlation among listed pairs):", best_pair[0], "corr =", best_pair[1])
print()

Q2: specified pair correlations:
  ('interaction_count', 'lead_score'): 0.009888182496913131
  ('number_of_courses_viewed', 'lead_score'): -0.004878998354681276
  ('number_of_courses_viewed', 'interaction_count'): -0.023565222882888037
  ('annual_income', 'interaction_count'): 0.02703647240481443
Q2 answer (biggest correlation among listed pairs): ('annual_income', 'interaction_count') corr = 0.02703647240481443



# ------------------------
# Split the data
# ------------------------

In [58]:
from sklearn.model_selection import train_test_split

# First hold out 20% of the rows as the test set, stratified on the target.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['converted'],
)

# Then take 25% of the remaining 80% for validation (20% of all rows),
# leaving 60% for training.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
    stratify=df_full_train['converted'],
)

len(df_train), len(df_val), len(df_test)



(876, 293, 293)

In [59]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1015,social_media,other,4,53556.0,self_employed,africa,2,0.18,0
693,referral,healthcare,3,66872.0,unemployed,,3,0.03,1
141,events,healthcare,3,60375.0,student,australia,9,0.30,1
1206,,manufacturing,3,51271.0,unemployed,europe,3,0.60,1
529,social_media,healthcare,1,57537.0,student,south_america,2,0.28,0
...,...,...,...,...,...,...,...,...,...
1129,,other,1,71395.0,student,,5,0.83,1
957,,education,3,89042.0,employed,asia,4,0.75,1
133,paid_ads,other,2,0.0,unemployed,north_america,4,0.18,0
1119,social_media,finance,2,43899.0,self_employed,australia,2,0.39,0


In [60]:
# Discard the shuffled original row labels so each split is indexed 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [61]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,social_media,other,4,53556.0,self_employed,africa,2,0.18,0
1,referral,healthcare,3,66872.0,unemployed,,3,0.03,1
2,events,healthcare,3,60375.0,student,australia,9,0.30,1
3,,manufacturing,3,51271.0,unemployed,europe,3,0.60,1
4,social_media,healthcare,1,57537.0,student,south_america,2,0.28,0
...,...,...,...,...,...,...,...,...,...
871,,other,1,71395.0,student,,5,0.83,1
872,,education,3,89042.0,employed,asia,4,0.75,1
873,paid_ads,other,2,0.0,unemployed,north_america,4,0.18,0
874,social_media,finance,2,43899.0,self_employed,australia,2,0.39,0


In [None]:
# Pull the target out of each split as a NumPy array and remove the column
# from the feature frame in the same step, so the label cannot end up among
# the model inputs.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

# ------------------------
# Question 3
# ------------------------

In [20]:
from sklearn.metrics import mutual_info_score


def mutual_info_converted_score(series):
    """Return the mutual information between ``series`` and the target.

    The target is read from the notebook-level ``df_full_train.converted``,
    so ``series`` must be a column of ``df_full_train`` (same rows, same order).
    """
    target = df_full_train.converted
    return mutual_info_score(series, target)


# NOTE(review): scores are computed on df_full_train (train + validation),
# not df_train — confirm this is the intended subset for Question 3.
mi = df_full_train[categorical].apply(mutual_info_converted_score)
mi = mi.round(2)
mi.sort_values(ascending=False)

lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

# ------------------------
# Question 4
# ------------------------


In [64]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {feature: value} record.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
val_dict = df_val[categorical + numerical].to_dict(orient='records')

# Learn the encoding on the training records only, then apply the same
# encoding to the validation records.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)


In [65]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier settings used for Question 4.
log_reg_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model = LogisticRegression(**log_reg_params)
# fit() returns the estimator itself, so the cell displays the fitted model.
model.fit(X_train, y_train)





0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [66]:
model.intercept_[0]


-0.09245220977344637

In [41]:
model.coef_[0].round(3)


array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.103, -0.025,  0.049,
       -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.311,  0.051,
        0.02 , -0.012, -0.012, -0.115,  0.08 , -0.03 ,  0.004, -0.011,
       -0.011, -0.006,  0.008,  0.006, -0.033, -0.025,  0.454])

In [42]:
# Probability of the positive class (converted == 1) for every validation row.
# Computed here so the cell does not depend on a `y_pred` left over from an
# earlier kernel session (a fresh Restart & Run All would raise NameError).
val_proba = model.predict_proba(X_val)[:, 1]

# Hard decision at the conventional 0.5 probability threshold.
converted_decision = (val_proba >= 0.5)


In [70]:
# Accuracy = share of validation rows whose predicted class equals the label.
y_pred = model.predict(X_val)
acc = np.mean(y_pred == y_val)
print("Validation accuracy:", round(acc, 2))


Validation accuracy: 0.73


In [28]:
y_val

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0])

In [68]:
# Inspect the last validation record. The records built for the vectorizer
# are stored in `val_dict`; `dicts_val` is never defined in this notebook.
client = val_dict[-1]
client

{'lead_source': 'paid_ads',
 'industry': 'finance',
 'employment_status': 'NA',
 'location': 'south_america',
 'number_of_courses_viewed': 1,
 'annual_income': 47129.0,
 'interaction_count': 1,
 'lead_score': 0.93}

In [69]:
y_val[-1]

1

# ------------------------
# Question 5
# ------------------------

In [74]:
df_full_train.head()


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1391,events,finance,2,19529.0,employed,europe,1,0.87,0
466,social_media,,0,26438.0,self_employed,,1,0.98,0
646,referral,finance,3,45841.0,employed,north_america,3,0.34,1
1398,organic_search,technology,1,65013.0,employed,asia,2,0.53,0
19,paid_ads,,2,62809.0,self_employed,europe,5,0.33,1


In [72]:
# --- Function to train + evaluate a model given feature list ---
def train_and_eval(feature_list, C=1.0):
    """Train a logistic regression on ``feature_list`` and return validation accuracy.

    Relies on the notebook-level ``df_train``, ``df_val``, ``y_train`` and
    ``y_val`` created in the data-split section.

    Parameters
    ----------
    feature_list : list of str
        Column names of ``df_train`` / ``df_val`` to use as model inputs.
    C : float, default 1.0
        Inverse regularization strength passed to ``LogisticRegression``.
        The default reproduces the previously hard-coded value.

    Returns
    -------
    float
        Fraction of validation rows whose predicted class equals ``y_val``.
    """
    # A fresh vectorizer per call: the encoded columns depend on feature_list.
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[feature_list].to_dict(orient='records')
    val_dict = df_val[feature_list].to_dict(orient='records')

    # Fit the encoding on training records only, then reuse it for validation.
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    return (y_pred == y_val).mean()


In [73]:
# --- Reference accuracy with every feature included ---
all_features = categorical + numerical
base_acc = train_and_eval(all_features)
print(f"Base accuracy: {base_acc:.4f}")

# --- Retrain with one candidate feature left out at a time ---
candidates = ['industry', 'employment_status', 'lead_score']
results = {}
for dropped in candidates:
    kept = [name for name in all_features if name != dropped]
    acc_without = train_and_eval(kept)
    # Positive diff: accuracy fell without the feature; negative: it rose.
    results[dropped] = {'acc_without': acc_without, 'diff': base_acc - acc_without}
    print(f"{dropped:20s} | acc: {acc_without:.4f} | diff: {results[dropped]['diff']:.4f}")

# --- The feature with the smallest (signed) difference is reported ---
least_useful = min(results.items(), key=lambda item: item[1]['diff'])
print("\nLeast useful feature:", least_useful[0])

Base accuracy: 0.7304
industry             | acc: 0.7304 | diff: 0.0000
employment_status    | acc: 0.7338 | diff: -0.0034
lead_score           | acc: 0.7304 | diff: 0.0000

Least useful feature: employment_status


# ------------------------
# Question 6
# ------------------------

Comparison with manual regularized linear regression

In [75]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a ridge-regularized linear regression with the normal equations.

    Solves ``(A^T A + r * I) w = A^T y`` where ``A`` is ``X`` with a leading
    column of ones for the intercept. ``r`` is added to every diagonal entry,
    so the intercept is regularized along with the feature weights.

    Parameters
    ----------
    X : array-like of shape (n_samples,) or (n_samples, n_features)
        Feature matrix (no bias column).
    y : array-like of shape (n_samples,)
        Target values.
    r : float, default 0.001
        Regularization strength added to the diagonal of ``A^T A``.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the array of feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized system is singular (possible when ``r`` is 0).
    """
    # Accept lists / DataFrames as well as arrays.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    ones = np.ones(X.shape[0])
    A = np.column_stack([ones, X])

    gram = A.T.dot(A) + r * np.eye(A.shape[1])
    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, better numerical stability.
    w_full = np.linalg.solve(gram, A.T.dot(y))
    return w_full[0], w_full[1:]


This is ridge regression (L2-regularized least squares): r is the regularization strength, added to every diagonal entry of XᵀX before the system is solved.

In Ridge: r ↑ → more penalty → smaller weights.

In Logistic Regression: C ↓ → more penalty → smaller weights.

So conceptually:

r ∝ 1 / C

Thus, regularization plays the same role in both models — shrinking the weights — even though one is a regression model (MSE loss) and the other a classifier (log loss).

In [77]:
# ---------------------------------------
# SWEEP THE INVERSE REGULARIZATION STRENGTH C
# ---------------------------------------
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    y_pred = model.fit(X_train, y_train).predict(X_val)
    acc = np.mean(y_pred == y_val)
    accuracies[C] = acc
    print(f"C={C:<5} -> Validation Accuracy: {acc:.3f}")

# ---------------------------------------
# PICK THE C WITH THE HIGHEST VALIDATION ACCURACY
# (on a tie, max() keeps the first one encountered, i.e. the smallest C)
# ---------------------------------------
best_C = max(accuracies, key=lambda c: accuracies[c])
print(f"\nBest C: {best_C} (Accuracy={accuracies[best_C]:.3f})")

C=0.01  -> Validation Accuracy: 0.734
C=0.1   -> Validation Accuracy: 0.730
C=1     -> Validation Accuracy: 0.730
C=10    -> Validation Accuracy: 0.730
C=100   -> Validation Accuracy: 0.730

Best C: 0.01 (Accuracy=0.734)
