In [181]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [182]:
# Download dataset (run only once!)
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

In [183]:
# Load the lead-scoring CSV (expected in the current working directory)
df = pd.read_csv(filepath_or_buffer='course_lead_scoring.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [184]:
# Data preparation
# Count the missing values in every column; any non-zero count marks a column
# that has to be filled before modelling.
# (A bare `df.dtypes` line used to sit here, but only the last expression of a
# cell is displayed, so it never produced any output.)
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [185]:
# Impute missing values: categorical (object) columns get the placeholder 'NA';
# whatever is still missing afterwards (the numerical columns) is replaced with 0.0
categorical_fill = dict.fromkeys(df.select_dtypes('object').columns, 'NA')
df = df.fillna(categorical_fill).fillna(0.0)
# Show the remaining null count per column
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [186]:
# Question 1: most frequent observation (mode) of the `industry` column.
# `mode()` returns a Series of all modes; the first entry is taken.
industry_modes = df['industry'].mode()
industry_modes.iloc[0]

'retail'

In [187]:
# Question 2: correlation matrix of the numerical columns, i.e. the correlation
# coefficient between every pair of them.
# What are the two features that have the biggest correlation?
# -> annual_income and interaction_count
# Note: the target `converted` is numeric as well, so it shows up in the matrix.
num_df = df.select_dtypes(include='number')
corr = num_df.corr(method='pearson')
corr

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [188]:
# 60%/20%/20% train/validation/test split with scikit-learn's train_test_split, seed 42.
# The target `converted` is separated out so it is not part of the feature frame.
y = df['converted']
X = df.drop(columns=['converted'])

# First set 40% of the rows aside, then halve that part into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fraction of the full dataset that ended up in each split
tuple(len(part) / len(df) for part in (X_train, X_val, X_test))


(0.5998632010943913, 0.1997264021887825, 0.20041039671682626)

In [189]:
# Question 3: mutual information score between y and each categorical variable.
# Only the training set is used; every score is rounded with round(score, 2).
cat_cols = X_train.select_dtypes(include='object').columns

mi_scores = {
    col: round(mutual_info_score(X_train[col], y_train), 2)
    for col in cat_cols
}

mi_scores


{'lead_source': 0.03,
 'industry': 0.02,
 'employment_status': 0.02,
 'location': 0.0}

In [190]:
# Question 4:
'''
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
'''
# Categorical columns are one-hot encoded; every other column is passed through unchanged
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='passthrough',
)

# The encoder is fitted on the training split only and then applied to the other splits
X_train_enc = preprocessor.fit_transform(X_train)
X_val_enc, X_test_enc = (preprocessor.transform(split) for split in (X_val, X_test))

# Logistic regression with the prescribed parameters (`fit` returns the estimator itself)
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train_enc, y_train)

# Validation predictions and their (unrounded) accuracy; the rounded value is displayed
y_val_pred = model.predict(X_val_enc)
accuracy = accuracy_score(y_val, y_val_pred)
round(accuracy, 2)

0.74

In [191]:
# Question 5
'''
Let's find the least useful feature using the feature elimination technique.
Train a model using the same features and parameters as in Q4 (without rounding).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?
'''


def _accuracy_without(dropped):
    """Return the validation accuracy of a Q4-style model retrained without column `dropped`."""
    # Only the categorical columns that remain after the drop are one-hot encoded
    remaining_cat = [name for name in cat_cols if name != dropped]
    encoder = ColumnTransformer(
        [('cat', OneHotEncoder(handle_unknown='ignore'), remaining_cat)],
        remainder='passthrough',
    )

    train_matrix = encoder.fit_transform(X_train.drop(columns=[dropped]))
    val_matrix = encoder.transform(X_val.drop(columns=[dropped]))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(train_matrix, y_train)
    return accuracy_score(y_val, clf.predict(val_matrix))


# Leave-one-feature-out: baseline accuracy (unrounded, from the Q4 cell) minus the
# accuracy obtained without each feature
diffs = {feat: accuracy - _accuracy_without(feat) for feat in X_train.columns}


# Report the accuracy difference for the features the question asks about
focus = ['industry', 'employment_status', 'lead_score']
for name in focus:
    print(f"{name}: {diffs[name]}")

industry: 0.0
employment_status: -0.003424657534246589
lead_score: 0.0


In [193]:
# Question 6
'''
Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?
Note: If there are multiple options, select the smallest C
'''

# Try different values of C for regularized logistic regression.
# The encoded matrices X_train_enc / X_val_enc come from the Q4 cell.
C_values = [0.01, 0.1, 1, 10, 100]
val_accuracies = {}

for c in C_values:
    # Sweep-specific names (same idea as the `_sub` names in the Q5 loop) so that the
    # Q4 baseline `model` / `y_val_pred` are not silently overwritten by this loop.
    model_c = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model_c.fit(X_train_enc, y_train)
    y_val_pred_c = model_c.predict(X_val_enc)
    acc = accuracy_score(y_val, y_val_pred_c)
    val_accuracies[c] = round(acc, 3)

# Answer the question in code instead of reading it off the dict:
# highest rounded validation accuracy, ties resolved in favour of the smallest C.
best_accuracy = max(val_accuracies.values())
best_C = min(c_value for c_value, score in val_accuracies.items() if score == best_accuracy)

val_accuracies


{0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}