In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score, accuracy_score

# Data Loading and Preparation

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 15:26:26--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-13 15:26:27 (59.7 MB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]



In [3]:
# Read the lead-scoring CSV into a DataFrame
df = pd.read_csv('course_lead_scoring.csv')

# Split the column names by dtype: object columns are treated as categorical,
# numeric columns as numerical
categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# 'converted' is the target, so it is kept out of the numerical feature list
numerical_cols.remove('converted')

# Build one column -> fill value mapping ('NA' for categorical, 0.0 for
# numerical) and impute every feature column in a single call
fill_values = {
    **dict.fromkeys(categorical_cols, 'NA'),
    **dict.fromkeys(numerical_cols, 0.0),
}
df = df.fillna(fill_values)

print("Data preparation complete. Missing values handled.\n")

Data preparation complete. Missing values handled.



In [4]:
# --- Question 1: Most frequent observation for 'industry' ---
# mode() returns a Series of the most frequent value(s); take the first entry
industry_modes = df['industry'].mode()
industry_mode = industry_modes.iloc[0]
print(f"Answer 1: The mode for the 'industry' column is '{industry_mode}'.\n")

Answer 1: The mode for the 'industry' column is 'retail'.



In [5]:
# --- Question 2: Biggest correlation pair ---
# Pairwise correlation matrix of the numerical features
correlation_matrix = df[numerical_cols].corr()

# The specified pairs to compare, in reporting order
candidate_pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

# Look up each pair once so the report and the answer use the same numbers
pair_correlations = {
    (first, second): correlation_matrix.loc[first, second]
    for first, second in candidate_pairs
}

for (first, second), value in pair_correlations.items():
    print(f"Correlation between '{first}' and '{second}': {value:.4f}")

# Derive the answer from the data instead of hard-coding it.
# "Biggest" is taken as the strongest relationship, i.e. the largest absolute
# correlation, so a strong negative correlation is not overlooked.
best_first, best_second = max(
    pair_correlations, key=lambda pair: abs(pair_correlations[pair])
)
print(f"\nAnswer 2: The pair with the biggest correlation is '{best_first}' and '{best_second}'.\n")

Correlation between 'interaction_count' and 'lead_score': 0.0099
Correlation between 'number_of_courses_viewed' and 'lead_score': -0.0049
Correlation between 'number_of_courses_viewed' and 'interaction_count': -0.0236
Correlation between 'annual_income' and 'interaction_count': 0.0270

Answer 2: The pair with the biggest correlation is 'annual_income' and 'interaction_count'.



In [6]:
# --- Split the data ---
# 60%/20%/20% train/val/test: hold out 20% for test first, then take 25% of
# the remaining 80% (0.25 * 0.8 = 0.2 of the total) for validation
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Discard the shuffled original index so each split is numbered from 0
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Extract the target from each split; pop() returns the column and removes it
# from the frame in one step, leaving only features behind
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

print("Data splitting complete.\n")

Data splitting complete.



In [7]:
# --- Question 3: Mutual Information Score ---
def mutual_info_converted_score(series):
    """Return the mutual information between `series` and the training target `y_train`."""
    return mutual_info_score(series, y_train)

# Score every categorical column against the target, order from highest to
# lowest, then round for display (sorting happens before rounding, so the
# ranking reflects the unrounded scores)
mi_scores = (
    df_train[categorical_cols]
    .apply(mutual_info_converted_score)
    .sort_values(ascending=False)
    .round(2)
)
top_feature = mi_scores.index[0]

print("Mutual Information Scores:\n", mi_scores)
print(f"\nAnswer 3: The variable with the biggest mutual information score is '{top_feature}'.\n")

Mutual Information Scores:
 lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

Answer 3: The variable with the biggest mutual information score is 'lead_source'.



In [13]:
# --- Question 4: Logistic Regression Accuracy ---
# Turn each row into a {column: value} record for the vectorizer
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# One-hot encode the features; the vectorizer is fitted on the training
# records only and then reused as-is for the validation records
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Fit the classifier (fit() returns the estimator itself)
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation predictions, rounded to two decimals for the answer
y_pred = model.predict(X_val)
accuracy = round(accuracy_score(y_val, y_pred), 2)
print(f"Answer 4: The accuracy on the validation dataset is {accuracy}.\n")

Answer 4: The accuracy on the validation dataset is 0.7.



In [12]:
# --- Question 5: Feature Elimination ---
# Baseline: unrounded validation accuracy of the predictions held in y_pred
original_accuracy = accuracy_score(y_val, y_pred)
features = categorical_cols + numerical_cols
accuracies_diff = {}

print("Running feature elimination...")
for feature in features:
    # Every feature except the one currently being evaluated
    subset_features = [name for name in features if name != feature]

    # Re-encode with a fresh vectorizer so the dropped feature leaves no columns behind
    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(
        df_train[subset_features].to_dict(orient='records')
    )
    X_val_subset = dv_subset.transform(
        df_val[subset_features].to_dict(orient='records')
    )

    # Same hyperparameters as the baseline model, so only the feature set differs
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)
    y_pred_subset = model_subset.predict(X_val_subset)

    # Positive value: accuracy dropped without the feature; negative: it improved
    accuracy_subset = accuracy_score(y_val, y_pred_subset)
    accuracies_diff[feature] = original_accuracy - accuracy_subset

# Report the differences for the features in the options.
# A missing feature is printed as N/A explicitly: applying the ':.6f' format
# to a string fallback would raise ValueError.
option_features = ['industry', 'employment_status', 'lead_score']
for name in option_features:
    if name in accuracies_diff:
        print(f"Difference for '{name}': {accuracies_diff[name]:.6f}")
    else:
        print(f"Difference for '{name}': N/A")

# Derive the answer from the computed differences instead of hard-coding it.
# "Smallest difference" is taken as the smallest absolute change in accuracy,
# so a feature whose removal improves accuracy still counts as a change.
available_options = [name for name in option_features if name in accuracies_diff]
if available_options:
    smallest_feature = min(available_options, key=lambda name: abs(accuracies_diff[name]))
    print(f"\nAnswer 5: The feature with the smallest difference is '{smallest_feature}'.\n")
else:
    print("\nAnswer 5: None of the option features are present in the data.\n")

Running feature elimination...
Difference for 'industry': 0.000000
Difference for 'employment_status': 0.003413
Difference for 'lead_score': -0.006826

Answer 5: The feature with the smallest difference is 'industry'.



In [10]:
# --- Question 6: Regularization with C ---
print("Testing different values of C...")
c_values = [0.01, 0.1, 1, 10, 100]
accuracy_scores = {}

for C in c_values:
    # Fit on the full encoded training matrix with this regularization strength
    model_reg = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    # Validation accuracy, rounded to three decimals before it is stored
    y_pred_reg = model_reg.predict(X_val)
    acc = round(accuracy_score(y_val, y_pred_reg), 3)
    accuracy_scores[C] = acc
    print(f"C={C}, Accuracy={acc}")

# max() returns the first maximal item, and c_values is in ascending order,
# so ties on the rounded accuracy resolve to the smallest C
best_c = max(c_values, key=lambda value: accuracy_scores[value])
print(f"\nAnswer 6: The C value that leads to the best accuracy is {best_c}.")

Testing different values of C...
C=0.01, Accuracy=0.7
C=0.1, Accuracy=0.7
C=1, Accuracy=0.7
C=10, Accuracy=0.7
C=100, Accuracy=0.7

Answer 6: The C value that leads to the best accuracy is 0.01.
