<a href="https://colab.research.google.com/github/utyabia/1-notebook/blob/main/ML_Zoomcamp_Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_classif

# Read the lead-scoring dataset into a DataFrame.
df = pd.read_csv('/content/course_lead_scoring.csv')

# Report how many values are missing in each column before imputing.
print("Missing values before processing:")
print(df.isna().sum())

# Columns grouped by how their gaps are filled.
categorical_cols = ['lead_source', 'industry', 'employment_status', 'location']
numerical_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Impute each group in one step: the placeholder string 'NA' for categorical
# columns and 0.0 for numerical columns.
df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

# Show the per-column missing counts again after imputing.
print("\nMissing values after processing:")
print(df.isna().sum())

Missing values before processing:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Missing values after processing:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [2]:
# Question 1
# Most frequent value of the 'industry' column; mode() returns a Series, and
# its first entry is the one reported.
industry_modes = df['industry'].mode()
industry_mode = industry_modes.iloc[0]
print(f"\nQuestion 1 - Most frequent industry: {industry_mode}")


Question 1 - Most frequent industry: retail


In [3]:
# Question 2
# Pairwise correlations between the numerical columns.
correlation_matrix = df[numerical_cols].corr()
print("\nQuestion 2 - Correlation matrix:")
print(correlation_matrix)

# Collect every unordered pair of distinct columns exactly once by walking
# the strict upper triangle of the matrix (row by row).
n_numerical = len(numerical_cols)
corr_pairs = [
    {
        'features': f"{numerical_cols[row]} and {numerical_cols[col]}",
        'correlation': correlation_matrix.iloc[row, col],
    }
    for row in range(n_numerical)
    for col in range(row + 1, n_numerical)
]

# Rank the pairs by correlation magnitude, strongest first.
corr_pairs_sorted = sorted(
    corr_pairs,
    key=lambda pair: abs(pair['correlation']),
    reverse=True,
)
print(f"\nTop correlated pair: {corr_pairs_sorted[0]}")


Question 2 - Correlation matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  

Top correlated pair: {'features': 'annual_income and interaction_count', 'correlation': np.float64(0.02703647240481443)}


In [4]:
# Separate the feature columns from the 'converted' target.
X = df.drop(columns='converted')
y = df['converted']

# Two-stage split: hold out 40% of the rows first, then cut that holdout in
# half to get the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the size of each partition.
print("\nData split:")
for split_name, split_frame in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name}: {len(split_frame)} samples")


Data split:
Train: 877 samples
Validation: 292 samples
Test: 293 samples


In [5]:
# Question 3 - Mutual Information
# Score each categorical feature by its mutual information with the target,
# using the training split only.
X_train_categorical = X_train[categorical_cols].copy()

# mutual_info_classif needs numeric input, so replace each category with its
# integer code. The codes are arbitrary labels, not measurements.
for col in categorical_cols:
    X_train_categorical[col] = X_train_categorical[col].astype('category').cat.codes

# discrete_features=True is required here: with the default 'auto', a dense
# input is treated as continuous, so the integer codes would be scored with the
# nearest-neighbour estimator (with random noise added) instead of as discrete
# categories.
mi_scores = mutual_info_classif(
    X_train_categorical,
    y_train,
    discrete_features=True,
    random_state=42,
)
mi_results = dict(zip(categorical_cols, mi_scores))

print("\nQuestion 3 - Mutual Information Scores:")
for feature, score in mi_results.items():
    print(f"{feature}: {round(score, 2)}")

# The feature with the largest score is the answer.
max_mi_feature = max(mi_results, key=mi_results.get)
print(f"Feature with highest MI: {max_mi_feature}")


Question 3 - Mutual Information Scores:
lead_source: 0.04
industry: 0.03
employment_status: 0.02
location: 0.02
Feature with highest MI: lead_source


In [6]:
# Question 4 - Logistic Regression with all features
# One-hot encode the categorical columns of the training split.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols)

# Encode the validation split the same way, then force it onto the training
# column layout: dummy columns absent from validation are added as 0 and
# columns unseen in training are dropped.
X_val_encoded = (
    pd.get_dummies(X_val, columns=categorical_cols)
    .reindex(columns=X_train_encoded.columns, fill_value=0)
)

# Fit the baseline classifier on the encoded training data.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_encoded, y_train)

# Score the fitted model on the validation split.
y_val_pred = model.predict(X_val_encoded)
accuracy_q4 = accuracy_score(y_val, y_val_pred)

print(f"\nQuestion 4 - Validation Accuracy: {round(accuracy_q4, 2)}")


Question 4 - Validation Accuracy: 0.74


In [7]:
# Question 5 - Feature Elimination
# Retrain the model once per candidate feature with that feature removed, and
# record how far the validation accuracy moves from the all-features baseline.
base_accuracy = accuracy_q4
feature_differences = {}

features_to_test = ['industry', 'employment_status', 'lead_score']

for feature in features_to_test:
    # Dropping a column is the same operation for numerical and categorical
    # features, so no per-type branching is needed.
    X_train_reduced = X_train.drop(columns=feature)
    X_val_reduced = X_val.drop(columns=feature)

    # Categorical columns still present after the drop; computed once and
    # shared by both splits so they are encoded identically.
    remaining_categorical = [col for col in categorical_cols if col != feature]

    # One-hot encode
    X_train_reduced_encoded = pd.get_dummies(X_train_reduced, columns=remaining_categorical)
    X_val_reduced_encoded = pd.get_dummies(X_val_reduced, columns=remaining_categorical)

    # Align validation columns to the training layout (missing dummies -> 0,
    # columns unseen in training are dropped).
    X_val_reduced_encoded = X_val_reduced_encoded.reindex(
        columns=X_train_reduced_encoded.columns, fill_value=0
    )

    # Train model with the same hyperparameters as the baseline
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced_encoded, y_train)

    # Calculate accuracy
    y_val_pred_reduced = model_reduced.predict(X_val_reduced_encoded)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    # Signed difference is printed; its magnitude is what gets compared below.
    difference = base_accuracy - accuracy_reduced
    feature_differences[feature] = abs(difference)

    print(f"Without {feature}: Accuracy = {accuracy_reduced:.4f}, Difference = {difference:.4f}")

# min() returns the first minimal entry in insertion order, so a tie would
# otherwise be resolved silently by the order of features_to_test. Surface it.
smallest_diff_value = min(feature_differences.values())
tied_features = [
    name for name, diff in feature_differences.items()
    if np.isclose(diff, smallest_diff_value)
]
if len(tied_features) > 1:
    print(
        f"\nNote: {', '.join(tied_features)} are tied for the smallest "
        f"difference ({smallest_diff_value:.4f}); reporting the first one."
    )

smallest_diff_feature = min(feature_differences, key=feature_differences.get)
print(f"\nQuestion 5 - Feature with smallest difference: {smallest_diff_feature}")

Without industry: Accuracy = 0.7432, Difference = 0.0000
Without employment_status: Accuracy = 0.7466, Difference = -0.0034
Without lead_score: Accuracy = 0.7432, Difference = 0.0000

Question 5 - Feature with smallest difference: industry


In [9]:
# Question 6 - Regularization
# Try each regularization strength C on the full encoded feature set and keep
# the one with the highest validation accuracy.
C_values = [0.01, 0.1, 1, 10, 100]
best_C, best_accuracy = None, 0

for C in C_values:
    model_reg = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    )
    model_reg.fit(X_train_encoded, y_train)

    y_val_pred_reg = model_reg.predict(X_val_encoded)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)

    print(f"C = {C}: Accuracy = {round(accuracy_reg, 3)}")

    # Only a strict improvement replaces the current best, so among equal
    # accuracies the C encountered first is kept.
    if accuracy_reg <= best_accuracy:
        continue
    best_C, best_accuracy = C, accuracy_reg

print(f"\nQuestion 6 - Best C: {best_C}")

C = 0.01: Accuracy = 0.743
C = 0.1: Accuracy = 0.743
C = 1: Accuracy = 0.743
C = 10: Accuracy = 0.743
C = 100: Accuracy = 0.743

Question 6 - Best C: 0.01
