# Homework 3 - Classification

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mutual_info_score

In [22]:
# Load the raw lead-scoring data and normalise column names to snake_case.
df = pd.read_csv('course_lead_scoring.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')

# String-valued (object dtype) columns are treated as categorical.
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical_columns = [c for c in df.columns if c not in categorical_columns]

# Normalise categorical values and give missing entries an explicit 'NA'
# category. Filling them with the integer 0 would leave mixed str/int columns,
# which DictVectorizer encodes inconsistently (strings are one-hot encoded,
# numbers are passed through as a single numeric feature).
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_').fillna('NA')

# Missing numerical values (including the target, if any) become 0.
df[numerical_columns] = df[numerical_columns].fillna(0)

## Question 1: What is the mode of the `industry` variable?

In [23]:
# mode() returns a Series (several values on ties); take the first entry positionally.
df['industry'].mode().iloc[0]

'retail'

## Question 2: What are the two features that have the biggest correlation?

Create the correlation matrix for the numerical features. Check these pairs:
- interaction_count and lead_score
- number_of_courses_viewed and lead_score
- number_of_courses_viewed and interaction_count
- annual_income and interaction_count

In [25]:
# Numerical columns that take part in the correlation analysis
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Pairwise correlations between the numerical columns (pandas default method)
corr_matrix = df[numerical_features].corr()
print("Correlation Matrix:")
print(corr_matrix)
print("\n" + "="*50 + "\n")

# Candidate pairs listed in the question
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Look each pair up once, report it, and keep the value for the ranking below
print("Correlation for specific pairs:")
pair_values = {}
for left, right in pairs:
    pair_values[(left, right)] = corr_matrix.loc[left, right]
    print(f"{left} and {right}: {pair_values[(left, right)]:.6f}")

# Rank by absolute value; a strict '>' keeps the first pair when magnitudes tie
max_corr = 0
max_pair = None
for candidate, value in pair_values.items():
    magnitude = abs(value)
    if magnitude > max_corr:
        max_corr, max_pair = magnitude, candidate

best_left, best_right = max_pair
print(f"\nThe pair with the biggest correlation: {best_left} and {best_right} ({corr_matrix.loc[best_left, best_right]:.6f})")

Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  


Correlation for specific pairs:
interaction_count and lead_score: 0.009888
number_of_courses_viewed and lead_score: -0.004879
number_of_courses_viewed and interaction_count: -0.023565
annual_income and interaction_count: 0.027036

The pair with the biggest correlation: annual_income and interaction_count (0.027036)


## Question 3: Which categorical variable has the biggest mutual information score?

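Calculate the mutual information score between `y` (`converted`) and each categorical variable, using the training set only (the 60% split created with seed 42, as described in the Data Preparation section below). Round the scores to 2 decimals.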

In [26]:
# Calculate mutual information score for each categorical variable
categorical_vars = ['industry', 'location', 'lead_source', 'employment_status']

# This cell runs before the Data Preparation cell, so it must not rely on
# df_train / y_train already existing (that only works with leftover kernel
# state). Derive the training rows here with the same arguments as the first
# split in Data Preparation (test_size=0.4, random_state=42), which selects the
# same 60% of rows.
mi_train, _ = train_test_split(df, test_size=0.4, random_state=42)
mi_target = mi_train['converted'].astype(str)

mi_scores_raw = {}  # unrounded scores, used for ranking
mi_scores = {}      # scores rounded to 2 decimals, used for reporting
for var in categorical_vars:
    mi_scores_raw[var] = mutual_info_score(mi_train[var].astype(str), mi_target)
    mi_scores[var] = round(mi_scores_raw[var], 2)
    print(f"{var}: {mi_scores[var]}")

# Rank on the unrounded scores: after rounding, several variables can tie
# (e.g. 0.02 vs 0.02) and the winner would depend only on list order.
max_var = max(mi_scores_raw, key=mi_scores_raw.get)
print(f"\nThe variable with the biggest mutual information score: {max_var} ({mi_scores[max_var]})")

industry: 0.01
location: 0.0
lead_source: 0.02
employment_status: 0.02

The variable with the biggest mutual information score: lead_source (0.02)


## Data Preparation

Split the data into train/val/test sets with 60%/20%/20% distribution using seed 42.

In [27]:
# 60/20/20 split with seed 42, done in two steps:
#   1) hold out 40% of the rows,
#   2) cut that hold-out in half to obtain validation and test.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Give every partition a fresh 0..n-1 index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

total_rows = len(df)
for label, part in (("Train", df_train), ("Val", df_val), ("Test", df_test)):
    print(f"{label} size: {len(part)} ({len(part)/total_rows*100:.1f}%)")

# Pull the target out of each partition; pop() returns the column and removes
# it from the feature frame in one step
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

print("\nTarget distribution:")
for label, target in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"{label}: {target.sum()} converted out of {len(target)} ({target.mean()*100:.1f}%)")

Train size: 877 (60.0%)
Val size: 292 (20.0%)
Test size: 293 (20.0%)

Target distribution:
Train: 535 converted out of 877 (61.0%)
Val: 197 converted out of 292 (67.5%)
Test: 173 converted out of 293 (59.0%)


## Question 4: What's the accuracy on the validation dataset?

Train a logistic regression model with one-hot encoding for categorical variables. Use these parameters for reproducibility:
- solver='liblinear'
- C=1.0
- max_iter=1000
- random_state=42

Round the accuracy to 2 decimal digits.

In [28]:
# Feature lists used by the model
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = ['lead_source', 'industry', 'employment_status', 'location']
feature_columns = categorical + numerical

# One-hot encode via DictVectorizer: fit on the training rows only, then apply
# the learned mapping to the validation rows
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[feature_columns].to_dict(orient='records'))
X_val = dv.transform(df_val[feature_columns].to_dict(orient='records'))

# Logistic regression with the parameters prescribed by the question
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Score on the validation partition
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy_2dp = round(accuracy, 2)

print(f"Validation accuracy: {accuracy}")
print(f"Validation accuracy (rounded to 2 decimals): {accuracy_2dp}")

accuracy_2dp

Validation accuracy: 0.7431506849315068
Validation accuracy (rounded to 2 decimals): 0.74


0.74

## Question 5: Which feature has the smallest difference?

Find the least useful feature using the feature elimination technique:
1. Train a model with all features (same parameters as Q4, without rounding)
2. Exclude each feature one at a time and train a model without it
3. Record the accuracy for each model
4. Calculate the difference: (original accuracy - accuracy without feature)

Which of these features has the smallest difference?
- 'industry'
- 'employment_status'
- 'lead_score'

Note: The difference doesn't have to be positive.

In [29]:
# Define all features
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = ['lead_source', 'industry', 'employment_status', 'location']
all_features = categorical + numerical


def _validation_accuracy(features):
    """Train on df_train[features] and return the accuracy on df_val[features].

    Uses the same pipeline as Q4: DictVectorizer one-hot encoding fitted on the
    training rows, then LogisticRegression(solver='liblinear', C=1.0,
    max_iter=1000, random_state=42). All objects are local, so calling this
    never overwrites the notebook-level dv / model / X_train / X_val.

    Parameters:
        features: list of column names to feed to the model.

    Returns:
        The unrounded accuracy on the validation set, as returned by
        accuracy_score.
    """
    vectorizer = DictVectorizer(sparse=False)
    X_fit = vectorizer.fit_transform(df_train[features].to_dict(orient='records'))
    X_eval = vectorizer.transform(df_val[features].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_fit, y_train)
    return accuracy_score(y_val, clf.predict(X_eval))


# Train baseline model with all features (same as Q4)
baseline_accuracy = _validation_accuracy(all_features)

print(f"Baseline accuracy (with all features): {baseline_accuracy}\n")
print("="*60)

# Train models excluding each feature one at a time
accuracies_without = {}
differences = {}

for feature in all_features:
    # Every feature except the current one
    subset = [f for f in all_features if f != feature]

    accuracy_without = _validation_accuracy(subset)
    accuracies_without[feature] = accuracy_without

    # Difference: original - without (may be negative when dropping the feature helps)
    diff = baseline_accuracy - accuracy_without
    differences[feature] = diff

    print(f"Without '{feature}': accuracy={accuracy_without:.6f}, diff={diff:.6f}")

# Report the three features the question asks about
target_features = ['industry', 'employment_status', 'lead_score']

print("\n" + "="*60)
print("\nFocus on the three specific features:")
for feature in target_features:
    print(f"'{feature}': difference = {differences[feature]:.6f}")

# Smallest difference is judged by magnitude; min() keeps the first feature
# listed when magnitudes tie
min_diff_feature = min(target_features, key=lambda x: abs(differences[x]))
print(f"\nFeature with smallest difference: '{min_diff_feature}' ({differences[min_diff_feature]:.6f})")

Baseline accuracy (with all features): 0.7431506849315068

Without 'lead_source': accuracy=0.732877, diff=0.010274
Without 'industry': accuracy=0.743151, diff=0.000000
Without 'employment_status': accuracy=0.746575, diff=-0.003425
Without 'location': accuracy=0.743151, diff=0.000000
Without 'number_of_courses_viewed': accuracy=0.678082, diff=0.065068
Without 'annual_income': accuracy=0.852740, diff=-0.109589
Without 'interaction_count': accuracy=0.674658, diff=0.068493
Without 'lead_score': accuracy=0.743151, diff=0.000000


Focus on the three specific features:
'industry': difference = 0.000000
'employment_status': difference = -0.003425
'lead_score': difference = 0.000000

Feature with smallest difference: 'industry' (0.000000)


## Question 6: Which C parameter leads to the best accuracy?

Train regularized logistic regression models with different C values: [0.01, 0.1, 1, 10, 100]

Use all features as in Q4 with the same parameters (except varying C).
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which C leads to the best accuracy on the validation set?
- 0.01
- 0.1
- 1
- 10
- 100

Note: If there are multiple options with the same accuracy, select the smallest C.

In [30]:
# Same feature set as Q4
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = ['lead_source', 'industry', 'employment_status', 'location']
all_features = categorical + numerical

# Encode once up front; only the regularisation strength changes between runs
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[all_features].to_dict(orient='records'))
X_val = dv.transform(df_val[all_features].to_dict(orient='records'))

# Candidate values for C, mapped below to their rounded validation accuracy
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

print("Testing different C values:")
print("="*60)

for c_value in C_values:
    # Fit with the current regularisation strength
    model = LogisticRegression(solver='liblinear', C=c_value, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Validation accuracy, stored rounded to 3 decimals as the question requires
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    accuracy_rounded = round(accuracy, 3)
    results[c_value] = accuracy_rounded

    print(f"C={c_value:>6}: accuracy = {accuracy:.6f}, rounded = {accuracy_rounded}")

print("\n" + "="*60)

# Highest rounded accuracy wins; among ties the smallest C is selected
max_accuracy = max(results.values())
best_C = min(c for c in C_values if results[c] == max_accuracy)

print(f"\nBest accuracy: {max_accuracy}")
print(f"Best C: {best_C}")

best_C

Testing different C values:
C=  0.01: accuracy = 0.739726, rounded = 0.74
C=   0.1: accuracy = 0.743151, rounded = 0.743
C=     1: accuracy = 0.743151, rounded = 0.743
C=    10: accuracy = 0.743151, rounded = 0.743
C=   100: accuracy = 0.743151, rounded = 0.743


Best accuracy: 0.743
Best C: 0.1


0.1