In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,mutual_info_score
from sklearn.preprocessing import OneHotEncoder



In [115]:
df=pd.read_csv('course_lead_scoring.csv')

In [116]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [117]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [119]:
# Separate categorical and numerical features
categorical = df.select_dtypes(include=['object']).columns
numerical = df.select_dtypes(include=['number']).columns

# Fill missing values
df[categorical] = df[categorical].fillna('NA')   # For categorical features
df[numerical] = df[numerical].fillna(0.0)        # For numerical features


In [120]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [121]:
# Question 1

df["industry"].mode()[0]

'retail'

In [122]:
# Question 2

corr = df.corr(numeric_only=True)
corr


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [123]:
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

for a, b in pairs:
    print(f'{a} & {b}:', corr.loc[a, b])



    # ✅ Final Answer:
# annual_income and interaction_count


interaction_count & lead_score: 0.009888182496913131
number_of_courses_viewed & lead_score: -0.004878998354681276
number_of_courses_viewed & interaction_count: -0.023565222882888037
annual_income & interaction_count: 0.02703647240481443


In [124]:
# Separate features (X) and target (y)
X = df.drop(columns=['converted'])  # all columns except target
y = df['converted']                 # only the target column


In [125]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42) 

In [126]:
df.select_dtypes(include='object').nunique()

lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [127]:
categorical_features = ['industry', 'location', 'lead_source', 'employment_status']
X_train_cat = df_train[categorical_features]


In [128]:
target="converted"

In [129]:
# Define a helper function
def mutual_info_target_score(series):
    return mutual_info_score(series,df[target])

# Apply the function to all categorical features
mi = df[categorical].apply(mutual_info_target_score)

# Sort and round for readability
mi = mi.sort_values(ascending=False).round(2)
print(mi)


lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64


In [130]:
from sklearn.feature_extraction import DictVectorizer

train_dicts = df_train.to_dict(orient='records')

In [131]:
dv = DictVectorizer(sparse=False)

In [132]:
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [133]:
y_train = df_train['converted'].values
y_val = df_val['converted'].values

In [134]:
df_train = df_train.drop(columns=['converted'])
df_val = df_val.drop(columns=['converted'])

In [135]:
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

In [136]:
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [137]:
X_val = dv.transform(val_dicts)

In [138]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [139]:
y_pred = model.predict(X_val)
base_acc = accuracy_score(y_val, y_pred)
acc_rounded = round(base_acc, 2)

print("Validation Accuracy:", acc_rounded)

Validation Accuracy: 0.7


In [140]:
feature_diffs = {}

for feature in ['industry', 'employment_status', 'lead_score']:
    # Drop the feature
    df_train_reduced = df_train.drop(columns=[feature])
    df_val_reduced = df_val.drop(columns=[feature])

    # Convert to dicts again
    train_dicts = df_train_reduced.to_dict(orient='records')
    val_dicts = df_val_reduced.to_dict(orient='records')

    # Transform with a new DictVectorizer
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    # Train new model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # Compute new accuracy
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    
    # Store difference
    diff = base_acc - acc
    feature_diffs[feature] = diff

# 7️⃣ Print differences
for f, d in feature_diffs.items():
    print(f"{f}: {round(d, 4)}")


industry: 0.0
employment_status: 0.0034
lead_score: -0.0068


In [141]:
X_train.shape

(876, 30)

In [142]:
# train_dicts = df_full_train.drop(columns=[target]).to_dict(orient='records')
# val_dicts = df_val.drop(columns=[target]).to_dict(orient='records')

# Vectorize
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Confirm shapes before training
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_val:", X_val.shape)
print("y_val:", y_val.shape)

# --- Training with different C values ---
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for c in C_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    accuracies[c] = round(acc, 3)

# --- Display Results ---
for c, acc in accuracies.items():
    print(f"C={c}: Accuracy={acc}")

best_C = max(accuracies, key=accuracies.get)
print(f"\n✅ Best C value: {best_C} with Accuracy={accuracies[best_C]}")

X_train: (876, 30)
y_train: (876,)
X_val: (293, 30)
y_val: (293,)
C=0.01: Accuracy=0.696
C=0.1: Accuracy=0.7
C=1: Accuracy=0.706
C=10: Accuracy=0.706
C=100: Accuracy=0.706

✅ Best C value: 1 with Accuracy=0.706
