# ORIGINAL DATASET

In [None]:
# Dataset: Lead_Scoring.csv (the raw file loaded by the data-cleaning cell below)

# DATA CLEANING

In [None]:
# DATA CLEANING
import pandas as pd
import numpy as np
# Load the raw lead export.
df = pd.read_csv('Lead_Scoring.csv')

# Columns removed before any further cleaning.
# NOTE: the previous list contained ' Magazine ' with no trailing comma, so
# Python concatenated it with the next literal into the single, non-existent
# label ' Magazine Get updates on DM Content' and `drop` raised KeyError.
# 'Magazine' itself is intentionally kept: the Yes/No encoding step further
# down in this cell still reads that column.
columns_to_drop = [
    'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
]
df = df.drop(columns=columns_to_drop)

# Verify columns are dropped
print(df.columns)

# Numeric gaps: fill with the column mean, then truncate every value to an int.
numerical_columns = [
    'TotalVisits',
    'Page Views Per Visit',
    'Asymmetrique Activity Score',
    'Asymmetrique Profile Score',
]
column_means = df[numerical_columns].mean()
df[numerical_columns] = df[numerical_columns].fillna(column_means).astype(int)

# Categorical gaps: everything in this list shares the 'Other' bucket.
categorical_columns = [
    'Lead Source',
    'Country',
    'Specialization',
    'Tags',
    'How did you hear about X Education',
    'What is your current occupation',
    'What matters most to you in choosing a course',
    'Lead Profile',
    'Last Activity',
]
df[categorical_columns] = df[categorical_columns].fillna('Other')

# Two columns use their own placeholder instead of 'Other'.
for column, placeholder in (('Lead Quality', 'Not Sure'), ('City', 'Other Cities')):
    df[column] = df[column].fillna(placeholder)

# Yes/No flag columns are recoded to 1/0; any other value is left untouched.
binary_columns = [
    'Do Not Email',
    'Do Not Call',
    'Search',
    'Magazine',
    'Newspaper Article',
    'X Education Forums',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'A free copy of Mastering The Interview',
]
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_columns:
    df[col] = df[col].replace(yes_no_codes)

# Fold placeholder values ('unknown', 'Select', 'Other Metro Cities') into the
# same buckets that were used above for missing data.
placeholder_fixes = {
    'Country': {'unknown': 'Other'},
    'Specialization': {'Select': 'Other'},
    'Lead Profile': {'Select': 'Other'},
    'City': {'Select': 'Other Cities', 'Other Metro Cities': 'Other Cities'},
    'How did you hear about X Education': {'Select': 'Other'},
}
for column_name, mapping in placeholder_fixes.items():
    df[column_name] = df[column_name].replace(mapping)

# Keep only the trailing word of each index value (the regex captures the
# final run of word characters).
for index_column in ('Asymmetrique Activity Index', 'Asymmetrique Profile Index'):
    df[index_column] = df[index_column].str.extract(r'(\w+)$')

def map_activity_index(score):
    """Return the activity band for a numeric activity score.

    Scores of 16 or more map to 'High', scores from 13 up to (but not
    including) 16 map to 'Medium', and anything else -- including values
    that fail both comparisons, such as NaN -- maps to 'Low'.
    """
    # Thresholds are checked from the highest band downwards.
    for lower_bound, band in ((16, 'High'), (13, 'Medium')):
        if score >= lower_bound:
            return band
    return 'Low'
# Rows still lacking an activity index get the band implied by their score.
derived_activity_index = df['Asymmetrique Activity Score'].map(map_activity_index)
df['Asymmetrique Activity Index'] = df['Asymmetrique Activity Index'].fillna(derived_activity_index)

def map_profile_index(score):
    """Return the profile band for a numeric profile score.

    Scores of 17 or more map to 'High', scores from 13 up to (but not
    including) 17 map to 'Medium', and anything else -- including values
    that fail both comparisons, such as NaN -- maps to 'Low'.
    """
    # Thresholds are checked from the highest band downwards.
    for lower_bound, band in ((17, 'High'), (13, 'Medium')):
        if score >= lower_bound:
            return band
    return 'Low'
# Rows still lacking a profile index get the band implied by their score.
derived_profile_index = df['Asymmetrique Profile Score'].map(map_profile_index)
df['Asymmetrique Profile Index'] = df['Asymmetrique Profile Index'].fillna(derived_profile_index)

# Rename the catch-all profile bucket from 'Other' to 'Other Leads'.
df['Lead Profile'] = df['Lead Profile'].replace({'Other': 'Other Leads'})

# Cities whose rows get 'India' when the country is only known as 'Other'.
cities = ['Mumbai', 'Thane & Outskirts', 'Other Cities of Maharashtra']


def replace_country(row):
    """Return the country to store for one record.

    `row` is any mapping with 'City' and 'Country' keys. The result is
    'India' when the city is listed in `cities` and the country is the
    'Other' placeholder; otherwise the existing country is returned as is.
    """
    needs_backfill = row['City'] in cities and row['Country'] == 'Other'
    return 'India' if needs_backfill else row['Country']
# Same rule as replace_country, applied column-wise: rows whose city is in
# `cities` and whose country is the 'Other' placeholder become 'India'.
india_mask = df['City'].isin(cities) & df['Country'].eq('Other')
df['Country'] = df['Country'].mask(india_mask, 'India')

# save the cleaned data
df.to_csv('cleandata.csv', index=False)

# HYPOTHESIS

In [None]:
# HYPOTHESIS

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style
sns.set_style("whitegrid")
sns.set_palette("coolwarm")

# Load the cleaned data
df = pd.read_csv("cleandata.csv")

# Hypothesis 1: giving a free copy of the master class increases conversion.
# NOTE(review): this chart only shows converted vs. not-converted counts among
# leads who opted for the free copy; it does not compare against the others.
free_copy_users = df[df["A free copy of Mastering The Interview"] == 1]
conversion_counts = free_copy_users["Converted"].value_counts()

# Prepare data for visualization. A class that never occurs is reported as 0
# rather than None so the bar chart always receives numeric heights.
conversion_df = pd.DataFrame({
    "Conversion Status": ["Converted", "Not Converted"],
    "Count": [conversion_counts.get(1, 0), conversion_counts.get(0, 0)],
})

# Plot the data. Only one figure is opened; the earlier extra
# plt.figure(figsize=(12, 6)) call produced an empty, never-used figure.
plt.figure(figsize=(8, 6))
sns.barplot(x="Conversion Status", y="Count", data=conversion_df)
plt.title("Conversion Status of Users Who Opted for Free Copy")
plt.ylabel("Count")
plt.xlabel("Conversion Status")
plt.show()

# Hypothesis 2: employed leads (working professionals, businessmen) convert
# at a higher rate than everyone else.
# The membership mask is computed once and negated for the second group, so
# the two groups are guaranteed to partition the data.
is_employed = df["What is your current occupation"].isin(["Working Professional", "Businessman"])
employee_conversion = df[is_employed]
unemployee_conversion = df[~is_employed]

# `Converted` is 0/1, so its mean is the conversion rate of each group.
employee_conversion_rate = employee_conversion["Converted"].mean()
unemployee_conversion_rate = unemployee_conversion["Converted"].mean()

conversion_df = pd.DataFrame({
    "Group": ["Employees", "Unemployees"],
    "Conversion Rate": [employee_conversion_rate, unemployee_conversion_rate],
})

# Single figure only; the earlier extra plt.figure(figsize=(12, 6)) call left
# an empty figure behind.
plt.figure(figsize=(8, 6))
sns.barplot(x="Group", y="Conversion Rate", data=conversion_df)
plt.title("Conversion Rate: Employees vs Unemployees")
plt.ylabel("Conversion Rate")
plt.xlabel("Group")
plt.show()

# Hypothesis 3: prospects who have been in touch with the company convert at
# a higher rate.
# A lead counts as "not contacted" when either opt-out flag is set (the same
# rule the feature-engineering step uses for its 'Do Not Contact' column);
# "contacted" is the exact complement. Previously both groups were built with
# `|`, so a lead with only one flag set was counted in both groups.
opted_out = (df['Do Not Call'] == 1) | (df['Do Not Email'] == 1)
contacted_conversion = df[~opted_out]
not_contacted_conversion = df[opted_out]

# `Converted` is 0/1, so its mean is the conversion rate of each group.
contacted_conversion_rate = contacted_conversion["Converted"].mean()
not_contacted_conversion_rate = not_contacted_conversion["Converted"].mean()

conversion_df = pd.DataFrame({
    "Group": ["Contacted", "Not Contacted"],
    "Conversion Rate": [contacted_conversion_rate, not_contacted_conversion_rate],
})

# Single figure only; the earlier extra plt.figure(figsize=(12, 6)) call left
# an empty figure behind.
plt.figure(figsize=(8, 6))
sns.barplot(x="Group", y="Conversion Rate", data=conversion_df)
plt.title("Conversion Rate: Contacted vs Not Contacted")
plt.ylabel("Conversion Rate")
plt.xlabel("Group")
plt.show()

# Hypothesis 4: leads profiled as 'Potential Lead' convert at a higher rate.
plt.figure(figsize=(6, 6))
# Boolean helper column: True where the profile is exactly 'Potential Lead'.
df['Leads'] = df['Lead Profile'].eq('Potential Lead')
lead_rates = df.groupby("Leads")["Converted"].mean()
sns.barplot(x=lead_rates.index, y=lead_rates.values)
plt.title("Conversion Rate by Lead Profile")
plt.ylabel("Conversion Rate")
plt.xlabel("Lead Profile")
plt.show()

# Hypothesis 5: leads with more than 10 recorded visits convert more often.
plt.figure(figsize=(6, 6))
# Boolean helper column: True where TotalVisits exceeds 10.
df['Visited Website'] = df['TotalVisits'].gt(10)
conversion_rates = df.groupby("Visited Website")["Converted"].mean()
bar_labels = conversion_rates.index.astype(str)
bar_heights = conversion_rates.values
sns.barplot(x=bar_labels, y=bar_heights, palette="viridis")
plt.title("Conversion Rate by Website Visits", fontsize=16, fontweight='bold')
plt.ylabel("Conversion Rate (%)", fontsize=12)
plt.xlabel("Visited Website (More than 10 Visits)", fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
# Write each rate, formatted as a percentage, just above its bar.
for bar_position, rate in enumerate(bar_heights):
    plt.text(bar_position, rate + 0.01, f"{rate:.2%}", ha='center', fontsize=10, color='black')
plt.show()

# Plot-d: Conversion Rate vs. Total Time Spent on Website
# Bar height is the mean time spent for each value of `Converted`.
# The data source is `df`; the name `df1` used here before is never defined
# in this notebook and raised NameError.
plt.figure(figsize=(8, 5))
sns.barplot(x="Converted", y="Total Time Spent on Website", data=df)
plt.title("Impact of Website Engagement on Conversion")
plt.xlabel("Converted (1 = Yes, 0 = No)")
plt.ylabel("Total Time Spent on Website")
plt.show()

# Hypothesis 7: most prospects are from India.
# Bars are ordered from the most to the least frequent country.
country_order = df["Country"].value_counts().index
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x="Country", order=country_order)
plt.title("Prospects by Country")
plt.ylabel("Count")
plt.xlabel("Country")
plt.xticks(rotation=90)
plt.show()

# Hypothesis 8: prospects with a High or Medium activity index convert at a
# higher rate.
activity_levels = ["Low", "Medium", "High"]

plt.figure(figsize=(10, 6))
# Ordered categorical so the groups follow Low < Medium < High.
df["Asymmetrique Activity Index"] = pd.Categorical(
    df["Asymmetrique Activity Index"],
    categories=activity_levels,
    ordered=True,
)
activity_conversion = (
    df.groupby("Asymmetrique Activity Index")["Converted"].mean().reset_index()
)
sns.barplot(
    data=activity_conversion,
    x="Asymmetrique Activity Index",
    y="Converted",
    order=activity_levels,
    palette="coolwarm",
)
plt.title("Impact of Asymmetrique Activity Index on Conversion Rate")
plt.ylabel("Average Conversion Rate")
plt.xlabel("Asymmetrique Activity Index")
# Fixed 0-1 axis with ticks every 0.2.
plt.ylim(0, 1)
plt.yticks(np.arange(0, 1.2, 0.2))
plt.show()

# Hypothesis 9: prospects with a High or Medium profile index convert at a
# higher rate.
profile_levels = ["Low", "Medium", "High"]

plt.figure(figsize=(10, 6))
# Ordered categorical so the groups follow Low < Medium < High.
df["Asymmetrique Profile Index"] = pd.Categorical(
    df["Asymmetrique Profile Index"],
    categories=profile_levels,
    ordered=True,
)
profile_conversion = (
    df.groupby("Asymmetrique Profile Index")["Converted"].mean().reset_index()
)
sns.barplot(
    data=profile_conversion,
    x="Asymmetrique Profile Index",
    y="Converted",
    order=profile_levels,
    palette="coolwarm",
)
plt.title("Impact of Asymmetrique Profile Index on Conversion Rate")
plt.ylabel("Average Conversion Rate")
plt.xlabel("Asymmetrique Profile Index")
# Fixed 0-1 axis with ticks every 0.2.
plt.ylim(0, 1)
plt.yticks(np.arange(0, 1.2, 0.2))
plt.show()

# Hypothesis 10: prospects who sent an SMS or resubscribed to email convert
# at a higher rate.
# Rows: last notable activity; columns: Converted (0, 1); cells: lead counts.
activity_conversion_counts = (
    df.groupby(["Last Notable Activity", "Converted"]).size().unstack(fill_value=0)
)

# Draw on an explicit Axes. Previously plt.figure() was followed by
# DataFrame.plot(figsize=...), which opened a second figure and left the
# first one empty.
fig, ax = plt.subplots(figsize=(14, 8))
activity_conversion_counts.plot(
    kind="bar",
    stacked=True,
    ax=ax,
    color=["#FF9999", "#66B2FF"],
    edgecolor="black",
)
plt.title("Conversion Counts by Last Notable Activity", fontsize=16, fontweight="bold")
plt.xlabel("Last Notable Activity", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
# Legend labels follow the column order of the unstacked table (0 then 1).
plt.legend(["Not Converted", "Converted"], fontsize=12, title="Conversion Status")
plt.tight_layout()
plt.show()

# Hypothesis: prospects from health and finance sectors convert at a high rate.
# Mean of `Converted` per specialization, scaled to percent and sorted so the
# best-converting specialization comes first.
specialization_conversion = (
    df.groupby('Specialization')['Converted']
    .mean()
    .mul(100)
    .reset_index()
    .sort_values(by='Converted', ascending=False)
)
plt.figure(figsize=(12, 6))
sns.barplot(
    data=specialization_conversion,
    x='Converted',
    y='Specialization',
    palette='coolwarm',
)
plt.xlabel("Conversion Rate (%)")
plt.ylabel("Specialization")
plt.title("Conversion Rate by Specialization")
plt.xlim(0, 100)
plt.show()

# Hypothesis: prospects who submitted a landing page application convert at a
# high rate. The chart counts converted leads per 'Lead Origin' value.
df_converted = df[df['Converted'] == 1]
origin_order = df_converted['Lead Origin'].value_counts().index

plt.figure(figsize=(12, 6))
ax = sns.countplot(data=df_converted, x='Lead Origin', palette='inferno', order=origin_order)
plt.title('Count of Converted Members by Referral Source', fontsize=16, pad=20)
plt.xlabel('Referral Source', fontsize=12)
plt.ylabel('Count of Converted', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Label every bar with its integer height, 5 points above the bar top.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_top)}',
        (bar_centre, bar_top),
        ha='center',
        va='center',
        xytext=(0, 5),
        textcoords='offset points',
    )
plt.tight_layout()
plt.show()

# ANALYSIS

In [None]:
# ANALYSIS

# Plot appearance for the analysis charts.
sns.set_style("whitegrid")
sns.set_palette("coolwarm")

# Start again from the cleaned file on disk.
df = pd.read_csv("cleandata.csv")

# Lead Origin VS Conversion Rate, then Lead Source VS Conversion Rate:
# one bar chart per column, bar height = mean of `Converted` in each category.
for group_column in ("Lead Origin", "Lead Source"):
    group_rates = df.groupby(group_column)["Converted"].mean()
    plt.figure(figsize=(12, 6))
    sns.barplot(x=group_rates.index, y=group_rates.values)
    plt.title(f"Conversion Rate by {group_column}")
    plt.ylabel("Conversion Rate")
    plt.xlabel(group_column)
    plt.xticks(rotation=90)
    plt.show()

# Time Spent on Website VS Conversion Rate
# Distribution of time spent, split by conversion outcome.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x="Converted", y="Total Time Spent on Website")
plt.title("Total Time Spent on Website vs Conversion")
plt.xlabel("Converted (Not Converted=0, Converted=1)")
plt.ylabel("Total Time Spent on Website")
plt.show()

# Page Views Per Visit VS Conversion Rate
# Distribution of page views per visit, split by conversion outcome.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x="Converted", y="Page Views Per Visit", palette="coolwarm")
plt.title("Page Views Per Visit vs Conversion")
plt.xlabel("Converted (0=No, 1=Yes)")
plt.ylabel("Page Views Per Visit")
plt.show()

# Country-wise conversions show differences
# Per country: 'sum' = converted leads, 'count' = all leads,
# 'non_converted' = count - sum. Sorted by converted leads, largest first.
country_conversion = (
    df.groupby("Country")["Converted"]
    .agg(['sum', 'count'])
    .reset_index()
    .assign(non_converted=lambda totals: totals["count"] - totals["sum"])
    .sort_values(by="sum", ascending=False)
)
countries = country_conversion["Country"]
converted_leads = country_conversion["sum"]
non_converted_leads = country_conversion["non_converted"]

plt.figure(figsize=(12, 6))
# Non-converted bars sit on top of the converted bars, so each full bar
# height is the country's total number of leads.
plt.bar(countries, non_converted_leads, bottom=converted_leads,
        label="Non-Converted", color="lightcoral")
plt.bar(countries, converted_leads, label="Converted", color="seagreen")
plt.title("Total Number of Leads (Converted vs Non-Converted) by Country")
plt.ylabel("Total Leads")
plt.xlabel("Country")
plt.xticks(rotation=90)
plt.legend()
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Tags VS Conversion Rate
# Bar height = mean of `Converted` for each tag.
tag_rates = df.groupby("Tags")["Converted"].mean()
plt.figure(figsize=(12, 6))
sns.barplot(x=tag_rates.index, y=tag_rates.values)
plt.title("Conversion Rate by Tags")
plt.ylabel("Conversion Rate")
plt.xlabel("Tags")
plt.xticks(rotation=90)
plt.show()

# FEATURE ENGINEERING

In [None]:
# FEATURE ENGINEERING

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

# Load the data
df = pd.read_csv('cleandata.csv')

# FEATURE CREATION
# Engagement Score = time spent on the website per visit, truncated to int.
time_per_visit = df['Total Time Spent on Website'] / df['TotalVisits']
# Zero visits produce NaN (0/0) or +/-inf (x/0). fillna alone leaves the
# infinities in place and astype(int) then raises, so both cases become 0.
df['Engagement Score'] = (
    time_per_visit.replace([np.inf, -np.inf], np.nan).fillna(0).astype(int)
)

# Do Not Contact = 1 when the lead opted out of email or calls, else 0.
# Vectorised equivalent of the former row-by-row apply.
df['Do Not Contact'] = (
    (df['Do Not Email'] == 1) | (df['Do Not Call'] == 1)
).astype(int)
print(df.head())

# FEATURE BUCKETING
# Both columns are cut into three quantile buckets and then recoded 1/2/3.
bucket_labels = ['Low', 'Medium', 'High']
bucket_codes = {'Low': 1, 'Medium': 2, 'High': 3}

time_buckets = pd.qcut(
    df['Total Time Spent on Website'],
    q=3,
    labels=bucket_labels,
    duplicates='drop',
)
df['Total Time Spent on Website'] = time_buckets.map(bucket_codes)

page_view_buckets = pd.qcut(df['Page Views Per Visit'], q=3, labels=bucket_labels)
df['Page Views Per Visit'] = page_view_buckets.map(bucket_codes)

# FEATURE TRANSFORMATION
# Ordinal encoding of the two index columns into new score columns.
index_codes = {'Low': 1, 'Medium': 2, 'High': 3}
df['Activity Score'] = df['Asymmetrique Activity Index'].map(index_codes)
df['Profile Score'] = df['Asymmetrique Profile Index'].map(index_codes)

# One-hot encoding: every category is kept (drop=None) and the output is dense.
cat_columns = [
    'Lead Origin',
    'Lead Source',
    'Country',
    'Specialization',
    'What is your current occupation',
    'Lead Quality',
    'Lead Profile',
]
encoder = OneHotEncoder(drop=None, sparse_output=False)
encoded_features = encoder.fit_transform(df[cat_columns])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(cat_columns),
)

# Swap the raw categorical columns for their encoded counterparts; the index
# is reset first so both frames line up row by row in the concat.
remaining_columns = df.drop(columns=cat_columns).reset_index(drop=True)
df = pd.concat([remaining_columns, encoded_df], axis=1)

# FEATURE CORRELATION
# Pairwise correlation of the numeric columns only, drawn as an annotated heatmap.
df_numerical_col = df.select_dtypes(include=[np.number])
correlation = df_numerical_col.corr()

plt.figure(figsize=(20, 10))
sns.heatmap(data=correlation, annot=True, cmap='coolwarm')
plt.show()

# FEATURE SELECTION
# Explicit list of the columns written to 'updated_model_features.csv'.
# The prefixed names (e.g. 'Lead Origin_API') follow the '<column>_<category>'
# pattern of the one-hot encoded columns, so every entry must exist in `df`;
# selecting with a missing label raises KeyError.
selected_features = [
    # Identifier, target and engineered / ordinal features
    'Prospect ID', 'Do Not Contact', 'Converted', 'Engagement Score', 'Page Views Per Visit',
    'Activity Score', 'Profile Score', 'A free copy of Mastering The Interview',

    # One-hot columns: Lead Origin
    'Lead Origin_API', 'Lead Origin_Landing Page Submission', 'Lead Origin_Lead Add Form',
    'Lead Origin_Lead Import', 'Lead Origin_Quick Add Form',

    # One-hot columns: Lead Source
    'Lead Source_Click2call', 'Lead Source_Direct Traffic', 'Lead Source_Facebook', 
    'Lead Source_Google', 'Lead Source_Live Chat', 'Lead Source_NC_EDM', 
    'Lead Source_Olark Chat', 'Lead Source_Organic Search', 'Lead Source_Other',
    'Lead Source_Pay per Click Ads', 'Lead Source_Press_Release', 'Lead Source_Reference',
    'Lead Source_Referral Sites', 'Lead Source_Social Media', 'Lead Source_WeLearn',
    'Lead Source_Welingak Website', 'Lead Source_bing', 'Lead Source_blog', 
    'Lead Source_google', 'Lead Source_testone', 'Lead Source_welearnblog_Home', 
    'Lead Source_youtubechannel',

    # One-hot columns: Country
    'Country_Asia/Pacific Region', 'Country_Australia', 'Country_Bahrain', 
    'Country_Bangladesh', 'Country_Belgium', 'Country_Canada', 'Country_China', 
    'Country_Denmark', 'Country_France', 'Country_Germany', 'Country_Ghana', 
    'Country_Hong Kong', 'Country_India', 'Country_Indonesia', 'Country_Italy', 
    'Country_Kenya', 'Country_Kuwait', 'Country_Liberia', 'Country_Malaysia', 
    'Country_Netherlands', 'Country_Nigeria', 'Country_Oman', 'Country_Other', 
    'Country_Philippines', 'Country_Qatar', 'Country_Russia', 'Country_Saudi Arabia', 
    'Country_Singapore', 'Country_South Africa', 'Country_Sri Lanka', 'Country_Sweden', 
    'Country_Switzerland', 'Country_Tanzania', 'Country_Uganda', 
    'Country_United Arab Emirates', 'Country_United Kingdom', 'Country_United States', 
    'Country_Vietnam',

    # One-hot columns: Specialization
    'Specialization_Banking, Investment And Insurance', 'Specialization_Business Administration', 
    'Specialization_E-Business', 'Specialization_E-COMMERCE', 'Specialization_Finance Management', 
    'Specialization_Healthcare Management', 'Specialization_Hospitality Management', 
    'Specialization_Human Resource Management', 'Specialization_IT Projects Management', 
    'Specialization_International Business', 'Specialization_Marketing Management', 
    'Specialization_Media and Advertising', 'Specialization_Operations Management', 
    'Specialization_Other', 'Specialization_Retail Management', 
    'Specialization_Rural and Agribusiness', 'Specialization_Services Excellence', 
    'Specialization_Supply Chain Management', 'Specialization_Travel and Tourism',

    # One-hot columns: What is your current occupation
    'What is your current occupation_Businessman', 'What is your current occupation_Housewife',
    'What is your current occupation_Other', 'What is your current occupation_Student',
    'What is your current occupation_Unemployed', 'What is your current occupation_Working Professional',

    # One-hot columns: Lead Quality
    'Lead Quality_High in Relevance', 'Lead Quality_Low in Relevance', 'Lead Quality_Might be', 
    'Lead Quality_Not Sure', 'Lead Quality_Worst',

    # One-hot columns: Lead Profile
    'Lead Profile_Dual Specialization Student', 'Lead Profile_Lateral Student', 
    'Lead Profile_Other Leads', 'Lead Profile_Potential Lead', 'Lead Profile_Student of SomeSchool'
]
# Keep only the listed columns, in the listed order.
df_select = df[selected_features]

# Persist the selected features (without the DataFrame index).
df_select.to_csv('updated_model_features.csv', index=False)

# MODEL TRAINING & MODEL EVALUATION

In [None]:
# MODEL TRAINING 

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Load the dataset
data = pd.read_csv('updated_model_features.csv')

# 'Prospect ID' is not used for model training, so it is removed up front.
data = data.drop(columns='Prospect ID')

# Save exactly the columns (features + target) that go into training.
data.to_csv('features_given_to_model.csv', index=False)

# Target variable (y) and features (X)
y = data['Converted']
X = data.drop(columns='Converted')

# 80% training / 20% testing, with a fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Per-model results, keyed by model name and filled in by evaluate_model.
results = {}

# Function to train, evaluate and store results of each model
def evaluate_model(model, model_name):
    """
    Train a model, evaluate it on the test split and store the results.

    Fits `model` on the module-level X_train / y_train, predicts on X_test,
    and records accuracy, ROC AUC, the confusion matrix and the
    classification report in the module-level `results` dict under
    `model_name`. It then shows the confusion matrix as a heatmap and prints
    the classification report.

    Args:
        model: estimator exposing `fit` and `predict`; `predict_proba` or
            `decision_function` is used for ROC AUC when available.
        model_name: key under which the results are stored; also used in the
            plot title and the printed report header.

    Returns:
        None. Results are written to `results[model_name]`.
    """
    model.fit(X_train, y_train)  # Train model
    predictions = model.predict(X_test)  # Hard class labels on test data

    # ROC AUC measures how well the model ranks positives above negatives, so
    # it needs continuous scores. Feeding it hard 0/1 labels collapses the
    # curve to a single threshold and distorts the model comparison.
    if hasattr(model, "predict_proba"):
        # Assumes a binary target whose positive class is the second column.
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        # No score output available: fall back to the previous behaviour.
        scores = predictions

    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, scores)
    conf_matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)

    # Store results
    results[model_name] = {
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'conf_matrix': conf_matrix,
        'classification_report': report
    }

    # Display Confusion Matrix
    plt.figure(figsize=(5, 4))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Print Classification Report
    print(f"Classification Report for {model_name}:\n{report}")

# Each model is imported, built with a fixed seed and evaluated in turn, so a
# missing optional library only affects its own model and the ones after it.

# MODEL 1: Logistic Regression
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression(max_iter=200, random_state=42)
evaluate_model(logistic_regression, "Logistic Regression")

# MODEL 2: Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(random_state=42)
evaluate_model(decision_tree, "Decision Tree")

# MODEL 3: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)
evaluate_model(random_forest, "Random Forest")

# MODEL 4: XGBoost Classifier
import xgboost as xgb
xgboost_model = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
evaluate_model(xgboost_model, "XGBoost")

# MODEL 5: LightGBM Classifier
import lightgbm as lgb
lightgbm_model = lgb.LGBMClassifier(random_state=42)
evaluate_model(lightgbm_model, "LightGBM")

# Model Comparison Table: one row per evaluated model, best ROC AUC first.
comparison_rows = [
    {'Model': name, 'Accuracy': metrics['accuracy'], 'ROC AUC': metrics['roc_auc']}
    for name, metrics in results.items()
]
comparison = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'Accuracy', 'ROC AUC'],
).sort_values('ROC AUC', ascending=False)

# Display the model comparison table
print("\n🔹 Model Comparison Table:")
print(comparison)
