In [None]:
# original dataset - Lead_Scoring.csv

In [None]:
# DATA CLEANING
import pandas as pd
import numpy as np
# Load the raw lead-scoring export.
df = pd.read_csv('Lead_Scoring.csv')

# Columns removed before cleaning.
# The previous list was missing a comma after ' Magazine ', so Python's implicit
# string concatenation fused it with the next entry into the single label
# ' Magazine Get updates on DM Content', which made df.drop() raise KeyError.
# 'Magazine' is intentionally not dropped: it is re-encoded as a 1/0 flag in the
# binary_columns step later in this cell.
columns_to_drop = [
    'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
]
df = df.drop(columns=columns_to_drop)

# Verify columns are dropped
print(df.columns)

# Numeric features: missing values are replaced by the column mean, then every
# column is cast to int (the cast drops any fractional part).
numerical_columns = [
    'TotalVisits',
    'Page Views Per Visit',
    'Asymmetrique Activity Score',
    'Asymmetrique Profile Score',
]
column_means = df[numerical_columns].mean()
df[numerical_columns] = df[numerical_columns].fillna(column_means).astype(int)

# Categorical features: missing values fall into a generic 'Other' bucket.
categorical_columns = [
    'Lead Source',
    'Country',
    'Specialization',
    'Tags',
    'How did you hear about X Education',
    'What is your current occupation',
    'What matters most to you in choosing a course',
    'Lead Profile',
    'Last Activity',
]
df[categorical_columns] = df[categorical_columns].fillna('Other')

# These two columns use their own placeholder instead of 'Other'.
special_fill_values = {'Lead Quality': 'Not Sure', 'City': 'Other Cities'}
for column, fill_value in special_fill_values.items():
    df[column] = df[column].fillna(fill_value)

# Yes/No flag columns are re-encoded as 1/0; any other value is left untouched.
binary_columns = [
    'Do Not Email',
    'Do Not Call',
    'Search',
    'Magazine',
    'Newspaper Article',
    'X Education Forums',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'A free copy of Mastering The Interview',
]
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_columns:
    df[col] = df[col].replace(yes_no_codes)

# Fold placeholder values into the catch-all bucket of their column.
# Each entry is (column, value to replace, replacement); order is preserved.
placeholder_fixes = [
    ('Country', 'unknown', 'Other'),
    ('Specialization', 'Select', 'Other'),
    ('Lead Profile', 'Select', 'Other'),
    ('City', 'Select', 'Other Cities'),
    ('City', 'Other Metro Cities', 'Other Cities'),
    ('How did you hear about X Education', 'Select', 'Other'),
]
for column, old_value, new_value in placeholder_fixes:
    df[column] = df[column].replace(old_value, new_value)

# Keep only the trailing run of word characters of each index label.
for index_column in ('Asymmetrique Activity Index', 'Asymmetrique Profile Index'):
    df[index_column] = df[index_column].str.extract(r'(\w+)$', expand=False)

def map_activity_index(score):
    """Bucket an activity score into 'High', 'Medium' or 'Low'.

    Scores of 16 or more are 'High', scores from 13 up to (but excluding) 16
    are 'Medium', and everything else is 'Low'.
    """
    if score >= 16:
        return 'High'
    if score >= 13:
        return 'Medium'
    return 'Low'
# Rows without an activity index label get one derived from their numeric score.
activity_index_from_score = df['Asymmetrique Activity Score'].apply(map_activity_index)
df['Asymmetrique Activity Index'] = df['Asymmetrique Activity Index'].fillna(activity_index_from_score)

def map_profile_index(score):
    """Bucket a profile score into 'High', 'Medium' or 'Low'.

    Scores of 17 or more are 'High', scores from 13 up to (but excluding) 17
    are 'Medium', and everything else is 'Low'.
    """
    if score >= 17:
        return 'High'
    if score >= 13:
        return 'Medium'
    return 'Low'
# Rows without a profile index label get one derived from their numeric score.
profile_index_from_score = df['Asymmetrique Profile Score'].apply(map_profile_index)
df['Asymmetrique Profile Index'] = df['Asymmetrique Profile Index'].fillna(profile_index_from_score)

# Rename the generic 'Other' lead profile to 'Other Leads'.
df['Lead Profile'] = df['Lead Profile'].replace('Other', 'Other Leads')

# Cities for which a placeholder country ('Other') is rewritten to 'India'.
cities = ['Mumbai', 'Thane & Outskirts', 'Other Cities of Maharashtra']


def replace_country(row):
    """Return the country to store for ``row``.

    ``row`` must support lookup by the keys 'City' and 'Country'. When the
    city is one of ``cities`` and the country is the placeholder 'Other',
    'India' is returned; otherwise the existing country is returned unchanged.
    """
    if row['City'] not in cities:
        return row['Country']
    return 'India' if row['Country'] == 'Other' else row['Country']
# Row-wise pass that applies replace_country to every record.
inferred_country = df.apply(replace_country, axis=1)
df['Country'] = inferred_country

# save the cleaned data
df.to_csv('cleandata.csv', index=False)

In [None]:
# HYPOTHESIS

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style
sns.set_style("whitegrid")
sns.set_palette("coolwarm")
# load data
df = pd.read_csv("cleandata.csv")

# Hypothesis 1: if we give a free copy of the master class, the conversion rate will increase.
# The chart shows converted vs. not-converted counts among leads who opted for the free copy.
free_copy_users = df[df["A free copy of Mastering The Interview"] == 1]

conversion_counts = free_copy_users["Converted"].value_counts()

# Prepare data for visualization.
# A default of 0 is supplied because Series.get returns None for a missing key,
# which would put a non-numeric value in the "Count" column if one outcome
# never occurs.
conversion_data = {
    "Conversion Status": ["Converted", "Not Converted"],
    "Count": [conversion_counts.get(1, 0), conversion_counts.get(0, 0)]
}

# Create a DataFrame for plotting
conversion_df = pd.DataFrame(conversion_data)

# Plot the data. Exactly one figure is opened here; an earlier extra
# plt.figure() call left an empty figure that was rendered alongside the chart.
plt.figure(figsize=(8, 6))
sns.barplot(x="Conversion Status", y="Count", data=conversion_df)
plt.title("Conversion Status of Users Who Opted for Free Copy")
plt.ylabel("Count")
plt.xlabel("Conversion Status")
plt.show()

# Hypothesis 2: employees (working professionals, businessmen) have a higher conversion rate.
# The occupation mask is computed once and negated for the second group, so the
# two groups are guaranteed to be complementary.
is_employee = df["What is your current occupation"].isin(["Working Professional", "Businessman"])
employee_conversion = df[is_employee]
unemployee_conversion = df[~is_employee]

# "Converted" is averaged per group to obtain each group's conversion rate.
employee_conversion_rate = employee_conversion["Converted"].mean()
unemployee_conversion_rate = unemployee_conversion["Converted"].mean()

conversion_data = {
    "Group": ["Employees", "Unemployees"],
    "Conversion Rate": [employee_conversion_rate, unemployee_conversion_rate]
}

conversion_df = pd.DataFrame(conversion_data)

# Exactly one figure is opened; an earlier extra plt.figure() call produced an
# empty figure in the cell output.
plt.figure(figsize=(8, 6))
sns.barplot(x="Group", y="Conversion Rate", data=conversion_df)
plt.title("Conversion Rate: Employees vs Unemployees")
plt.ylabel("Conversion Rate")
plt.xlabel("Group")
plt.show()

# Hypothesis 3: prospects who have been in touch with the company have a higher conversion rate.
#
# Group definitions (mutually exclusive):
#   - "Not Contacted": the lead opted out of at least one channel
#     (Do Not Call == 1 OR Do Not Email == 1).
#   - "Contacted": both channels remain open
#     (Do Not Call == 0 AND Do Not Email == 0).
# Previously both groups were built with OR, so a lead that opted out of only
# one channel was counted in both groups and the comparison was skewed.
opted_out_of_any_channel = (df['Do Not Call'] == 1) | (df['Do Not Email'] == 1)
open_to_all_channels = (df['Do Not Call'] == 0) & (df['Do Not Email'] == 0)

contacted_conversion = df[open_to_all_channels]
not_contacted_conversion = df[opted_out_of_any_channel]

# "Converted" is averaged per group to obtain each group's conversion rate.
contacted_conversion_rate = contacted_conversion["Converted"].mean()
not_contacted_conversion_rate = not_contacted_conversion["Converted"].mean()

conversion_data = {
    "Group": ["Contacted", "Not Contacted"],
    "Conversion Rate": [contacted_conversion_rate, not_contacted_conversion_rate]
}

conversion_df = pd.DataFrame(conversion_data)

# Exactly one figure is opened; an earlier extra plt.figure() call produced an
# empty figure in the cell output.
plt.figure(figsize=(8, 6))
sns.barplot(x="Group", y="Conversion Rate", data=conversion_df)
plt.title("Conversion Rate: Contacted vs Not Contacted")
plt.ylabel("Conversion Rate")
plt.xlabel("Group")
plt.show()

# Hypothesis 4: potential leads have a higher conversion rate.
plt.figure(figsize=(6, 6))

# Boolean helper column: True where the lead profile is 'Potential Lead'.
df['Leads'] = df['Lead Profile'] == 'Potential Lead'

# Mean of "Converted" per flag value, computed once and reused for both axes.
lead_conversion = df.groupby("Leads")["Converted"].mean()
sns.barplot(x=lead_conversion.index, y=lead_conversion.values)
plt.title("Conversion Rate by Lead Profile")
plt.ylabel("Conversion Rate")
plt.xlabel("Lead Profile")
plt.show()

# Hypothesis 5: users who visited the website often (more than VISIT_THRESHOLD
# visits) have a higher conversion rate.
VISIT_THRESHOLD = 10

plt.figure(figsize=(6, 6))
df['Visited Website'] = df['TotalVisits'] > VISIT_THRESHOLD

# conversion_rates holds fractions in [0, 1]; the chart uses percentages so the
# plotted values agree with the "(%)" axis label (previously the axis was
# labelled in percent but showed raw fractions).
conversion_rates = df.groupby("Visited Website")["Converted"].mean()
conversion_pct = conversion_rates * 100

sns.barplot(
    x=conversion_rates.index.astype(str),
    y=conversion_pct.values,
    palette="viridis"  # Use a visually appealing color palette
)
plt.title("Conversion Rate by Website Visits", fontsize=16, fontweight='bold')
plt.ylabel("Conversion Rate (%)", fontsize=12)
plt.xlabel(f"Visited Website (More than {VISIT_THRESHOLD} Visits)", fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Annotate each bar with its value, placed one percentage point above the bar top.
for i, value in enumerate(conversion_pct.values):
    plt.text(i, value + 1, f"{value:.2f}%", ha='center', fontsize=10, color='black')
plt.show()