# Library for analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation/syntax warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Importing Dataset

In [None]:
# Read the lead export (a CSV in the working directory) into a DataFrame.
ld = pd.read_csv("lead_dataset.csv")
# Bare last expression so the notebook renders a truncated preview of the frame.
ld

Unnamed: 0,Lead_id,Location,College Name,Year of Study,Program Interested,Source
0,LD1,Jaipur,Jadavpur University,3rd,Cloud Computing,LinkedIn
1,LD2,Hyderabad,Anna University,4th,AI,Whatsapp
2,LD3,Kolkata,Savitribai Phule Pune University,4th,Robotics,LinkedIn
3,LD4,Pune,Savitribai Phule Pune University,2nd,AI,Instagram
4,LD5,Pune,Nirma University,4th,Blockchain,Whatsapp
...,...,...,...,...,...,...
9995,LD9996,Ahmedabad,Manipal University,3rd,Cybersecurity,LinkedIn
9996,LD9997,Jaipur,IIT Bombay,1st,Cloud Computing,LinkedIn
9997,LD9998,Chennai,Savitribai Phule Pune University,4th,Robotics,College Collaboration
9998,LD9999,Mumbai,Savitribai Phule Pune University,3rd,IoT,LinkedIn


In [None]:
# Check the column names available in the dataset
ld.columns

Index(['Lead_id', 'Location', 'College Name', 'Year of Study',
       'Program Interested', 'Source'],
      dtype='object')

In [None]:
# Distinct values of the Location column, in order of first appearance.
ld["Location"].unique()

array(['Jaipur', 'Hyderabad', 'Kolkata', 'Pune', 'Delhi', 'Ahmedabad',
       'Lucknow', 'Mumbai', 'Chennai', 'Bangalore'], dtype=object)

In [None]:
# Distinct values of the College Name column, in order of first appearance.
ld["College Name"].unique()

array(['Jadavpur University', 'Anna University',
       'Savitribai Phule Pune University', 'Nirma University',
       'Amity University', 'Delhi University', 'Osmania University',
       'Christ University', 'Manipal University', 'IIT Bombay'],
      dtype=object)

In [None]:
# Distinct values of the Year of Study column, in order of first appearance.
ld["Year of Study"].unique()

array(['3rd', '4th', '2nd', '1st'], dtype=object)

In [None]:
# Distinct values of the Program Interested column, in order of first appearance.
ld["Program Interested"].unique()

array(['Cloud Computing', 'AI', 'Robotics', 'Blockchain', 'Data Science',
       'Business Analytics', 'Cybersecurity', 'Electric Vehicle',
       'Biotechnology', 'IoT'], dtype=object)

In [None]:
# Distinct values of the Source column, in order of first appearance.
ld["Source"].unique()

array(['LinkedIn', 'Whatsapp', 'Instagram', 'Mass-Mailing', 'Google Form',
       'College Collaboration'], dtype=object)

In [None]:
# Preview the first five rows (5 is also the default row count).
ld.head(5)

Unnamed: 0,Lead_id,Location,College Name,Year of Study,Program Interested,Source
0,LD1,Jaipur,Jadavpur University,3rd,Cloud Computing,LinkedIn
1,LD2,Hyderabad,Anna University,4th,AI,Whatsapp
2,LD3,Kolkata,Savitribai Phule Pune University,4th,Robotics,LinkedIn
3,LD4,Pune,Savitribai Phule Pune University,2nd,AI,Instagram
4,LD5,Pune,Nirma University,4th,Blockchain,Whatsapp


In [None]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
ld.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Lead_id             10000 non-null  object
 1   Location            10000 non-null  object
 2   College Name        10000 non-null  object
 3   Year of Study       10000 non-null  object
 4   Program Interested  10000 non-null  object
 5   Source              10000 non-null  object
dtypes: object(6)
memory usage: 468.9+ KB


In [None]:
# Summary statistics per column. At this point every column is object dtype
# (see the ld.info() output), so the summary reports count / unique / top / freq.
ld.describe()

Unnamed: 0,Lead_id,Location,College Name,Year of Study,Program Interested,Source
count,10000,10000,10000,10000,10000,10000
unique,10000,10,10,4,10,6
top,LD1,Kolkata,Manipal University,1st,Data Science,College Collaboration
freq,1,1040,1054,2551,1033,1704


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
# Every column reports 0, so the dataset contains no null values.
ld.isna().sum()

Unnamed: 0,0
Lead_id,0
Location,0
College Name,0
Year of Study,0
Program Interested,0
Source,0


In [None]:
# Convert Lead_id to a numeric column: every id looks like 'LD<number>', so the
# 'LD' prefix is dropped and the digits are kept as an integer.
#
# - The pattern is a raw string so that \d reaches the regex engine unchanged
#   (a plain '\d' is an invalid string escape).
# - expand=False makes str.extract return a Series rather than a one-column DataFrame.
# - The dtype guard keeps the cell idempotent: on a re-run the column is already
#   numeric and the .str accessor would otherwise raise.
if not pd.api.types.is_numeric_dtype(ld['Lead_id']):
    ld['Lead_id'] = ld['Lead_id'].str.extract(r'(\d+)', expand=False).astype(int)

# Demographic Analysis:

*  Analyze the data to identify trends in lead sourcing based on location, college, and
year of study.

* Highlight any significant patterns or anomalies

In [None]:
# Location: number of leads for every (Location, Source) pair.
location_source_trends = (
    ld.groupby(['Location', 'Source'])
    .size()
    .reset_index(name='Count')
)
# Keep the five pairs with the highest lead counts.
top_location_source = (
    location_source_trends
    .sort_values(by="Count", ascending=False)
    .head(5)
)
print(top_location_source)

   Location                 Source  Count
47  Lucknow               Whatsapp    193
39  Kolkata               LinkedIn    187
37  Kolkata            Google Form    185
33   Jaipur               LinkedIn    183
42  Lucknow  College Collaboration    181


In [None]:
# College: number of leads for every (College Name, Source) pair.
college_source_trends = (
    ld.groupby(['College Name', 'Source'])
    .size()
    .reset_index(name='Count')
)
# Keep the five pairs with the highest lead counts.
top_college_source = (
    college_source_trends
    .sort_values(by="Count", ascending=False)
    .head(5)
)
print(top_college_source)

                        College Name                 Source  Count
33               Jadavpur University               LinkedIn    203
36                Manipal University  College Collaboration    200
30               Jadavpur University  College Collaboration    189
52                Osmania University           Mass-Mailing    188
55  Savitribai Phule Pune University            Google Form    185


In [None]:
# Year of study: number of leads for every (Year of Study, Source) pair.
year_source_trends = (
    ld.groupby(['Year of Study', 'Source'])
    .size()
    .reset_index(name='Count')
)
# Keep the five pairs with the highest lead counts.
top_year_source = (
    year_source_trends
    .sort_values(by="Count", ascending=False)
    .head(5)
)
print(top_year_source)

   Year of Study                 Source  Count
3            1st               LinkedIn    453
8            2nd              Instagram    446
1            1st            Google Form    440
18           4th  College Collaboration    440
19           4th            Google Form    432




### **1. Location and Source Trends:**

- **Patterns:**
  
 *  **Lucknow** has strong leads from **Whatsapp** (193) and **College Collaboration** (181), suggesting diversified outreach.
  
 *  **Kolkata** shows near-equal engagement from **LinkedIn** (187) and **Google Form** (185), indicating balanced channel effectiveness.

- **Anomalies:**
  * Unexpectedly high counts from sources like **Whatsapp** or **College Collaboration** in Lucknow might suggest targeted campaigns or data inconsistencies.



### **2. College and Source Trends:**

- **Patterns:**
  - **Jadavpur University** sees significant results from **LinkedIn** (203), with **College Collaboration** (189) being secondary.
  - **Manipal University** is effectively reached through **College Collaboration** (200), indicating a focused outreach strategy.

- **Anomalies:**
  - **Savitribai Phule Pune University**’s reliance on **Google Form** (185) might indicate an unusual trend, as other colleges show more conventional sources like **LinkedIn**.



### **3. Year of Study and Source Trends:**

- **Patterns:**
  - **1st Year** students are mostly reached through **LinkedIn** (453) and **Google Form** (440), highlighting these platforms’ effectiveness.
  - **2nd Year** students show strong engagement with **Instagram** (446), indicating its appeal for this group.
  - **4th Year** students engage evenly with **Google Form** (432) and **College Collaboration** (440), pointing to targeted, formal outreach.

- **Anomalies:**
  - Unexpected sources, like **Mass-Mailing** for **4th Year**, would indicate a significant anomaly in lead generation or reporting.





# **`Program Analysis:`**

○ Compare the interest levels for various e-learning programs.

○ Provide a recommendation on which demographics should be targeted more
aggressively for each program.

In [None]:
# Leads per program, most popular first (value_counts sorts descending by default).
program_column = 'Program Interested'
program_interest = ld[program_column].value_counts()
program_interest

Unnamed: 0_level_0,count
Program Interested,Unnamed: 1_level_1
Data Science,1033
Business Analytics,1027
IoT,1020
Robotics,1010
Cybersecurity,996
Cloud Computing,992
AI,991
Blockchain,984
Electric Vehicle,974
Biotechnology,973


In [None]:
# Lead counts for each (program, city) combination.
location_interest = (
    ld.groupby(['Program Interested', 'Location'])
    .size()
)
location_interest

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Program Interested,Location,Unnamed: 2_level_1
AI,Ahmedabad,105
AI,Bangalore,95
AI,Chennai,95
AI,Delhi,104
AI,Hyderabad,92
...,...,...
Robotics,Jaipur,101
Robotics,Kolkata,113
Robotics,Lucknow,88
Robotics,Mumbai,100


In [None]:
# Lead counts for each (program, year of study) combination.
year_interest = (
    ld.groupby(['Program Interested', 'Year of Study'])
    .size()
)
year_interest

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Program Interested,Year of Study,Unnamed: 2_level_1
AI,1st,249
AI,2nd,238
AI,3rd,256
AI,4th,248
Biotechnology,1st,244
Biotechnology,2nd,230
Biotechnology,3rd,251
Biotechnology,4th,248
Blockchain,1st,252
Blockchain,2nd,252


In [None]:
# Lead counts for each (program, lead source) combination.
source_interest = (
    ld.groupby(['Program Interested', 'Source'])
    .size()
)
source_interest

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Program Interested,Source,Unnamed: 2_level_1
AI,College Collaboration,165
AI,Google Form,136
AI,Instagram,174
AI,LinkedIn,167
AI,Mass-Mailing,169
AI,Whatsapp,180
Biotechnology,College Collaboration,177
Biotechnology,Google Form,164
Biotechnology,Instagram,163
Biotechnology,LinkedIn,168


# **Interest Levels for Programs**
**Top Programs:**
*  Data Science (1033 leads)
*  Business Analytics (1027 leads)
*  IoT (1020 leads)

**Least Popular Programs:**
*  Biotechnology (973 leads)
*  Electric Vehicle (974 leads)
*  Blockchain (984 leads)

### **Demographic Breakdown**



**Location**


*  AI: High interest from Ahmedabad, Delhi, and Bangalore.

*  Data Science: High interest from Mumbai, Hyderabad, and Chennai.

*  IoT: Popular in Kolkata, Pune, and Jaipur.

**Year of Study:**

*  Programs like Data Science, Business Analytics, and AI have relatively even interest across all years.

*  Electric Vehicle and Biotechnology show higher interest among 4th-year students (for Biotechnology, 3rd-year interest is slightly higher still: 251 vs. 248 leads).

**Source of Leads:**

*  WhatsApp and LinkedIn are prominent sources for most programs.

*  College Collaboration is particularly effective for programs like Business
Analytics and Blockchain.

*  Instagram attracts higher leads for creative or tech-focused programs like Robotics and Cybersecurity.

## **Recommendations for Targeting Demographics**

**AI, Data Science, and Business Analytics:**

*  **Target Locations**: Ahmedabad, Bangalore, Delhi, and Mumbai.

*  **Year of Study:** Focus evenly on 3rd and 4th-year students.

*  **Lead Sources:** Emphasize LinkedIn, WhatsApp, and College Collaborations.

**IoT and Robotics:**

*  **Target Locations:** Pune, Kolkata, and Jaipur.

*  **Year of Study:** Prioritize 2nd and 3rd-year students.

*  **Lead Sources:** Leverage Instagram and Google Forms.

**Electric Vehicle and Biotechnology:**

*  **Target Locations:** Focus on emerging cities like Lucknow and Pune.

*  **Year of Study:** Aggressively target 4th-year students.

*  **Lead Sources:** Use Mass-Mailing and LinkedIn campaigns.

**Blockchain, Cybersecurity, and Cloud Computing:**

*  **Target Locations:** Chennai, Hyderabad, and Jaipur.

*  **Year of Study:** Balance outreach across all years.

*  **Lead Sources:** Focus on College Collaborations and Google Forms.

# **Projections and Predictions:**

*  Based on the generated data, make a data-driven projection about lead conversion
rates and suggest a budget allocation strategy for marketing and targeting.


In [None]:
# Assumed (not measured) conversion rate per lead source, expressed in percent.
conversion_rates = {
    "College Collaboration": 20,
    "LinkedIn": 18,
    "Whatsapp": 15,
    "Instagram": 10,
    "Google Form": 12,
    "Mass-Mailing": 8,
}

In [None]:
# Per-lead expected conversion: each lead gets its source's assumed conversion
# rate as a fraction (percent / 100). Summing this column over any group of
# leads gives that group's projected number of conversions.
_source_rate_pct = ld['Source'].map(conversion_rates)

# Series.map yields NaN for a source that has no entry in conversion_rates, and
# the later .sum() calls skip NaN, which would silently understate projections.
# Fail loudly instead so a new or misspelled source is noticed.
_unmapped_sources = sorted(
    ld.loc[_source_rate_pct.isna() & ld['Source'].notna(), 'Source'].unique()
)
if _unmapped_sources:
    raise ValueError(
        f"No assumed conversion rate for source(s): {_unmapped_sources}"
    )

ld['Projected Conversions'] = _source_rate_pct / 100

In [None]:
# Projected conversions for every (program, source) pair: the sum of the
# per-lead conversion fractions within each group.
conversion_projection = (
    ld.groupby(['Program Interested', 'Source'])['Projected Conversions']
    .sum()
)
print(conversion_projection)

Program Interested  Source               
AI                  College Collaboration    33.00
                    Google Form              16.32
                    Instagram                17.40
                    LinkedIn                 30.06
                    Mass-Mailing             13.52
                    Whatsapp                 27.00
Biotechnology       College Collaboration    35.40
                    Google Form              19.68
                    Instagram                16.30
                    LinkedIn                 30.24
                    Mass-Mailing             11.76
                    Whatsapp                 23.10
Blockchain          College Collaboration    37.40
                    Google Form              19.92
                    Instagram                15.40
                    LinkedIn                 28.62
                    Mass-Mailing             12.16
                    Whatsapp                 24.90
Business Analytics  College Collaboratio

In [None]:
# Collapse the source level: total projected conversions per program.
# 'Program Interested' is the first (level 0) index level of conversion_projection.
program_conversion_totals = (
    conversion_projection
    .groupby(level='Program Interested')
    .sum()
)
print(program_conversion_totals)

Program Interested
AI                    137.30
Biotechnology         136.48
Blockchain            138.40
Business Analytics    144.58
Cloud Computing       138.79
Cybersecurity         137.70
Data Science          142.48
Electric Vehicle      131.83
IoT                   142.52
Robotics              137.39
Name: Projected Conversions, dtype: float64


In [None]:
# Collapse the program level: total projected conversions per lead source,
# which is the basis for the budget strategy.
# 'Source' is the second (level 1) index level of conversion_projection.
source_conversion_totals = (
    conversion_projection
    .groupby(level='Source')
    .sum()
)
print(source_conversion_totals)

Source
College Collaboration    340.80
Google Form              202.08
Instagram                166.10
LinkedIn                 301.32
Mass-Mailing             130.72
Whatsapp                 246.45
Name: Projected Conversions, dtype: float64


### **Summary of Lead Conversion Projections and Budget Allocation Strategy**

#### **Lead Conversion Projections:**
- **Top-Performing Programs:**
  - **Business Analytics**: 144.58 conversions
  - **IoT**: 142.52 conversions
  - **Data Science**: 142.48 conversions
  - **Strategy**: Prioritize these programs for marketing efforts.

- **Moderate-Performing Programs:**
  - **Cloud Computing**: 138.79 conversions
  - **Blockchain**: 138.40 conversions
  - **Cybersecurity**: 137.70 conversions
  - **Strategy**: Support with targeted campaigns to boost conversions.

- **Lower-Performing Programs:**
  - **Electric Vehicle**: 131.83 conversions
  - **Biotechnology**: 136.48 conversions
  - **AI**: 137.30 conversions
  - **Robotics**: 137.39 conversions
  - **Strategy**: Enhance with specialized campaigns or optimized lead sources.


#### **Lead Source Contribution:**
- **Top Sources:**
  - **College Collaboration**: 340.80 conversions
  - **LinkedIn**: 301.32 conversions
  - **Strategy**: Allocate 40-50% of the budget to these high-conversion sources.

- **Moderate Sources:**
  - **Whatsapp**: 246.45 conversions
  - **Google Form**: 202.08 conversions
  - **Strategy**: Allocate 25-30% to maintain their contribution.

- **Low Sources:**
  - **Instagram**: 166.10 conversions
  - **Mass-Mailing**: 130.72 conversions
  - **Strategy**: Minimize spend (10-15%) or optimize these channels for better cost-efficiency.

#### **Budget Allocation Strategy:**
- **High-Performing Sources (College Collaboration & LinkedIn)**: 40-50% of budget.
- **Moderate Sources (Whatsapp & Google Form)**: 25-30% of budget.
- **Low Sources (Instagram & Mass-Mailing)**: 10-15% of budget, unless cost-effective.
- **Program-Specific Focus**: Prioritize **Business Analytics**, **IoT**, and **Data Science** for tailored campaigns and budget allocation.


#### **Final Recommendation:**
**Concentrate marketing efforts on high-conversion programs and sources. Dynamically allocate resources based on performance data, and continuously adjust the strategy to maximize lead conversions.**

# **Data Preprocessing:**
**○ Document the steps taken to clean and preprocess the data for analysis.**



**Data Cleaning and Preprocessing:**
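1. Loaded necessary libraries and imported the dataset.
2. Checked data info and null values (none were found), ensuring readiness for analysis.
3. Converted `Lead_id` from strings of the form `LD<number>` to integers by stripping the `LD` prefix.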

**Problem 1: Demographic Analysis**
- **Location and Source Trends:**
  
  
- **College and Source Trends:**
  

- **Year of Study and Source Trends:**
  

**Problem 2: Program Analysis**
- **Interest Levels:** Data Science, Business Analytics, and IoT lead in popularity. Biotechnology, Electric Vehicle, and Blockchain are least popular.
- **Demographic Breakdown**


**Problem 3: Projections and Predictions**
- **Lead Conversion Projections:**


- **Lead Source Contribution:**


# **Summary of this project**


**Analysis and Recommendations for Lead Sourcing and Program Engagement**

**1. Demographic and Source Trends Analysis**

The analysis of lead sourcing based on location, college, and year of study reveals important patterns and anomalies that can guide marketing and outreach strategies.

- **Location and Source Trends:**
  - **Patterns:**
    - **Lucknow** has strong leads from WhatsApp and College Collaboration, suggesting a diversified outreach strategy.
    - **Kolkata** sees near-equal engagement from LinkedIn and Google Forms, indicating balanced channel effectiveness.
  - **Anomalies:**
    - The unexpectedly high lead counts from sources like WhatsApp and College Collaboration in Lucknow may indicate targeted campaigns or potential data inconsistencies.

- **College and Source Trends:**
  - **Patterns:**
    - **Jadavpur University** sees significant results from LinkedIn, with College Collaboration being secondary.
    - **Manipal University** is effectively reached via College Collaboration, indicating a focused outreach strategy.
  - **Anomalies:**
    - **Savitribai Phule Pune University**’s reliance on Google Forms stands out as unusual when compared to other colleges, where LinkedIn is more dominant.

- **Year of Study and Source Trends:**
  - **Patterns:**
    - **1st Year** students are primarily reached through LinkedIn and Google Forms.
    - **2nd Year** students engage heavily with Instagram, signaling its appeal to this group.
    - **4th Year** students show more formal engagement through Google Forms and College Collaboration.
  - **Anomalies:**
    - The use of Mass-Mailing for **4th Year** students is an unexpected trend, which might suggest either a reporting anomaly or an unusual lead generation approach.

**2. Program Analysis and Recommendations**

In comparing the interest levels for various e-learning programs, several key insights and recommendations emerge.

- **Interest Levels:**
  - **Top Programs:** Data Science, Business Analytics, and IoT lead in interest, while Biotechnology, Electric Vehicle, and Blockchain show lower engagement levels.
  - **Demographic Breakdown:**
    - **AI** programs see high interest in cities like Ahmedabad, Delhi, and Bangalore, while **Data Science** performs well in Mumbai, Hyderabad, and Chennai.
    - **IoT** garners attention in cities like Kolkata, Pune, and Jaipur.
    - Programs like **Data Science**, **Business Analytics**, and **AI** see relatively even interest across all years, while **Electric Vehicle** and **Biotechnology** show a stronger inclination from **4th Year** students.

- **Lead Sources:**
  - WhatsApp and LinkedIn are prominent across all programs.
  - College Collaboration works particularly well for programs like **Business Analytics** and **Blockchain**.
  - Instagram has higher lead attraction for creative and tech-focused programs such as **Robotics** and **Cybersecurity**.

- **Recommendations for Targeting Demographics:**
  - **AI, Data Science, and Business Analytics:** Target locations such as Ahmedabad, Delhi, and Mumbai, and focus outreach efforts on **3rd and 4th Year** students. Emphasize LinkedIn, WhatsApp, and College Collaborations for lead sourcing.
  - **IoT and Robotics:** Focus on **2nd and 3rd Year** students, especially in locations like Pune, Kolkata, and Jaipur. Instagram and Google Forms should be leveraged for sourcing leads.
  - **Electric Vehicle and Biotechnology:** Target **4th Year** students, especially in emerging cities like Lucknow and Pune, using Mass-Mailing and LinkedIn.
  - **Blockchain, Cybersecurity, and Cloud Computing:** Target locations like Chennai, Hyderabad, and Jaipur, and balance outreach across all years. Focus on College Collaborations and Google Forms for sourcing.

**3. Lead Conversion Projections and Budget Allocation Strategy**

A data-driven approach to lead conversion projections and marketing budget allocation is critical for maximizing returns.

- **Lead Conversion Projections:**
  - **Top-Performing Programs:** Business Analytics, IoT, and Data Science are projected to have the highest conversions. These programs should be prioritized for marketing.
  - **Moderate-Performing Programs:** Cloud Computing, Blockchain, and Cybersecurity are moderately successful and should be supported with targeted campaigns.
  - **Lower-Performing Programs:** Electric Vehicle and Biotechnology show the lowest projected conversions. Specialized campaigns or optimized lead sources are recommended for these programs.

- **Lead Source Contribution:**
  - **Top Sources:** College Collaboration and LinkedIn contribute the most to conversions. 40-50% of the marketing budget should be allocated here.
  - **Moderate Sources:** WhatsApp and Google Forms are important secondary contributors, together accounting for roughly a third of projected conversions. These sources should receive a moderate budget allocation of 25-30%.
  - **Low Sources:** Instagram and Mass-Mailing generate fewer conversions and should receive 10-15% of the budget, unless cost-effective optimizations are possible.

- **Budget Allocation Strategy:**
  - High-Performing Sources (College Collaboration & LinkedIn): Allocate 40-50% of the budget.
  - Moderate Sources (WhatsApp & Google Forms): Allocate 25-30% of the budget.
  - Low Sources (Instagram & Mass-Mailing): Allocate 10-15%, unless cost-efficient improvements can be made.
  - Program-Specific Focus: Prioritize Business Analytics, IoT, and Data Science for tailored campaigns and budget allocation.

**Final Recommendation:**
To maximize lead conversions, focus marketing efforts on high-performing programs (Business Analytics, IoT, Data Science) and the most effective sources (College Collaboration, LinkedIn). Dynamically allocate resources based on ongoing performance data, adjusting strategies as needed to continually optimize lead generation and conversion efforts.