In [94]:
import pandas as pd

def validate_and_clean_data(df):
    # check for missing values
    missing_values = (df.isnull().sum()/len(df))*100
    print(missing_values)
    
    # drop rows with missing values
    df = df.dropna()
    
    # check for duplicate rows
    duplicate_rows = df.duplicated().sum()
    print(duplicate_rows)
    
    # drop duplicate rows
    df = df.drop_duplicates()
    
    # check for outliers
    for col in df.select_dtypes(include=['float']).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
        
    # check for invalid data types
    invalid_data_types = df.dtypes[df.dtypes == 'object']
    print(invalid_data_types)
    
    # convert invalid data types to appropriate data types
    for col in invalid_data_types.index:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # return cleaned dataframe
    return df.info()


In [80]:
df = pd.read_csv('Leads.csv')
df1 = pd.read_csv('Leads.csv')

In [95]:
validate_and_clean_data(df)

Prospect ID                                       0.000000
Lead Number                                       0.000000
Lead Origin                                       0.000000
Lead Source                                       0.389610
Do Not Email                                      0.000000
Do Not Call                                       0.000000
Converted                                         0.000000
TotalVisits                                       1.482684
Total Time Spent on Website                       0.000000
Page Views Per Visit                              1.482684
Last Activity                                     1.114719
Country                                          26.634199
Specialization                                   15.562771
How did you hear about X Education               23.885281
What is your current occupation                  29.112554
What matters most to you in choosing a course    29.318182
Search                                            0.0000

In [82]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [83]:
validate_and_clean_data(df)

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
2,,660727,,,,,1,2.0,1532,2.00,...,,,,,,14.0,20.0,,,
3,,660719,,,,,0,1.0,305,1.00,...,,,,,,13.0,17.0,,,
4,,660681,,,,,1,2.0,1428,1.00,...,,,,,,15.0,18.0,,,
6,,660673,,,,,1,2.0,1640,2.00,...,,,,,,14.0,20.0,,,
15,,660547,,,,,1,6.0,1012,6.00,...,,,,,,14.0,15.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9234,,579615,,,,,1,5.0,210,2.50,...,,,,,,14.0,20.0,,,
9235,,579564,,,,,1,8.0,1845,2.67,...,,,,,,15.0,17.0,,,
9236,,579546,,,,,0,2.0,238,2.00,...,,,,,,14.0,19.0,,,
9237,,579545,,,,,0,2.0,199,2.00,...,,,,,,13.0,20.0,,,


In [93]:
df1.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [16]:
df.duplicated().sum()/len(df)

0.0