In [1]:
import pandas as pd
import numpy as np

## Loading Data

In [11]:
# Location of the Social_Network_Ads.csv file that the whole notebook works on.
DATA_URL = "https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/refs/heads/main/day24-standardization/Social_Network_Ads.csv"

# Read the remote CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_URL)
# Show the first five rows as a quick sanity check of the load.
df.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


## Data Exploration

In [12]:
# Print a structural summary of the frame: row count, column names,
# non-null counts per column, dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [13]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the object-typed Gender column is left out by default.
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


## Handling Missing Values

In [14]:
# Count missing entries per column (isna is the canonical name for isnull).
df.isna().sum(axis=0)

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [15]:
# Drop every row that contains at least one missing value.
# The result is only displayed, not assigned, so `df` itself is unchanged.
df.dropna(axis="index", how="any")

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [16]:
# Replace any missing salary with the column mean.
# The filled Series is only displayed; `df` is not modified.
salary = df['EstimatedSalary']
salary.fillna(value=salary.mean())

0      19000
1      20000
2      43000
3      57000
4      76000
       ...  
395    41000
396    23000
397    20000
398    33000
399    36000
Name: EstimatedSalary, Length: 400, dtype: int64

## Removing Duplicates

In [17]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence. The result is only displayed; `df` is not modified.
df.drop_duplicates(subset=None, keep="first")

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


## Data Type Conversion

In [18]:
# View the 0/1 Purchased flag as a boolean Series (0 -> False, 1 -> True).
# The converted Series is only displayed; the column in `df` stays int64.
purchased = df['Purchased']
purchased.astype(bool)

0      False
1      False
2      False
3      False
4      False
       ...  
395     True
396     True
397     True
398    False
399     True
Name: Purchased, Length: 400, dtype: bool

## Filtering

In [35]:
# Select male users whose estimated salary is above the overall mean salary.
is_male = df['Gender'].eq("Male")
above_mean_salary = df['EstimatedSalary'].gt(df['EstimatedSalary'].mean())
df.loc[is_male & above_mean_salary]

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
4,15804002,Male,19,76000,0
12,15746139,Male,20,86000,0
14,15628972,Male,18,82000,0
15,15697686,Male,29,80000,0
30,15581198,Male,31,74000,0
...,...,...,...,...,...
368,15779744,Male,38,71000,0
371,15774744,Male,60,83000,1
373,15708791,Male,59,130000,1
378,15577806,Male,41,87000,1


## Subsetting

In [36]:
# Keep all rows but only the Gender and Age columns.
subset_columns = ['Gender', 'Age']
df.loc[:, subset_columns]

Unnamed: 0,Gender,Age
0,Male,19
1,Male,35
2,Female,26
3,Female,27
4,Male,19
...,...,...
395,Female,46
396,Male,51
397,Female,50
398,Male,36


## Groupby

In [39]:
# Mean of every remaining column per gender, ordered by ascending mean age.
# NOTE(review): this also averages User ID, which is an identifier, so that
# column of the result carries no meaning.
gender_means = df.groupby(by='Gender').mean()
gender_means.sort_values('Age', ascending=True)

Unnamed: 0_level_0,User ID,Age,EstimatedSalary,Purchased
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,15689700.0,36.867347,67642.857143,0.336735
Female,15693310.0,38.411765,71759.803922,0.377451


## Merge

In [42]:
# Left-join the frame with itself on User ID; overlapping column names get
# the default _x (left side) and _y (right side) suffixes.
df.merge(right=df, how='left', on='User ID')

Unnamed: 0,User ID,Gender_x,Age_x,EstimatedSalary_x,Purchased_x,Gender_y,Age_y,EstimatedSalary_y,Purchased_y
0,15624510,Male,19,19000,0,Male,19,19000,0
1,15810944,Male,35,20000,0,Male,35,20000,0
2,15668575,Female,26,43000,0,Female,26,43000,0
3,15603246,Female,27,57000,0,Female,27,57000,0
4,15804002,Male,19,76000,0,Male,19,76000,0
...,...,...,...,...,...,...,...,...,...
395,15691863,Female,46,41000,1,Female,46,41000,1
396,15706071,Male,51,23000,1,Male,51,23000,1
397,15654296,Female,50,20000,1,Female,50,20000,1
398,15755018,Male,36,33000,0,Male,36,33000,0


## Applying Functions

In [43]:
# List the column labels available in the frame.
df.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [45]:
# Double every salary with a vectorized multiplication. This yields the same
# int64 Series as `.apply(lambda x: x*2)` but avoids one Python-level call per
# element; keep `.apply` for logic that has no vectorized equivalent.
# The doubled Series is only displayed; `df` is not modified.
df['EstimatedSalary'] * 2

0       38000
1       40000
2       86000
3      114000
4      152000
        ...  
395     82000
396     46000
397     40000
398     66000
399     72000
Name: EstimatedSalary, Length: 400, dtype: int64

## String Manipulation

In [47]:
# Upper-case every Gender label via the vectorized .str accessor.
# The result is only displayed; the column in `df` keeps its original casing.
df['Gender'].str.upper()

0        MALE
1        MALE
2      FEMALE
3      FEMALE
4        MALE
        ...  
395    FEMALE
396      MALE
397    FEMALE
398      MALE
399    FEMALE
Name: Gender, Length: 400, dtype: object