In [26]:
import pandas as pd
import numpy as np
from scipy import stats

In [27]:
# Load data from a CSV file
df = pd.read_csv('data.csv')

In [51]:
# Print column names to check for differences
print("Column Names in the DataFrame:")
print(df.columns)

Column Names in the DataFrame:
Index(['User ID', 'country', 'Gender', 'Age', 'salary', 'Purchased'], dtype='object')


In [28]:
# Inspect the data
print("Original DataFrame:")
print(df.head())
print("\nDataFrame Info:")
print(df.info())
print("\nStatistical Summary:")
print(df.describe())

Original DataFrame:
    User ID country  Gender   Age   salary  Purchased
0  15624510   India    Male  19.0  19000.0          0
1  15810944     USA    Male  35.0      NaN          1
2  15668575  France  Female  26.0  43000.0          0
3  15603246     USA  Female   NaN  57000.0          0
4  15804002  France    Male  19.0  76000.0          0

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   User ID    20 non-null     int64  
 1   country    19 non-null     object 
 2   Gender     20 non-null     object 
 3   Age        17 non-null     float64
 4   salary     19 non-null     float64
 5   Purchased  20 non-null     int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 1.1+ KB
None

Statistical Summary:
            User ID        Age         salary  Purchased
count  2.000000e+01  17.000000      19.000000  20.000000
mean   1.56788

In [29]:
# Handling Missing Values
print("\nMissing Values in Each Column:")
print(df.isnull().sum())


Missing Values in Each Column:
User ID      0
country      1
Gender       0
Age          3
salary       1
Purchased    0
dtype: int64


In [31]:
# Drop rows with any missing values (optional)
# df_cleaned = df.dropna()

# Fill missing values with a specific value (e.g., 0)
df.fillna(value=0, inplace=True)

# Fill missing values with the mean of the column for a specific column
# df['column_name'].fillna(df['column_name'].mean(), inplace=True)

In [32]:
# Removing Duplicates
print("\nNumber of Duplicate Rows:", df.duplicated().sum())
df_cleaned = df.drop_duplicates()


Number of Duplicate Rows: 0


In [59]:
# Fill missing values in 'salary' and 'Age' columns with their mean values
df['salary'] = df['salary'].fillna(df['salary'].mean())
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [49]:
# Check for duplicates and remove them
print("\nNumber of Duplicate Rows:", df.duplicated().sum())
df_cleaned = df.drop_duplicates()


Number of Duplicate Rows: 0


In [52]:
# Standardizing Text by Converting 'Country' and 'Gender' columns to lowercase and strip whitespace
df_cleaned['country'] = df_cleaned['country'].str.lower().str.strip()
df_cleaned['Gender'] = df_cleaned['Gender'].str.lower().str.strip()

In [53]:
# Bin the 'Age' column into categories for better readability
age_bins = [0, 25, 40, 60]
age_labels = ['Young', 'Middle-aged', 'Older']
df_cleaned['Age_Group'] = pd.cut(df_cleaned['Age'], bins=age_bins, labels=age_labels)

In [54]:
#Outlier Detection and Removal in 'salary'
# Remove rows where salary is an outlier (using Z-score threshold of 3)
df_cleaned = df_cleaned[(np.abs(stats.zscore(df_cleaned['salary'])) < 3)]

In [57]:
# Display information of Cleaned DataFrame
print("\nCleaned DataFrame Info:")
df_cleaned.info()
print("\nCleaned DataFrame:")
df_cleaned.head()


Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   User ID    20 non-null     int64   
 1   country    19 non-null     object  
 2   Gender     20 non-null     object  
 3   Age        20 non-null     float64 
 4   salary     20 non-null     float64 
 5   Purchased  20 non-null     int64   
 6   Age_Group  17 non-null     category
dtypes: category(1), float64(2), int64(2), object(2)
memory usage: 1.2+ KB

Cleaned DataFrame:


Unnamed: 0,User ID,country,Gender,Age,salary,Purchased,Age_Group
0,15624510,india,male,19.0,19000.0,0,Young
1,15810944,usa,male,35.0,0.0,1,Middle-aged
2,15668575,france,female,26.0,43000.0,0,Middle-aged
3,15603246,usa,female,0.0,57000.0,0,
4,15804002,france,male,19.0,76000.0,0,Young


In [58]:
# Save the cleaned DataFrame to a new CSV file
df_cleaned.to_csv('cleaned_data.csv', index=False)
print("Cleaned data saved to 'cleaned_data.csv'")

Cleaned data saved to 'cleaned_data.csv'
