DATA PREPROCESSING TECHNIQUES

In [10]:
import pandas as pd

# i. Load .csv file into the DataFrame
df = pd.read_csv("housing.csv")

# ii. Display information of all columns
print("Information of all columns:")
print(df.info())
print("\n")

# iii. Display statistical information of all numerical columns
print("Statistical information of all numerical columns:")
print(df.describe())
print("\n")

# iv. Display the count of unique labels for the 'Ocean Proximity' column
print("Count of unique labels for 'Ocean Proximity' column:")
print(df['ocean_proximity'].value_counts())
print("\n")

# v. Display which attributes (columns) have missing values count greater than zero
print("Columns with missing values count greater than zero:")
print(df.isnull().sum()[df.isnull().sum() > 0])


Information of all columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  int64  
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB
None


Statistical information of all numerical columns:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.5

DIABETES DATSET ANALYSIS

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load Diabetes Dataset
diabetes_df = pd.read_csv("Dataset_of_Diabetes.csv")
print("Diabetes Dataset Loaded")
print(diabetes_df.head())

# -------------------- Data Preprocessing for Diabetes Dataset --------------------
# 1. Handling Missing Values in Diabetes Dataset
# Impute missing values for numerical columns with mean and categorical with most frequent value
numerical_cols = diabetes_df.select_dtypes(include=[np.number]).columns
categorical_cols = diabetes_df.select_dtypes(include=['object']).columns

# Impute missing values for numerical columns (mean strategy)
num_imputer = SimpleImputer(strategy='mean')
diabetes_df[numerical_cols] = num_imputer.fit_transform(diabetes_df[numerical_cols])

# Impute missing values for categorical columns (most frequent strategy)
cat_imputer = SimpleImputer(strategy='most_frequent')
diabetes_df[categorical_cols] = cat_imputer.fit_transform(diabetes_df[categorical_cols])

# 2. Handling Categorical Data (Label Encoding for categorical columns like 'Gender')
label_encoder = LabelEncoder()
for col in categorical_cols:
    diabetes_df[col] = label_encoder.fit_transform(diabetes_df[col])

# 3. Handling Outliers in Diabetes Dataset
# Removing outliers based on Z-score (values beyond 3 standard deviations)
from scipy import stats
z_scores = np.abs(stats.zscore(diabetes_df[numerical_cols]))
outliers = (z_scores > 3).all(axis=1)
diabetes_df_cleaned = diabetes_df[~outliers]

# 4. Data Transformation (Min-Max Scaling and Standardization)
# Min-Max Scaling (Normalization)
min_max_scaler = MinMaxScaler()
diabetes_df_scaled = pd.DataFrame(min_max_scaler.fit_transform(diabetes_df_cleaned), columns=diabetes_df_cleaned.columns)

# Standard Scaling (Z-score Normalization)
standard_scaler = StandardScaler()
diabetes_df_standardized = pd.DataFrame(standard_scaler.fit_transform(diabetes_df_cleaned), columns=diabetes_df_cleaned.columns)

# Display results
print("\nProcessed Diabetes Data (after preprocessing):")
print(diabetes_df_standardized.head())


Diabetes Dataset Loaded
    ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  

Processed Diabetes Data (after preprocessing):
         ID  No_Pation    Gender       AGE      Urea        Cr     HbA1c  \
0  0.672140  -0.074747 -1.139688 -0.401144 -0.144781 -0.382672 -1.334983   
1  1.641852  -0.069940  0.870343 -3.130017 -0.212954 -0.115804 -1.334983   
2  0.330868  -0.065869 -1.139688 -0.401144 -0.144781 -0.382672 -1.334983   
3  1.412950  -0.054126 -1.139688 -0.401144 -0.144781 -0.382672 -1

ADULT INCOME DATASET ANALYSIS

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from scipy import stats

# Load Adult Income Dataset
adult_df = pd.read_csv("adult.csv")
print("Adult Income Dataset Loaded")
print(adult_df.head())

# -------------------- Data Preprocessing for Adult Income Dataset --------------------
# 1. Handling Missing Values in Adult Income Dataset
# Impute missing values for numerical columns with mean and categorical with most frequent value
numerical_cols_adult = adult_df.select_dtypes(include=[np.number]).columns
categorical_cols_adult = adult_df.select_dtypes(include=['object']).columns

# Impute missing values for numerical columns (mean strategy)
num_imputer_adult = SimpleImputer(strategy='mean')
adult_df[numerical_cols_adult] = num_imputer_adult.fit_transform(adult_df[numerical_cols_adult])

# Impute missing values for categorical columns (most frequent strategy)
cat_imputer_adult = SimpleImputer(strategy='most_frequent')
adult_df[categorical_cols_adult] = cat_imputer_adult.fit_transform(adult_df[categorical_cols_adult])

# 2. Handling Categorical Data (Label Encoding for categorical columns)
label_encoder = LabelEncoder()
for col in categorical_cols_adult:
    adult_df[col] = label_encoder.fit_transform(adult_df[col])

# 3. Handling Outliers in Adult Income Dataset
# Removing outliers based on Z-score (values beyond 3 standard deviations)
z_scores_adult = np.abs(stats.zscore(adult_df[numerical_cols_adult]))
outliers_adult = (z_scores_adult > 3).all(axis=1)
adult_df_cleaned = adult_df[~outliers_adult]

# -------------------- Data Transformation (Normalization and Scaling) --------------------
# 1. Min-Max Scaling (Normalization)
min_max_scaler_adult = MinMaxScaler()
adult_df_scaled = pd.DataFrame(min_max_scaler_adult.fit_transform(adult_df_cleaned), columns=adult_df_cleaned.columns)

# 2. Standard Scaling (Z-Score Normalization)
standard_scaler_adult = StandardScaler()
adult_df_standardized = pd.DataFrame(standard_scaler_adult.fit_transform(adult_df_cleaned), columns=adult_df_cleaned.columns)

# Display results
print("\nProcessed Adult Income Data (after preprocessing):")
print(adult_df_standardized.head())


Adult Income Dataset Loaded
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-p