<a href="https://colab.research.google.com/github/sunitha-18577/Python-Fundamentals/blob/main/EDA2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from scipy.stats import pearsonr


In [None]:
# Load the dataset into `df`; every later cell reads from this frame.
DATA_PATH='/content/adult_with_headers.csv'
try:
  df=pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
  # Fail loudly with the offending path instead of calling exit(): inside a
  # notebook exit() shuts the kernel down, which discards all state and hides
  # the real cause from the reader.
  raise FileNotFoundError(
    f"File not found: {DATA_PATH}. Upload the CSV or update DATA_PATH before re-running."
  ) from err

In [None]:
# Quick overview of the raw frame: descriptive statistics for the numeric
# columns, the per-column count of missing values, and each column's dtype.
print("Summary Statistics:", df.describe(), sep="\n")
print("\nMissing Values:", df.isna().sum(), sep="\n")
print("\nData types:", df.dtypes, sep="\n")

Summary Statistics:
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  

Missing Values:
age               0
workcl

In [None]:
# Impute missing values directly on `df`: numeric columns get the column
# mean, every other column gets its most frequent value.
for col in df.columns:
  if not df[col].isnull().any():
    continue
  # The helper is `is_numeric_dtype` (singular); the plural spelling does not
  # exist in pandas.api.types and raises AttributeError.
  if pd.api.types.is_numeric_dtype(df[col]):
    fill_value=df[col].mean()
  else:
    modes=df[col].mode()
    if modes.empty:
      # mode() ignores missing values, so an all-missing column has no mode
      # to fill with; leave it untouched rather than failing on modes[0].
      continue
    fill_value=modes.iloc[0]
  df[col]=df[col].fillna(fill_value)

In [None]:
# Build two scaled copies of the data: a z-score standardized frame and a
# min-max scaled frame. Only the numeric columns are transformed; all other
# columns are carried over unchanged.
numerical_cols=df.select_dtypes(include=np.number).columns.tolist()
df_standard_scaled=df.copy()
df_minmax_scaled=df.copy()
for col in numerical_cols:
  col_mean=df[col].mean()
  col_std=df[col].std()  # pandas default: sample standard deviation (ddof=1)
  # Standardize as (x - mean) / std. The subtraction has to be parenthesized:
  # division binds tighter, so `x - mean/std` only shifts the column by a
  # small constant and leaves it unscaled.
  # A constant column has std 0 and ends up as NaN here.
  df_standard_scaled[col]=(df[col]-col_mean)/col_std
scaler=MinMaxScaler()
df_minmax_scaled[numerical_cols]=scaler.fit_transform(df_minmax_scaled[numerical_cols])
print("\nStandard scaled data(first 5 rows):")
print(df_standard_scaled.head())
print("\nMin-Max scaled data(first 5 rows):")
print(df_minmax_scaled.head())


Standard scaled data(first 5 rows):
         age          workclass         fnlwgt   education  education_num  \
0  36.171523          State-gov   77514.202005   Bachelors       9.081704   
1  47.171523   Self-emp-not-inc   83309.202005   Bachelors       9.081704   
2  35.171523            Private  215644.202005     HS-grad       5.081704   
3  50.171523            Private  234719.202005        11th       3.081704   
4  25.171523            Private  338407.202005   Bachelors       9.081704   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_

In [None]:
# One-hot encode every object-typed column that has at most five distinct
# values. Columns with more distinct values stay as they are in this cell.
df_encoded=df.copy()
categorical_cols=df.select_dtypes(include='object').columns.tolist()
low_cardinality_cols=[name for name in categorical_cols if len(pd.unique(df[name]))<=5]
for name in low_cardinality_cols:
  # Encode one column per call; its dummy columns are prefixed with the
  # original column name.
  df_encoded=pd.get_dummies(df_encoded,columns=[name],prefix=name)

In [None]:
# Integer-encode the object columns still present in df_encoded that have
# five or more distinct values, then preview the result.
categorical_cols_encoded = df_encoded.select_dtypes(include='object').columns.tolist()
for name in categorical_cols_encoded:
  if len(df_encoded[name].unique())<5:
    continue
  # A fresh encoder per column: each column gets its own value-to-code mapping.
  df_encoded[name]=LabelEncoder().fit_transform(df_encoded[name])
print("\nEncoded data(first 5 rows):",df_encoded.head(),sep="\n")


Encoded data(first 5 rows):
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  capital_gain  capital_loss  ...  native_country  \
0           1             1          2174             0  ...              39   
1           4             0             0             0  ...              39   
2           6             1             0             0  ...              39   
3           6             0             0             0  ...              39   
4          10             5             0             0  ...               5   

   race_ Amer-Indian-Eskimo  race_ Asian-Pac-Islander  race_ Bl

In [None]:
# Derive two polynomial features on a new frame (df_encoded is not modified):
# the squares of `age` and of `hours_per_week`.
df_engineered=df_encoded.assign(
  age_squared=lambda frame: frame['age'].pow(2),
  hours_per_week_squared=lambda frame: frame['hours_per_week'].pow(2),
)
print("\nEngineered data(first 5 rows):",df_engineered.head(),sep="\n")


Engineered data(first 5 rows):
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  capital_gain  capital_loss  ...  \
0           1             1          2174             0  ...   
1           4             0             0             0  ...   
2           6             1             0             0  ...   
3           6             0             0             0  ...   
4          10             5             0             0  ...   

   race_ Asian-Pac-Islander  race_ Black  race_ Other  race_ White  \
0                     False        False        False         True   
1               

In [None]:
# Flag outliers with an Isolation Forest fitted on the numeric columns and
# keep only the rows that were not labelled -1.
iso_forest=IsolationForest(random_state=42,contamination='auto',n_estimators=100)
numeric_features=df_engineered[numerical_cols]
# fit_predict returns one label per row; rows labelled -1 are dropped below.
outlier_labels=outlier_scores=iso_forest.fit_predict(numeric_features)
inlier_mask=outlier_labels !=-1
df_no_outliers=df_engineered.loc[inlier_mask]
print("\nShape before removing outliers:",df_engineered.shape)
print("Shape after removing outliers:",df_no_outliers.shape)



Shape before removing outliers: (32561, 23)
Shape after removing outliers: (29247, 23)


In [None]:
# Pairwise Pearson correlation between the numeric columns of the
# outlier-filtered frame.
# pandas implements Pearson natively, so use the built-in method rather than
# a Python callback around scipy's pearsonr, which is invoked once per column
# pair and computes the same coefficient.
correlation_matrix=df_no_outliers[numerical_cols].corr(method='pearson')
print("\nCorrelation Matrix:")
print(correlation_matrix)


Correlation Matrix:
                     age    fnlwgt  education_num  capital_gain  capital_loss  \
age             1.000000 -0.075740       0.040510      0.074323      0.017255   
fnlwgt         -0.075740  1.000000      -0.033550     -0.018473      0.000313   
education_num   0.040510 -0.033550       1.000000      0.090800      0.012009   
capital_gain    0.074323 -0.018473       0.090800      1.000000     -0.008829   
capital_loss    0.017255  0.000313       0.012009     -0.008829      1.000000   
hours_per_week  0.135762 -0.018476       0.128471      0.055781      0.000865   

                hours_per_week  
age                   0.135762  
fnlwgt               -0.018476  
education_num         0.128471  
capital_gain          0.055781  
capital_loss          0.000865  
hours_per_week        1.000000  
