In [1]:
%load_ext kedro

In [2]:
# `catalog` is not defined anywhere in this notebook; presumably it is injected by the
# kedro extension loaded in the first cell — TODO confirm.
df = catalog.load('raw')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [3]:
# Column dtypes and non-null counts of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [5]:
# Percentage of missing values per column: the missing fraction is rounded to
# four decimals and then scaled to percent.
nulls = df.isna().mean().round(4) * 100
nulls


id                            0.00
Time_spent_Alone              6.42
Stage_fear                   10.22
Social_event_attendance       6.37
Going_outside                 7.91
Drained_after_socializing     6.20
Friends_circle_size           5.69
Post_frequency                6.82
Personality                   0.00
dtype: float64

In [6]:
# Names of the columns whose dtype is int64 or float64, in frame order.
num_cols = [name for name, dtype in df.dtypes.items() if dtype in ['int64', 'float64']]

In [7]:
# Sample skewness of every numeric column, keyed by column name.
skews = pd.Series({col: df[col].skew() for col in num_cols})
skews


id                         0.000000
Time_spent_Alone           1.133777
Social_event_attendance   -0.229238
Going_outside             -0.367221
Friends_circle_size       -0.051806
Post_frequency            -0.058706
dtype: float64

In [8]:
def _impute_mode_and_convert_bool(x: pd.Series) -> pd.Series:
    """Fill missing entries with the most frequent value, then map 'Yes' to True.

    Args:
        x: Series of 'Yes'/'No' labels that may contain missing values.

    Returns:
        Boolean Series on the same index: True where the (imputed) value equals
        'Yes', False otherwise.

    Notes:
        ``Series.fillna`` takes the replacement value itself, not a strategy
        name, so the mode is computed explicitly here. When several values tie
        for most frequent, the first one returned by ``Series.mode`` is used.
        If every entry is missing there is no mode to impute, and all entries
        come out as False.
    """
    modes = x.mode(dropna=True)
    if modes.empty:
        # Nothing observed to impute from; missing values never equal 'Yes'.
        return x == 'Yes'
    return x.fillna(modes.iloc[0]) == 'Yes'

In [9]:
# Convert both Yes/No columns to booleans in place.
# NOTE: the helper compares against 'Yes', so re-running this cell on the
# already-converted boolean columns would turn every value into False.
for col in ('Stage_fear', 'Drained_after_socializing'):
    df[col] = _impute_mode_and_convert_bool(df[col])

In [10]:
# Re-check dtypes and non-null counts after converting the two Yes/No columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 18524 non-null  bool   
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  18524 non-null  bool   
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: bool(2), float64(5), int64(1), object(1)
memory usage: 1.0+ MB


In [11]:
# Impute with this column's own median (a scalar). A Series of per-column medians
# would be aligned against the row index by fillna() and leave every gap unfilled.
df['Time_spent_Alone'] = df['Time_spent_Alone'].fillna(df['Time_spent_Alone'].median())

In [16]:
# Numeric feature columns to impute; the `id` key column is deliberately left out.
num_cols = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside', 
    'Friends_circle_size',
    'Post_frequency'
]

# Iterate over the list defined in this cell so the cell runs on a fresh kernel.
# Each column is filled with its own median, which is less sensitive than the mean
# to the right skew measured for Time_spent_Alone (about 1.13).
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [13]:
# Confirm that the numeric columns have no missing values left after imputation.
# (fillna() without a fill value raises, and the output recorded for this cell
# is a frame summary.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           18524 non-null  float64
 2   Stage_fear                 18524 non-null  bool   
 3   Social_event_attendance    18524 non-null  float64
 4   Going_outside              18524 non-null  float64
 5   Drained_after_socializing  18524 non-null  bool   
 6   Friends_circle_size        18524 non-null  float64
 7   Post_frequency             18524 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: bool(2), float64(5), int64(1), object(1)
memory usage: 1.0+ MB


In [14]:
# Fill any remaining gaps with the column mean. fillna() takes the replacement value
# itself, so the statistic must be computed; the string 'mean' would be written into
# the column as literal text.
df['Time_spent_Alone'] = df['Time_spent_Alone'].fillna(df['Time_spent_Alone'].mean())

In [15]:
# Final check of dtypes and non-null counts after the imputation cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           18524 non-null  float64
 2   Stage_fear                 18524 non-null  bool   
 3   Social_event_attendance    18524 non-null  float64
 4   Going_outside              18524 non-null  float64
 5   Drained_after_socializing  18524 non-null  bool   
 6   Friends_circle_size        18524 non-null  float64
 7   Post_frequency             18524 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: bool(2), float64(5), int64(1), object(1)
memory usage: 1.0+ MB
