In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [4]:
# Display settings: never truncate the column list and allow 500-character-wide output.
pd.options.display.max_columns = None
pd.options.display.width = 500

# Load seaborn's "titanic" example dataset into the working frame.
df = sns.load_dataset("titanic")

In [5]:
# Cast every bool-dtype column to int (True -> 1, False -> 0).
# `df.dtypes` is a snapshot, so reassigning columns inside the loop is safe.
for col, col_dtype in df.dtypes.items():
    if col_dtype == "bool":
        df[col] = df[col].astype(int)

In [6]:
def grab_col_names(dataframe, cat_th = 10, car_th = 20):
    """
    Return the names of the categorical, numerical and
    categorical-but-cardinal columns of a dataframe, and print a summary.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Dataframe whose column names are to be classified.
    cat_th : int, float
        Class-count threshold for numeric columns: a numeric column with
        fewer than ``cat_th`` distinct values is treated as categorical.
    car_th : int, float
        Class-count threshold for categorical columns: a "category"/"object"
        column with more than ``car_th`` distinct values is treated as
        cardinal.

    Returns
    -------
    cat_cols : list
        Categorical column names (numeric-but-categorical columns included,
        cardinal columns excluded).
    num_cols : list
        Numerical column names that were not classified as categorical.
    cat_but_car : list
        Categorical-looking cardinal column names.

    Notes
    -----
    ``num_but_cat`` is only reported in the printed summary; its members are
    already contained in ``cat_cols``.
    """

    columns = list(dataframe.columns)

    # Look up each column's dtype and distinct-value count once instead of
    # recomputing them in every comprehension below.
    dtype_of = {col: dataframe[col].dtype for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    def _is_numeric(col):
        # Accept every numeric width (int32, float32, unsigned, ...) rather
        # than only the platform default "int"/"float". Bool is excluded
        # because it is handled as a categorical dtype below.
        dtype = dtype_of[col]
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    numeric_cols = [col for col in columns if _is_numeric(col)]

    cat_cols = [col for col in columns if str(dtype_of[col]) in ("category", "object", "bool")]
    num_but_cat = [col for col in numeric_cols if n_unique[col] < cat_th]
    cat_but_car = [col for col in columns
                   if n_unique[col] > car_th and str(dtype_of[col]) in ("category", "object")]

    # Low-cardinality numerics count as categorical; cardinal columns do not.
    cat_cols = [col for col in cat_cols + num_but_cat if col not in cat_but_car]
    num_cols = [col for col in numeric_cols if col not in cat_cols]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols,num_cols,cat_but_car


In [7]:
# Classify the columns once; the resulting lists are reused by the summary loops further down.
cat_cols, num_cols, cat_but_car = grab_col_names(dataframe=df)

Observations: 891
Variables: 15
cat_cols: 13
num_cols: 2
cat_but_car: 0
num_but_cat: 6


# Hedef Değişkenin Kategorik Değişkenler ile Analizi

In [8]:
# Mean of `survived` within each `sex` group.
df.groupby(by="sex")["survived"].mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [9]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for every class of ``categorical_col``.

    The result is printed as a one-column table named "Target_Mean",
    indexed by the classes of ``categorical_col``. Nothing is returned.
    """
    class_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"Target_Mean" : class_means})
    print(summary)

In [10]:
# Same breakdown as the raw groupby above, now through the helper.
target_summary_with_cat(dataframe=df, target="survived", categorical_col="sex")

        Target_Mean
sex                
female     0.742038
male       0.188908


In [11]:
# Mean of `survived` for each `pclass` value.
target_summary_with_cat(dataframe=df, target="survived", categorical_col="pclass")

        Target_Mean
pclass             
1          0.629630
2          0.472826
3          0.242363


In [12]:
# One target-mean table per categorical column.
# NOTE: cat_cols also contains "survived" itself, so its own table is trivially 0.0 / 1.0.
for col in cat_cols:
    target_summary_with_cat(dataframe=df, target="survived", categorical_col=col)


        Target_Mean
sex                
female     0.742038
male       0.188908
          Target_Mean
embarked             
C            0.553571
Q            0.389610
S            0.336957
        Target_Mean
class              
First      0.629630
Second     0.472826
Third      0.242363
       Target_Mean
who               
child     0.590361
man       0.163873
woman     0.756458
      Target_Mean
deck             
A        0.466667
B        0.744681
C        0.593220
D        0.757576
E        0.750000
F        0.615385
G        0.500000
             Target_Mean
embark_town             
Cherbourg       0.553571
Queenstown      0.389610
Southampton     0.336957
       Target_Mean
alive             
no             0.0
yes            1.0
          Target_Mean
survived             
0                 0.0
1                 1.0
        Target_Mean
pclass             
1          0.629630
2          0.472826
3          0.242363
       Target_Mean
sibsp             
0         0.345395
1      

# Hedef Değişkenin Sayısal Değişkenlerle Analizi

In [13]:
# Mean `age` within each `survived` group, kept as a one-column DataFrame
# (double brackets) rather than a Series.
df.groupby("survived")[["age"]].mean()

Unnamed: 0_level_0,age
survived,Unnamed: 1_level_1
0,30.626179
1,28.34369


In [14]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` for every class of ``target``.

    The result is printed as a one-column table named
    "<numerical_col> mean", indexed by the classes of ``target``.
    Nothing is returned.
    """
    column_label = f"{numerical_col} mean"
    means_per_class = dataframe.groupby(target)[numerical_col].mean()
    print(pd.DataFrame({column_label : means_per_class}))

In [15]:
# Mean `age` per `survived` class, through the helper.
target_summary_with_num(dataframe=df, target="survived", numerical_col="age")

           age mean
survived           
0         30.626179
1         28.343690


In [17]:
# One per-class mean table for every numerical column.
for col in num_cols:
    target_summary_with_num(dataframe=df, target="survived", numerical_col=col)

           age mean
survived           
0         30.626179
1         28.343690
          fare mean
survived           
0         22.117887
1         48.395408
