# Pandas

pip install seaborn

pip show seaborn

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
### Veri setini yükleme.
df = pd.read_csv("titanic.csv") # normalde böyle okuyacağız veriyi.
# Elimizde hazır olmadığı için seaborn kütüphanesinden çekeceğiz.

In [None]:
# pd.read_json() #json dosyasını okuma

###############################################
# pd.read_excel()
# excel dosyasını okuyabilmek için;
    # pip install openpyxl 
    # pip install xlrd 
# bu iki kütüphaneyi indirmemiz gerekir.
###############################################

In [2]:
df = sns.load_dataset("titanic")

### Veriye İlk Bakış

In [5]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [10]:
df.tail(3)           # Son 5 satır

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [7]:
df.shape            # Satır Sütun sayısı

(891, 15)

In [8]:
df.info()           # Genel bilgi - veri tipleri falan önemli (kategorik, nümerik)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [None]:
df.describe()       # İstatistiksel özet

    # Q1 (1.çeyrek) --> En küçük %25'lik kesimin en büyük değeri.
    #                   Alt %25'lik dilimin üst sınırı
    #
    # Q2 (Medyan)   --> Ortadaki değer (veriler sıralandığında tam ortadaki)
    #
    # Q3 (3.çeyrek) --> En küçük %75'lik kesimin en büyük değeri.
    #                   Üst %25'lik dilimin alt sınırı

# Peki bu veriler bizim için neden önemli?
    # Verinin nasıl dağıldığını görebilmek,
    # Aykırı değerleri (outliers) tespit edebiliriz,
    # Kutu grafiği (boxplot) çiziminde kullanılır.


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


########
### Temel Keşif İşlemleri
########

In [12]:
# Sütun İsimleri
df.columns 
print(df.columns.tolist())

['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']


In [11]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [18]:
# Eksik Veri Kontrolü
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [19]:
# Ver Tipleri
print(df.dtypes)

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object


In [21]:
# Benzersiz Değerler
print(df["sex"].unique()) # Cinsiyet sütunundaki benzersiz değerleri getirir.
df["sex"].value_counts() # Cinsiyet Sütunundaki benzersiz değerin toplamını verir.

['male' 'female']


sex
male      577
female    314
Name: count, dtype: int64

########
### Basit Filtreleme ve Seçim
########

In [29]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [28]:
#df.columns.tolist()
df["pclass"]

0      3
1      1
2      3
3      1
4      3
      ..
886    2
887    1
888    3
889    1
890    3
Name: pclass, Length: 891, dtype: int64

In [31]:
# Birden fazla sütun seçimi
df[["age", "sex", "survived"]]

Unnamed: 0,age,sex,survived
0,22.0,male,0
1,38.0,female,1
2,26.0,female,1
3,35.0,female,1
4,35.0,male,0
...,...,...,...
886,27.0,male,0
887,19.0,female,1
888,,female,0
889,26.0,male,1


In [None]:
# Basit filtreleme
df[ df["survived"] == 1] # Hayatta kalanların listesini ver 

In [40]:
#df[ (df["survived"] == 1) & (df["age"]<20)]
df[ (df["survived"] == 1) | (df["age"]<20)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
877,0,3,male,19.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [42]:
df[ ~(df["survived"] == 0)] # Hayatta kalamayanların listesini ver

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [43]:
df["survived"].sum() # Hayatta kalanların sayısını verir

np.int64(342)

In [47]:
df["survived"].value_counts(normalize=True) # Hayatta kalanların oranlarını verir.
# %61'i hayatını kaybederken, %38'i hayatta kalmış

survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [50]:
df[ df["sex"] == "male"] # erkek sayısı
df[ df["sex"] == "female"] # kadın sayısı

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [None]:
df[ df["pclass"] == 1] # pclass'ı 1 olan değerler   #216
df[ df["pclass"] == 2] # pclass'ı 2 olan değerler   #184
df[ df["pclass"] == 3] # pclass'ı 3 olan değeler    #491
df["pclass"].value_counts()

pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [57]:
df[ (df["pclass"] == 3) & (df["age"] > 20) & (df["sex"] =="female") ] 

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
18,0,3,female,31.0,1,0,18.0,S,Third,woman,False,,Southampton,no,False
25,1,3,female,38.0,1,5,31.3875,S,Third,woman,False,,Southampton,yes,False
40,0,3,female,40.0,1,0,9.475,S,Third,woman,False,,Southampton,no,False
79,1,3,female,30.0,0,0,12.475,S,Third,woman,False,,Southampton,yes,True
85,1,3,female,33.0,3,0,15.85,S,Third,woman,False,,Southampton,yes,False
100,0,3,female,28.0,0,0,7.8958,S,Third,woman,False,,Southampton,no,True
106,1,3,female,21.0,0,0,7.65,S,Third,woman,False,,Southampton,yes,True
132,0,3,female,47.0,1,0,14.5,S,Third,woman,False,,Southampton,no,False


In [60]:
df[ (df["pclass"] == 3) & (df["age"] > 20) & (df["sex"] =="female") | (df["alive"] =="yes") ] 

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [61]:
(df["age"] < 18).sum() # 18 yaşından küçük olan kişi sayısı
#df[df["age"]<18] # 18 yaşından küçük olan kişilerin verisi

np.int64(113)

In [64]:
((df["age"]<18) & (df["survived"]==0)).sum()

np.int64(52)

########
### Gelişmiş Filtreleme
########

### Çoklu koşul filtreleme

In [71]:
# Yaşı 18'den küçük olan ve hayatta kalanların bilgilerini gösterelim.
df[(df["age"] < 18) & (df["survived"] == 1)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
9,1,2,female,14.00,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
10,1,3,female,4.00,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
22,1,3,female,15.00,0,0,8.0292,Q,Third,child,False,,Queenstown,yes,True
39,1,3,female,14.00,1,0,11.2417,C,Third,child,False,,Cherbourg,yes,False
43,1,2,female,3.00,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,1,3,female,15.00,1,0,14.4542,C,Third,child,False,,Cherbourg,yes,False
831,1,2,male,0.83,1,1,18.7500,S,Second,child,False,,Southampton,yes,False
853,1,1,female,16.00,0,1,39.4000,S,First,woman,False,D,Southampton,yes,False
869,1,3,male,4.00,1,1,11.1333,S,Third,child,False,,Southampton,yes,False


In [72]:
# Yaşı 18'den küçük olan ve hayatta kalanların sayısı
((df["age"] < 18) & (df["survived"] == 1)).sum()

np.int64(61)

In [73]:
# pclass'ı 1 olan ve cinsiyeti kadın olanların verisini getir
df[ (df["pclass"] == 1) & (df["sex"] == "female") ]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
31,1,1,female,,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
52,1,1,female,49.0,1,0,76.7292,C,First,woman,False,D,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,1,1,female,45.0,1,1,164.8667,S,First,woman,False,,Southampton,yes,False
862,1,1,female,48.0,0,0,25.9292,S,First,woman,False,D,Southampton,yes,True
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False


In [74]:
# pclass'ı 1 olan ve cinsiyeti kadın olanların sayısını getir.
((df ["pclass"] == 1) & (df["sex"] == "female")).sum()

np.int64(94)

### isin() kullanımı

In [77]:
# embarked sütunundaki her satırdaki değerlere gider ve değerlerin S ve C mi diye baktıktan sonra da onların verisini getirir.
df[df["embarked"].isin(["S", "C"])]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False


In [78]:
df[ df["embarked"].isin(["S","C"]) & (df["class"] == "First")]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
23,1,1,male,28.0,0,0,35.5000,S,First,man,True,A,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [85]:
df[ df["class"].isin(["First"])]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
23,1,1,male,28.0,0,0,35.5000,S,First,man,True,A,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [87]:
# Embarked sütunundaki her satırdaki S ve C değerlerin sayısı
df["embarked"].isin(["S","C"]).sum()

np.int64(812)

### String işlemleri

In [None]:
# sütundaki S değeri içerip içermediğine bakıyor.
df["embarked"].str.contains("S").sum()

In [93]:
df["alive"].str.contains("o").sum()

np.int64(549)

In [94]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [95]:
df.columns.str.contains("age")

array([False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False])

In [None]:
age2 --> age * 2

In [None]:
df["age2"] = df["age"]*2

In [101]:
df[["age","age2"]].head()

Unnamed: 0,age,age2
0,22.0,44.0
1,38.0,76.0
2,26.0,52.0
3,35.0,70.0
4,35.0,70.0


In [102]:
df.columns.str.contains("age")

array([False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False,  True])

########
### Eksik Veri Yönetimi
########

In [109]:
liste1 = [1,2,3,4,5]
liste2 = liste1.copy()

In [110]:
liste2[0] = 7

In [111]:
liste2

[7, 2, 3, 4, 5]

In [112]:
liste1

[1, 2, 3, 4, 5]

In [113]:
### Eksik veriyi doldurma
df_filled = df.copy() # orijinal veri üzerinde değil kopyası üzerinde çalışmak için yaptık.

In [114]:
# Yaş için medyan ile doldurma
df["age"].median() # yaşın medyan değerini verir.
#df_filled["age"].fillna(df_filled["age"].median(), inplace=True)

np.float64(28.0)

In [118]:
df_filled["age"].fillna(df_filled["age"].median(), inplace=True)
df_filled["age"].isnull().value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled["age"].fillna(df_filled["age"].median(), inplace=True)


age
False    891
Name: count, dtype: int64

In [123]:
df_filled.fillna({"age": df_filled["age"].median()}, inplace=True)
#df[col] = df[col].method(value)

In [124]:
df_filled["age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [None]:
# Embarked için mod ile doldurma
# Embarked sütunundaki eksik (NaN) değerleri, o sütunun en sık tekrar eden değeriyle (mod) doldurur.
df_filled['Embarked'].fillna(df_filled['Embarked'].mode()[0], inplace=True)

In [128]:
df["age"].isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: age, Length: 891, dtype: bool

In [None]:
df_filled["age"].fillna(df_filled["age"].mean(),inplace=True)

In [131]:
df_filled["age"].isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: age, Length: 891, dtype: bool