# Sous-ensemble d'un dataset

## Sélection de colonne(s)

In [30]:
import pandas as pd

# Load the Titanic passenger table from a CSV file located next to the notebook.
titanic = pd.read_csv("titanic.csv")

# Indexing the DataFrame with a single column name selects that one column.
ages = titanic["Age"]
# Display only the first rows rather than the whole column.
ages.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [31]:
# Shape of the single selected column: a one-element tuple (number of rows only).
ages.shape

(891,)

In [32]:
# Shape of the full table, for comparison: (number of rows, number of columns).
titanic.shape

(891, 12)

In [33]:
# Indexing with a list of column names selects several columns at once.
columns = ["Age", "Sex"]
age_sex = titanic[columns]
age_sex.head()

Unnamed: 0,Age,Sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male


In [34]:
# A single-column selection (string key) is a Series.
type(ages)

pandas.core.series.Series

In [35]:
# The full table is a DataFrame.
type(titanic)

pandas.core.frame.DataFrame

In [36]:
# A multi-column selection (list key) is still a DataFrame.
type(age_sex)

pandas.core.frame.DataFrame

## Filtrage

- Opérateurs de comparaison
- `isin()`
- `notna()` : « na » signifie *not available* (valeur manquante) ; `notna()` est donc l'inverse et vaut `True` là où une valeur est disponible.

In [37]:
# Filter rows with a boolean mask: keep only the passengers whose Age is
# strictly greater than 35. All columns are kept.
greater_than_35 = titanic[titanic["Age"] > 35]
greater_than_35.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss Elizabeth",female,58.0,0,0,113783,26.55,C103,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,,S
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S


In [38]:
# The comparison on its own produces the boolean mask used for filtering:
# one True/False value per row of the table.
titanic["Age"] > 35

0      False
1       True
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

In [39]:
# `|` means "or": it combines the two boolean masks element by element.
# Each comparison must be wrapped in parentheses.
class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
# Alternative way to write the same filter: isin() tests membership in a list
# of accepted values. This assignment replaces the one above.
class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master Gosta Leonard",male,2.0,3,1,349909,21.075,,S


In [40]:
# Keep only the rows for which Age is available (i.e. not a missing value).
age_no_na = titanic[titanic["Age"].notna()]
age_no_na.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Sélection de colonnes avec conditions

- `loc[]` : permet de spécifier un filtre et une sélection de colonnes (where suivi d'un select)
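- `iloc[]` : même principe, mais les lignes et les colonnes sont désignées par leur position entière (à partir de 0)
- `loc` se base sur les étiquettes (index des lignes, noms des colonnes) ou sur un masque booléen, `iloc` se base sur la position de l'élément.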
- On peut changer les valeurs d'une sélection de colonnes avec `loc[]` ou `iloc[]`

In [41]:
# Names of the passengers older than 35.
# Two-step version: first filter the rows, then select the "Name" column
# from the *filtered* frame (not from the full table, which would ignore
# the age condition).
df1 = titanic.loc[titanic["Age"] > 35]
df2 = df1["Name"]
# Equivalent single-step version: loc[] takes the row filter and the column
# label together.
adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
adult_names.head()

1     Cumings, Mrs. John Bradley (Florence Briggs Th...
6                               McCarthy, Mr. Timothy J
11                              Bonnell, Miss Elizabeth
13                          Andersson, Mr. Anders Johan
15                     Hewlett, Mrs. (Mary D Kingcome) 
Name: Name, dtype: object

In [42]:
# Selecting a single column label through loc[] gives a Series.
type(adult_names)

pandas.core.series.Series

In [43]:
# Take the rows at positions 9 to 25 (25 excluded) and the columns at
# positions 3 to 5 (5 excluded); positions are counted from 0.
(titanic.iloc[9:25, 3:5]).head()

Unnamed: 0,Name,Sex
9,"Nasser, Mrs. Nicholas (Adele Achem)",female
10,"Sandstrom, Miss Marguerite Rut",female
11,"Bonnell, Miss Elizabeth",female
12,"Saundercock, Mr. William Henry",male
13,"Andersson, Mr. Anders Johan",male


In [44]:
# Assign through iloc[]: the full slice `:` targets every position, so each
# element of adult_names is overwritten with the same placeholder string.
# NOTE(review): whether `titanic` itself is affected depends on pandas'
# copy/view semantics for this selection; confirm before relying on it.
adult_names.iloc[:] = "Anonymous"
adult_names.head()

1     Anonymous
6     Anonymous
11    Anonymous
13    Anonymous
15    Anonymous
Name: Name, dtype: object

In [45]:
# Work on an explicit copy so the assignment below does not touch `titanic`.
df = titanic.copy()
# Overwrite the first three rows (positions 0-2) of the columns at positions
# 3 and 4 with the literal string "NA" (a plain string, not a missing-value marker).
df.iloc[0:3, 3:5] = "NA"
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,,,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,,,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,,,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
