# Pandas introduction
* https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html
* https://pandas.pydata.org/

In [1]:
import pandas as pd
print(pd.__version__)

0.25.1


## 1. IO
Input/Output operations (reading and writing data).

### 1.1. pd.DataFrame
This allows you to create a dataframe from a Python object. You can use lists, dictionaries and almost any other Python object.

In [2]:
df = pd.DataFrame({"A": range(5), "B": list("ABCDE")})
df.head()

Unnamed: 0,A,B
0,0,A
1,1,B
2,2,C
3,3,D
4,4,E


### 1.2. Read
The most common reading function is `pd.read_csv`. But there are also functions to read almost everything ([more info](https://pandas.pydata.org/pandas-docs/stable/reference/io.html)).

In [3]:
dfg = pd.read_csv("datasets/titanic_train.csv")
dfg.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_iris = pd.read_excel("datasets/iris.xlsx")
df_iris.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### 1.3. Write
Dataframes have functions that allow you to export them to almost any format. For example, you can export one to `parquet` (a really efficient file format).

In [5]:
dfg.to_parquet("datasets/output.parquet") # You will need pyarrow or fastparquet installed

## 2. Overview data
###  2.1. Preview data

In [6]:
# Create a copy. Any change in `df` won't be applied in `dfg`
df = dfg.copy()

In [7]:
# Show first N rows (default = 5)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
# Show last N rows (default = 5)
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [9]:
# Show N random rows
df.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
750,751,1,2,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S
247,248,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S


### 2.2. Describe the dataframe

In [10]:
# Show a basic stats for numerical columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
# Describe all columns
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Frost, Mr. Anthony Wood ""Archie""",male,,,,1601.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [12]:
# Show info of each column and total memory usage
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 318.5 KB


In [13]:
# Show memory usage of each column and the index itself.
df.memory_usage(deep=True)

Index            128
PassengerId     7128
Survived        7128
Pclass          7128
Name           74813
Sex            54979
Age             7128
SibSp           7128
Parch           7128
Ticket         56802
Fare            7128
Cabin          34360
Embarked       55182
dtype: int64

### 2.3. View a column (Series)

In [14]:
# Selecting a column
df["Sex"]

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [15]:
# It is possible to apply `head`, `sample`, `describe` and other methods from the dataframe
df["Sex"].head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [16]:
# Unique values in a column
df["Sex"].unique()

array(['male', 'female'], dtype=object)

In [17]:
# Count values
df["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [18]:
df["Sex"].value_counts(normalize=True)

male      0.647587
female    0.352413
Name: Sex, dtype: float64

### 2.4. Summarize
You can retrieve some basic stats for the whole dataframe, like:
* mean
* median
* max
* min
* count
* sum
* var / std

In [19]:
# You can retrieve it for the whole dataframe.
# `numeric_only=True` restricts the mean to the numeric columns. Older pandas versions
# silently dropped text columns (Name, Sex, ...); recent versions raise a TypeError instead.
df.mean(numeric_only=True)

PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Age             29.699118
SibSp            0.523008
Parch            0.381594
Fare            32.204208
dtype: float64

And it is also possible to get them for a Series (one column). There are extra stats that can only be retrieved for a Series, for example:
* **nunique:** count of unique values

In [20]:
# Or only for a column
df["Age"].nunique()

88

## 3. Slicing
**Pandas** is usually used to work with a lot of information. So it is really important to know how to slice the dataframe to get a subset of it.

### 3.1. Getting one value with `df.at`

In [21]:
df.at[1, "Sex"] # Row = 1, Column = Sex

'female'

### 3.2. Accessing one or more columns

In [22]:
df["Sex"] # This returns a series object (a part of a dataframe)

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [23]:
df[["Sex", "Age"]] # This returns a dataframe

Unnamed: 0,Sex,Age
0,male,22.0
1,female,38.0
2,female,26.0
3,female,35.0
4,male,35.0
...,...,...
886,male,27.0
887,female,19.0
888,female,
889,male,26.0


### 3.3. Filter rows
To filter some rows you will need to call `df[some_filter]`. The filter can combine several conditions using the and operator (`&`), the or operator (`|`) or the not operator (`~`); wrap each condition in parentheses. You can't use the regular Python operators (`and`, `or`, `not`).

In [24]:
df[df["Sex"] == "male"] # Only rows with male people

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [25]:
df[(df["Age"] > 22) & (df["Age"] < 30)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S
34,35,0,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C
41,42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S


You can also create the filter separately and then use it. The filter will be a series with only booleans.

In [26]:
mfilter = df["Age"] > 20
mfilter # Show mfilter

0       True
1       True
2       True
3       True
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: Age, Length: 891, dtype: bool

In [27]:
df[mfilter]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


There are special functions to filter values that are inside a list (`isin`) and to filter **null** values (`isna`).

In [28]:
df[df["Pclass"].isin([1, 2])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [29]:
df[df["Cabin"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


### 3.4. Filtering rows and columns at the same time
You can use `loc` when using rows/columns names or `iloc` when using the position (number) for rows/columns.
You might find the function `ix` but it has been deprecated in favor of `loc/iloc`. Please do not use `ix`.

In [30]:
df.loc[df["Age"] > 30, "Survived"] # First the filter for rows, then the column or columns

1      1
3      1
4      0
6      0
11     1
      ..
873    0
879    1
881    0
885    0
890    0
Name: Survived, Length: 305, dtype: int64

In [31]:
df.loc[df["Age"] > 30, ["Sex", "Survived"]].head() # You can always concatenate functions

Unnamed: 0,Sex,Survived
1,female,1
3,female,1
4,male,0
6,male,0
11,female,1


In [32]:
df.iloc[0:5, 1:3] # Slice for rows, slice for columns

Unnamed: 0,Survived,Pclass
0,0,3
1,1,1
2,1,3
3,1,1
4,0,3


## 4. Modify data

### 4.1. Basic modifications
In general you can assign values by using the same functions that were used to access data.

In [33]:
df.at[1, "Sex"] = "Other" # Assign one value
df.head(2) # View the changes

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Other,38.0,1,0,PC 17599,71.2833,C85,C


In [34]:
df["Ticket"] = "no ticket" # Assign a value to all rows
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,no ticket,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Other,38.0,1,0,no ticket,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,no ticket,7.925,,S


In [35]:
df["Dummy"] = df["Age"] + df["Survived"] # Sum two columns and assign the result to a new column

### 4.2. Special types
#### 4.2.1. Strings

In [36]:
df["Name"].str.replace("Mr", "Hello") # This only shows the modification but it does not replace the original value

0                             Braund, Hello. Owen Harris
1      Cumings, Hellos. John Bradley (Florence Briggs...
2                                 Heikkinen, Miss. Laina
3        Futrelle, Hellos. Jacques Heath (Lily May Peel)
4                            Allen, Hello. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                             Behr, Hello. Karl Howell
890                               Dooley, Hello. Patrick
Name: Name, Length: 891, dtype: object

In [37]:
df["Name"] = df["Name"].str.replace("Mr", "Hello") # This updates the values

In [38]:
df["Name"].str[:10]

0      Braund, He
1      Cumings, H
2      Heikkinen,
3      Futrelle, 
4      Allen, Hel
          ...    
886    Montvila, 
887    Graham, Mi
888    Johnston, 
889    Behr, Hell
890    Dooley, He
Name: Name, Length: 891, dtype: object

#### 4.2.2. Datetime

In [39]:
df_taxi = pd.read_csv("datasets/taxi.csv")
df_taxi.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [40]:
df_taxi["key"] = pd.to_datetime(df_taxi["key"]) # Transform the column to datetime

In [41]:
df_taxi["key"].dt.day

0       27
1       27
2        8
3        1
4        1
        ..
9909    10
9910    12
9911    19
9912    31
9913    18
Name: key, Length: 9914, dtype: int64

In [42]:
df_taxi["key"].dt.strftime("%Y-%m-01") # Get first day of the month

0       2015-01-01
1       2015-01-01
2       2011-10-01
3       2012-12-01
4       2012-12-01
           ...    
9909    2015-05-01
9910    2015-01-01
9911    2015-04-01
9912    2015-01-01
9913    2015-01-01
Name: key, Length: 9914, dtype: object

### 4.3. Missing values
The function `fillna` will fill the missing values. In order to update the original values you need to set **inplace** to true (`fillna(fill_value, inplace=True)`).

In [43]:
df["Cabin"].fillna("Unknown") # Fill missing values

0      Unknown
1          C85
2      Unknown
3         C123
4      Unknown
        ...   
886    Unknown
887        B42
888    Unknown
889       C148
890    Unknown
Name: Cabin, Length: 891, dtype: object

In [44]:
df["Cabin"].dropna() # This will discard rows with missing values

1              C85
3             C123
6              E46
10              G6
11            C103
          ...     
871            D35
872    B51 B53 B55
879            C50
887            B42
889           C148
Name: Cabin, Length: 204, dtype: object

### 4.4. Sorting

In [45]:
df.sort_values("Cabin", ascending=False) # Default ascending=True

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Dummy
339,340,0,1,"Blackwell, Hello. Stephen Weart",male,45.0,0,0,no ticket,35.5000,T,S,45.0
394,395,1,3,"Sandstrom, Hellos. Hjalmar (Agnes Charlotta Be...",female,24.0,0,2,no ticket,16.7000,G6,S,25.0
205,206,0,3,"Strom, Miss. Telma Matilda",female,2.0,0,1,no ticket,10.4625,G6,S,2.0
251,252,0,3,"Strom, Hellos. Wilhelm (Elna Matilda Persson)",female,29.0,1,1,no ticket,10.4625,G6,S,29.0
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,no ticket,16.7000,G6,S,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Hello. Henry Jr",male,25.0,0,0,no ticket,7.0500,,S,25.0
885,886,0,3,"Rice, Hellos. William (Margaret Norton)",female,39.0,0,5,no ticket,29.1250,,Q,39.0
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,no ticket,13.0000,,S,27.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,no ticket,23.4500,,S,


### 4.5. Handling duplicates

In [46]:
df.drop_duplicates(["Pclass"]) # Discard the duplicates based on certain columns

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Dummy
0,1,0,3,"Braund, Hello. Owen Harris",male,22.0,1,0,no ticket,7.25,,S,22.0
1,2,1,1,"Cumings, Hellos. John Bradley (Florence Briggs...",Other,38.0,1,0,no ticket,71.2833,C85,C,39.0
9,10,1,2,"Nasser, Hellos. Nicholas (Adele Achem)",female,14.0,1,0,no ticket,30.0708,,C,15.0


### 4.6. Custom functions
You can apply custom functions. Remember that functions usually have names, but you can define unnamed functions using lambdas.

In [47]:
def sum_1(x):
    """Return ``x`` increased by one.

    Works for anything that supports ``+ 1``: a plain number, or a whole
    pandas Series/DataFrame (the addition is then applied element-wise).
    """
    incremented = x + 1
    return incremented

# A lambda is an unnamed function; here it is bound to the name `sum_2` and adds 2 to its argument
sum_2 = lambda x: x + 2

Several functions are available for applying custom functions:
* `DataFrame.apply` operates on entire rows or columns at a time.
* `DataFrame.applymap`, `Series.apply`, and `Series.map` operate on one element at a time.

In [48]:
numerical_columns = ["SL", "SW", "PL", "PW"]
# `DataFrame.apply` passes each selected column to `sum_1` as a whole Series (column-wise by default)
df_iris[numerical_columns].apply(sum_1)

Unnamed: 0,SL,SW,PL,PW
0,6.1,4.5,2.4,1.2
1,5.9,4.0,2.4,1.2
2,5.7,4.2,2.3,1.2
3,5.6,4.1,2.5,1.2
4,6.0,4.6,2.4,1.2
...,...,...,...,...
145,7.7,4.0,6.2,3.3
146,7.3,3.5,6.0,2.9
147,7.5,4.0,6.2,3.0
148,7.2,4.4,6.4,3.3


In [49]:
def custom_categories(x):
    """Map a single numeric value to a size label.

    This needs to be applied to one element at a time (e.g. through
    ``Series.apply``), not to a whole Series.

    Returns "L" when ``x`` is greater than 5, "S" when ``x`` is lower
    than 1 and "M" in every other case.
    """
    if x > 5:
        label = "L"
    elif x < 1:
        label = "S"
    else:
        label = "M"
    return label

df_iris["SL"].apply(custom_categories)

0      L
1      M
2      M
3      M
4      M
      ..
145    L
146    L
147    L
148    L
149    L
Name: SL, Length: 150, dtype: object

### 4.7. Apply changes to a slice
It is possible to apply changes to only part of the dataframe using `loc`.

In [50]:
df.loc[df["Sex"] == "male", "Pclass"] = 77

In [51]:
mfilter = df["Embarked"] == "S"
df.loc[mfilter, "Age"] = df.loc[mfilter, "Age"]*2

## 5. Modify the dataframe
### 5.1. Delete rows/columns

In [52]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Dummy
0,1,0,77,"Braund, Hello. Owen Harris",male,44.0,1,0,no ticket,7.25,,S,22.0
1,2,1,1,"Cumings, Hellos. John Bradley (Florence Briggs...",Other,38.0,1,0,no ticket,71.2833,C85,C,39.0
2,3,1,3,"Heikkinen, Miss. Laina",female,52.0,0,0,no ticket,7.925,,S,27.0
3,4,1,1,"Futrelle, Hellos. Jacques Heath (Lily May Peel)",female,70.0,1,0,no ticket,53.1,C123,S,36.0
4,5,0,77,"Allen, Hello. William Henry",male,70.0,0,0,no ticket,8.05,,S,35.0


In [53]:
df.drop("Dummy", axis=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,77,"Braund, Hello. Owen Harris",male,44.0,1,0,no ticket,7.2500,,S
1,2,1,1,"Cumings, Hellos. John Bradley (Florence Briggs...",Other,38.0,1,0,no ticket,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,52.0,0,0,no ticket,7.9250,,S
3,4,1,1,"Futrelle, Hellos. Jacques Heath (Lily May Peel)",female,70.0,1,0,no ticket,53.1000,C123,S
4,5,0,77,"Allen, Hello. William Henry",male,70.0,0,0,no ticket,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,77,"Montvila, Rev. Juozas",male,54.0,0,0,no ticket,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,38.0,0,0,no ticket,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,no ticket,23.4500,,S
889,890,1,77,"Behr, Hello. Karl Howell",male,26.0,0,0,no ticket,30.0000,C148,C


In [54]:
df.drop(["Dummy", "Fare", "Cabin"], axis=1, inplace=True) # inplace=True to update the original dataframe

### 5.2. Rename columns

In [55]:
df.columns # Retrieve column names

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Embarked'],
      dtype='object')

In [56]:
df.columns = ['id', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'embarked']

## 6. Transformations
### 6.1. Group by values

In [57]:
# Sum the numeric columns of each class. `numeric_only=True` makes this explicit: older pandas
# versions dropped the text columns silently, while recent versions try to sum them as well.
df.groupby("pclass").sum(numeric_only=True)

Unnamed: 0_level_0,id,survived,age,sibsp,parch
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,44106,91,4381.0,52,43
2,33676,70,4087.0,37,46
3,57561,72,3983.5,129,115
77,262043,109,25066.42,248,136


You can use `agg` on a `groupby` object to perform multiple aggregations at once.

In [58]:
df.groupby("pclass").agg(["min", "max"])

Unnamed: 0_level_0,id,id,survived,survived,name,name,sex,sex,age,age,sibsp,sibsp,parch,parch,ticket,ticket
Unnamed: 0_level_1,min,max,min,max,min,max,min,max,min,max,min,max,min,max,min,max
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1,2,888,0,1,"Allen, Miss. Elisabeth Walton","Young, Miss. Marie Grice",Other,female,4.0,126.0,0,3,0,2,no ticket,no ticket
2,10,881,0,1,"Abelson, Hellos. Samuel (Hannah Wizosky)","Yrois, Miss. Henriette (""Hellos Harbeck"")",female,female,3.0,114.0,0,3,0,3,no ticket,no ticket
3,3,889,0,1,"Abbott, Hellos. Stanton (Rosa Hunt)","de Messemaeker, Hellos. Guillaume Joseph (Emma)",female,female,0.75,126.0,0,8,0,6,no ticket,no ticket
77,1,891,0,1,"Abbing, Hello. Anthony","van Melkebeke, Hello. Philemon",male,male,0.42,160.0,0,8,0,5,no ticket,no ticket


With pandas `0.25` or higher you can specify which aggregation to apply to which column and the name of the output column. This is done with the following syntax: `agg(new_name=(old_name, aggregation_function))`

In [59]:
df.groupby("pclass").agg(age_avg=("age", "mean"), age_min=("age", "min"))

Unnamed: 0_level_0,age_avg,age_min
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,51.541176,4.0
2,55.22973,3.0
3,39.053922,0.75
77,55.33426,0.42
