# Pandas introduction
* https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html
* https://pandas.pydata.org/

In [2]:
import pandas as pd
# Print the installed pandas version (APIs and outputs can differ between releases)
print(pd.__version__)

0.25.0


## 1. IO
* pd.DataFrame
* read_csv, read_excel, read_parquet
* to_csv, to_excel, to_parquet

In [10]:
# Build a small DataFrame from a dict: each key becomes a column name and
# each value supplies that column's data.
df = pd.DataFrame(data={"A": range(5), "B": list("ABCDE")})
df.head(n=5)

Unnamed: 0,A,B
0,0,A
1,1,B
2,2,C
3,3,D
4,4,E


In [11]:
# Load the Titanic training set from a CSV file (path is relative to the notebook)
dfg = pd.read_csv(filepath_or_buffer="datasets/titanic_train.csv")
dfg.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 3. Overview data

## 3.1. Preview data

In [17]:
# Work on an independent (deep) copy: changes made to `df` are not reflected in `dfg`
df = dfg.copy(deep=True)

In [18]:
# First N rows; `n` defaults to 5 when omitted
df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [19]:
# Last N rows; `n=5` is the default, spelled out here for clarity
df.tail(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [27]:
# Show N random rows. `random_state` seeds the sampler so the same rows are
# drawn on every run (keeps Restart & Run All reproducible); drop it to get a
# different sample each time.
df.sample(n=3, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
687,688,0,3,"Dakic, Mr. Branko",male,19.0,0,0,349228,10.1708,,S
59,60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S
665,666,0,2,"Hickman, Mr. Lewis",male,32.0,2,0,S.O.C. 14879,73.5,,S


## 3.2. Describe the dataframe

In [24]:
# Show basic summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [25]:
# Describe every column, not only the numeric ones: non-numeric columns get
# count / unique / top / freq, and statistics that do not apply show as NaN
df.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Farrell, Mr. James",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [30]:
# Per-column non-null counts and dtypes, plus the total memory footprint.
# `memory_usage="deep"` also measures the contents of object (string) columns.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 318.5 KB


In [28]:
# Memory usage in bytes, broken down per column; `index=True` (the default)
# adds a row for the index itself
df.memory_usage(index=True, deep=True)

Index            128
PassengerId     7128
Survived        7128
Pclass          7128
Name           74813
Sex            54979
Age             7128
SibSp           7128
Parch           7128
Ticket         56802
Fare            7128
Cabin          34360
Embarked       55182
dtype: int64

## 3.3. View a column (Series)

In [37]:
# Select a single column with bracket indexing; the result is a pandas Series
df["Sex"]

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [38]:
# A Series supports many of the DataFrame methods shown above,
# such as `head`, `sample` and `describe`
df["Sex"].head(n=5)

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [39]:
# Distinct values present in the column, returned as an array
df["Sex"].unique()

array(['male', 'female'], dtype=object)

In [40]:
# Absolute number of occurrences of each distinct value
df["Sex"].value_counts(normalize=False)

male      577
female    314
Name: Sex, dtype: int64

In [41]:
# Relative frequencies (fractions that sum to 1) instead of absolute counts
df["Sex"].value_counts(normalize=True)

male      0.647587
female    0.352413
Name: Sex, dtype: float64

## 3.4. Summarize
You can retrieve some basic statistics for the whole DataFrame, such as:
* mean
* median
* max
* min
* count
* sum
* var / std

In [61]:
# Column-wise mean for the whole DataFrame. `numeric_only=True` makes the
# exclusion of text columns (Name, Sex, Ticket, ...) explicit: pandas 0.25
# drops them silently, while pandas >= 2.0 raises a TypeError without it.
df.mean(numeric_only=True)

PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Age             29.699118
SibSp            0.523008
Parch            0.381594
Fare            32.204208
dtype: float64

It is also possible to compute them for a single Series (one column). Some extra statistics can only be retrieved for a Series, for example:
* **nunique:** count of unique values

In [57]:
# The same kind of summary for a single column: number of distinct values
# (`dropna=True` is the default, so missing values are not counted)
df["Age"].nunique(dropna=True)

88

# 4. Slicing
* .copy
* at
* df[col], df[cols]
* df[filter]
    * isna, isin
    * multifilter
* loc
* iloc
* ix deprecation
# 5. Modify data
* sum, multiply
* loc
* apply / applymap / map
* strings
* datetime
* fillna / dropna
* dummies
* duplicates
* sorting
* new columns, new rows
# 6. Modify dataframe
* rename columns
* delete columns
# 7. Transformations
* groupby
    * iter groupby
    * `0.25.0` groupby
* merge
* .T

# TODO advanced pandas

* multiindex
    * create
    * selects
* category dtype `df.astype({'beer_servings':'float', 'spirit_servings':'float'})`
* pd.crosstab
* pandas plot
* display
    * pd.set_option('display.max_rows', None)
    * pd.set_option('display.precision', 2)
    * pd.describe_option()
    * pd.reset_option('all')