# Mean and median

```
# Overview of the dataset: column names, dtypes and non-null counts
# (for summary statistics such as mean/std/quartiles use df.describe())
df.info()

df["col_name"].mean()  # mean of the column

df["col_name"].median()  # median of the column
```

# Summarizing dates

```
df["date_col"].max()  # most recent (latest) date

df["date_col"].min()  # oldest (earliest) date
```

# Efficient Custom Aggregations

```
def custom_summary_func(column):
    """Return the interquartile range (IQR) of a column.

    The IQR is the 75th percentile minus the 25th percentile, computed
    with the column's ``quantile`` method.
    """
    return column.quantile(0.75) - column.quantile(0.25)
    
# .agg() applies the custom function to a single DataFrame column
df["col_name"].agg(custom_summary_func)
```

```
# .agg() on several columns: the function is applied to each column separately
df[["col1", "col2", "col3"]].agg(custom_summary_func)
```

```
# Pass a list of functions to compute several aggregations for each column
df[["col1", "col2", "col3"]].agg([custom_func1, custom_func2])
```

# Sort Dates

```
# Returns a new DataFrame ordered by date_col, oldest first (ascending is the default)
df.sort_values(by="date_col")
```

# Cumulative statistics

```
df["col_name"].cumsum()  # cumulative sum

df["col_name"].cummax()  # cumulative maximum

df["col_name"].cumprod()  # cumulative product
```

# Dropping duplicates

```
# Drop rows whose value in a single column repeats (the first occurrence is kept)
df.drop_duplicates(subset="col_name")

# Drop rows whose combination of values across several columns repeats
df.drop_duplicates(subset=["col1", "col2"])
```

# Counting categorical variables

```
df["col_name"].value_counts()  # count occurrences of each distinct value in the column

# sort=True (already the default) orders the result by count, largest first
df["col_name"].value_counts(sort=True)

df["col_name"].value_counts(normalize=True)  # proportions of the total instead of raw counts
```

# Calculations with .groupby()

```
# Group rows by one column, then select the column to summarize.
# This is only the grouped selection; chain .mean(), .sum(), .agg(...) etc. to get a result.
df.groupby("col")["selected_column_to_summarize"]

# Group rows by a combination of several columns, then select the column to summarize
df.groupby(["col1", "col2"])["selected_column_to_summarize"]
```

# Multiple grouped summaries Aggregations

```
# Select several columns with a list (double brackets): a bare tuple of keys
# is rejected by pandas >= 2.0. Aggregations are given by name, which needs no
# numpy import and avoids the deprecation warning for numpy callables.
df.groupby("col_name")[["selected_col1", "selected_col2"]].agg(
    ["min", "max", "mean", "median"]
)
```

# Pivot Table

Pivot tables are another way to compute grouped summary statistics; `pivot_table` takes the mean of each group by default.

```
# Mean of selected_col per group, via groupby
df.groupby(by="col_based_on_which_will_be_grouped")["selected_col"].mean()

# The same means via pivot_table (its default aggregation is the mean)
df.pivot_table(
    index="col_based_on_which_will_be_grouped",
    values="selected_col",
)
```

# Pivoting on one variable

```
# One row per group, one column per aggregation of selected_col.
# Aggregations are named with strings, matching the other pivot_table examples.
df.pivot_table(
    index="col_based_on_which_will_be_grouped",
    values="selected_col",
    aggfunc=["mean", "median"],
)
```


# Pivoting on two variables

```
# Mean of selected_col for every combination of the two grouping columns
group_keys = [
    "col_based_on_which_will_be_grouped",
    "second_col_based_on_which_will_be_grouped",
]
df.groupby(group_keys)["selected_col"].mean()
```

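gives the same values as the following, which lays them out in wide form (the values of the second grouping column become the column headers):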

```
# Rows come from the first grouping column, column headers from the second
df.pivot_table(
    values="selected_col",
    index="col_based_on_which_will_be_grouped",
    columns="second_col_based_on_which_will_be_grouped",
    aggfunc="mean",
)
```


# Fill `NaN` in a pivot table and add a summary row and column

```
df.pivot_table(
    index="col_based_on_which_will_be_grouped",
    values="selected_col",
    columns="second_col_based_on_which_will_be_grouped",
    aggfunc="mean",
    fill_value=0,  # replace missing (NaN) cells with 0
    margins=True,  # add an "All" row and column holding the aggregate across each column / row
)
```