# 数据准备

In [None]:
import pandas as pd
import numpy as np
import os
# Download the dataset from https://www.kaggle.com/russellyates88/suicide-rates-overview-1985-to-2016

# Local directory that holds the downloaded dataset (expects master.csv inside).
data_path = 'F:/suicide-rates-overview-1985-to-2016'

# Raw CSV headers contain spaces, slashes and symbols; map them to
# identifier-friendly names so they can be used in assign()/query().
column_renames = {
    'suicides/100k pop': 'suicides_per_100k',
    ' gdp_for_year ($) ': 'gdp_year',
    'gdp_per_capita ($)': 'gdp_capita',
    'country-year': 'country_year',
}

df = pd.read_csv(os.path.join(data_path, 'master.csv')).rename(columns=column_renames)
# gdp_year is handled as text here: strip the commas, then cast to int64.
df = df.assign(gdp_year=df['gdp_year'].str.replace(',', '').astype(np.int64))

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Display the distinct values found in the 'generation' column.
df['generation'].unique()

In [None]:
# Display how many distinct values the 'country' column holds.
df['country'].nunique()

In [None]:
# Display the summary statistics produced by DataFrame.describe().
df.describe()

# 内存优化

In [None]:
def mem_usage(df: pd.DataFrame) -> str:
    """Report the total memory footprint of a DataFrame in megabytes.

    Parameters
    ----------
    df: pd.DataFrame
        Frame whose memory consumption is measured.

    Returns
    -------
    str
        Summed deep memory usage, rendered with two decimals and followed
        by ``MB`` (1 MB is taken as ``1024 ** 2`` bytes).
    """
    # deep=True asks pandas to measure the contents of object columns
    # rather than only the references to them.
    total_bytes = df.memory_usage(deep=True).sum()
    megabytes = total_bytes / 1024 ** 2
    # The leading space in the format spec reserves a sign slot, so
    # non-negative results start with a blank.
    return f'{megabytes: 3.2f} MB'


def convert_df(df: pd.DataFrame, deep_copy: bool = True,
               threshold: float = 0.5) -> pd.DataFrame:
    """Convert low-cardinality columns to the ``category`` dtype.

    A column is converted when the ratio of its distinct values to the
    number of rows is strictly below ``threshold``. Only ``df.columns`` are
    examined; index levels are left untouched.

    Parameters
    ----------
    df: pd.DataFrame
        Data frame to convert.
    deep_copy: bool
        Whether or not to perform a deep copy of the original data frame
        before casting.
    threshold: float
        Upper bound (exclusive) on ``nunique / row_count`` for a column to
        be converted. Defaults to 0.5.

    Returns
    -------
    pd.DataFrame
        Copy of the input data frame with the selected columns cast to
        ``category``. A frame without rows is returned as an unconverted
        copy.
    """
    row_count = len(df)
    categorical_dtypes = {}
    # Guard against 0 / 0: with no rows there is no cardinality ratio to
    # evaluate, so nothing is converted.
    if row_count:
        categorical_dtypes = {
            col: 'category' for col in df.columns
            if df[col].nunique() / row_count < threshold}
    return df.copy(deep=deep_copy).astype(categorical_dtypes)

In [None]:
# Baseline: memory footprint of the frame as loaded.
mem_usage(df)

In [None]:
# Footprint when country/year/sex/age are moved from columns into the index.
mem_usage(df.set_index(['country', 'year', 'sex', 'age']))

In [None]:
# Footprint after convert_df casts the low-cardinality columns to category.
mem_usage(convert_df(df))

In [None]:
# Footprint with both steps combined. convert_df only iterates df.columns,
# so the four index levels themselves are not converted to category.
mem_usage(convert_df(df.set_index(['country', 'year', 'sex', 'age'])))

# 索引

In [None]:
%%time
# Select rows by filtering on four ordinary columns with query().
df.query('country == "Albania" and year == 1987 and sex == "male" and age == "25-34 years"')

In [None]:
%%time
# Build a frame indexed by (country, year, sex, age); the result is bound to
# mi_df and df itself is not reassigned.
# NOTE(review): the index is not sorted here; pandas documents that lookups
# on an unsorted MultiIndex can be slower and may warn, so a .sort_index()
# may be worth considering. Confirm before changing what this cell times.
mi_df = df.set_index(['country', 'year', 'sex', 'age'])

In [None]:
%%time
# Same key values as the query() lookup, resolved through the MultiIndex via .loc.
mi_df.loc['Albania', 1987, 'male', '25-34 years']