In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

def reduce_mem_usage(df):
	""" iterate through all the columns of a dataframe and modify the data type
		to reduce memory usage.
	"""
	start_mem = df.memory_usage().sum() / 1024 ** 2
	print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

	for col in df.columns:
		col_type = df[col].dtype

		if col_type != object:
			c_min = df[col].min()
			c_max = df[col].max()
			if str(col_type)[:3] == 'int':
				if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
					df[col] = df[col].astype(np.int8)
				elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
					df[col] = df[col].astype(np.int16)
				elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
					df[col] = df[col].astype(np.int32)
				elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
					df[col] = df[col].astype(np.int64)
			else:
				if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
					df[col] = df[col].astype(np.float16)
				elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
					df[col] = df[col].astype(np.float32)
				else:
					df[col] = df[col].astype(np.float64)
		else:
			df[col] = df[col].astype('category')
	end_mem = df.memory_usage().sum() / 1024 ** 2
	print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
	print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

	return df

# 数据读取优化
def import_data(file, **kwargs):
	"""create a dataframe and optimize its memory usage"""
	df = pd.read_csv(file, parse_dates=True, keep_date_col=True, **kwargs)
	df = reduce_mem_usage(df)
	return df

# print fmt
def print_fmt(display_text, data):
	print(f"---------------------{display_text} START---------------------------")
	print(data)
	print(f"---------------------{display_text} END-----------------------------\n")

In [4]:
train_data = import_data("./tmdata/train_format1.csv")
test_data = import_data("./tmdata/test_format1.csv")
user_info = import_data("./tmdata/user_info_format1.csv")
user_log  = import_data("./tmdata/user_log_format1.csv")
print_fmt("user_log", user_log)

Memory usage of dataframe is 5.97 MB
Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage of dataframe is 5.98 MB
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage of dataframe is 9.71 MB
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage of dataframe is 2933.33 MB
Memory usage after optimization is: 890.48 MB
Decreased by 69.6%
---------------------user_log START---------------------------
          user_id  item_id  cat_id  seller_id  brand_id  time_stamp  \
0          328862   323294     833       2882    2660.0         829   
1          328862   844400    1271       2882    2660.0         829   
2          328862   575153    1271       2882    2660.0         829   
3          328862   996875    1271       2882    2660.0         829   
4          328862  1086186    1271       1253    1049.0         829   
...           ...      ...     ...        ...       ...         ...   
54925325   208016   107662     898 

In [5]:
train_data.head(5)

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0
