In [1]:
%matplotlib inline
import pickle
import gc
import math
import time
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline

pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 3000)

In [12]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [6]:
outcome_df = pd.read_csv('C:/D_Disk/data_competition/gamer_value/outcome/submission_merged_new.csv')
print(mem_usage(outcome_df))

test_df_tt = pd.read_csv('C:/D_Disk/data_competition/gamer_value/data/tap_fun_test.csv', 
                       index_col=0, header=0)
print(mem_usage(test_df_tt))

12.65 MB
736.78 MB


In [7]:
train_df_tt = pd.read_csv('C:/D_Disk/data_competition/gamer_value/data/tap_fun_train.csv', 
                       index_col=0, header=0)
print(mem_usage(train_df_tt))

  mask |= (ar1 == a)


2051.09 MB


In [34]:
train_df_tt.info()
train_df_tt.columns
dtype_dict = dict(train_df_tt.dtypes)
print(dtype_dict)
for col_name, col_type in dtype_dict.items():
    if dtype_dict[col_name]==np.int64:
        dtype_dict[col_name]=np.int8
    elif dtype_dict[col_name]==np.float64:
        dtype_dict[col_name]=np.float16
print('after change: ')
print(dtype_dict)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2288007 entries, 1 to 3190530
Columns: 108 entries, register_time to prediction_pay_price
dtypes: float64(13), int64(94), object(1)
memory usage: 1.9+ GB
{'register_time': dtype('O'), 'wood_add_value': dtype('float64'), 'wood_reduce_value': dtype('float64'), 'stone_add_value': dtype('float64'), 'stone_reduce_value': dtype('float64'), 'ivory_add_value': dtype('float64'), 'ivory_reduce_value': dtype('float64'), 'meat_add_value': dtype('float64'), 'meat_reduce_value': dtype('float64'), 'magic_add_value': dtype('float64'), 'magic_reduce_value': dtype('float64'), 'infantry_add_value': dtype('int64'), 'infantry_reduce_value': dtype('int64'), 'cavalry_add_value': dtype('int64'), 'cavalry_reduce_value': dtype('int64'), 'shaman_add_value': dtype('int64'), 'shaman_reduce_value': dtype('int64'), 'wound_infantry_add_value': dtype('int64'), 'wound_infantry_reduce_value': dtype('int64'), 'wound_cavalry_add_value': dtype('int64'), 'wound_cavalry_reduc

In [35]:
train_df_tt_new = pd.read_csv('C:/D_Disk/data_competition/gamer_value/data/tap_fun_train.csv', 
                               index_col=0, header=0, dtype=dtype_dict)

  mask |= (ar1 == a)


In [36]:
train_df_tt_new.info()
print(mem_usage(train_df_tt_new))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2288007 entries, 1 to 3190530
Columns: 108 entries, register_time to prediction_pay_price
dtypes: float16(13), int8(94), object(1)
memory usage: 296.8+ MB
445.13 MB


In [2]:
gl = pd.read_csv('C:/D_Disk/data_competition/common_code/dataquest-mlb-game-logs/dataquest-mlb-game-logs/data/game_logs.csv')
gl.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,date,number_of_game,day_of_week,v_name,v_league,v_game_number,h_name,h_league,h_game_number,v_score,h_score,length_outs,day_night,completion,forefeit,protest,park_id,attendance,length_minutes,v_line_score,h_line_score,v_at_bats,v_hits,v_doubles,v_triples,v_homeruns,v_rbi,v_sacrifice_hits,v_sacrifice_flies,v_hit_by_pitch,...,v_player_9_def_pos,h_player_1_id,h_player_1_name,h_player_1_def_pos,h_player_2_id,h_player_2_name,h_player_2_def_pos,h_player_3_id,h_player_3_name,h_player_3_def_pos,h_player_4_id,h_player_4_name,h_player_4_def_pos,h_player_5_id,h_player_5_name,h_player_5_def_pos,h_player_6_id,h_player_6_name,h_player_6_def_pos,h_player_7_id,h_player_7_name,h_player_7_def_pos,h_player_8_id,h_player_8_name,h_player_8_def_pos,h_player_9_id,h_player_9_name,h_player_9_def_pos,additional_info,acquisition_info
0,18710504,0,Thu,CL1,na,1,FW1,na,1,0,2,54.0,D,,,,FOR01,200.0,120.0,0,10010000,30.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,selmf101,Frank Sellman,5.0,mathb101,Bobby Mathews,1.0,foraj101,Jim Foran,3.0,goldw101,Wally Goldsmith,6.0,lennb101,Bill Lennon,2.0,caret101,Tom Carey,4.0,mince101,Ed Mincher,7.0,mcdej101,James McDermott,8.0,kellb105,Bill Kelly,9.0,,Y
1,18710505,0,Fri,BS1,na,1,WS3,na,1,20,18,54.0,D,,,,WAS01,5000.0,145.0,107000435,640113030,41.0,13.0,1.0,2.0,0.0,13.0,0.0,0.0,0.0,...,1.0,watef102,Fred Waterman,5.0,forcd101,Davy Force,6.0,mille105,Everett Mills,3.0,allid101,Doug Allison,2.0,hallg101,George Hall,7.0,leona101,Andy Leonard,4.0,braia102,Asa Brainard,1.0,burrh101,Henry Burroughs,9.0,berth101,Henry Berthrong,8.0,HTBF,Y
2,18710506,0,Sat,CL1,na,2,RC1,na,1,12,4,54.0,D,,,,RCK01,1000.0,140.0,610020003,10020100,49.0,11.0,1.0,1.0,0.0,8.0,0.0,0.0,0.0,...,6.0,mackd101,Denny Mack,3.0,addyb101,Bob Addy,4.0,fishc102,Cherokee Fisher,1.0,hasts101,Scott Hastings,8.0,ham-r101,Ralph Ham,5.0,ansoc101,Cap Anson,2.0,sagep101,Pony Sager,6.0,birdg101,George Bird,7.0,stirg101,Gat Stires,9.0,,Y
3,18710508,0,Mon,CL1,na,3,CH1,na,1,12,14,54.0,D,,,,CHI01,5000.0,150.0,101403111,77000000,46.0,15.0,2.0,1.0,2.0,10.0,0.0,0.0,0.0,...,6.0,mcatb101,Bub McAtee,3.0,kingm101,Marshall King,8.0,hodec101,Charlie Hodes,2.0,woodj106,Jimmy Wood,4.0,simmj101,Joe Simmons,9.0,folet101,Tom Foley,7.0,duffe101,Ed Duffy,6.0,pinke101,Ed Pinkham,5.0,zettg101,George Zettlein,1.0,,Y
4,18710509,0,Tue,BS1,na,2,TRO,na,1,9,5,54.0,D,,,,TRO01,3250.0,145.0,2232,101003000,46.0,17.0,4.0,1.0,0.0,6.0,0.0,0.0,0.0,...,1.0,flync101,Clipper Flynn,9.0,mcgem101,Mike McGeary,2.0,yorkt101,Tom York,8.0,mcmuj101,John McMullin,1.0,kings101,Steve King,7.0,beave101,Edward Beavens,4.0,bells101,Steve Bellan,5.0,pikel101,Lip Pike,3.0,cravb101,Bill Craver,6.0,HTBF,Y


In [11]:
gl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(76), int64(6), object(79)
memory usage: 211.2+ MB


In [13]:
gl.info(memory_usage='deep')
print(mem_usage(gl))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(76), int64(6), object(79)
memory usage: 864.4 MB
864.44 MB


In [26]:
gl['h_player_7_def_pos'].shape

(171907,)

In [27]:
len(gl['park_id'].value_counts())
gl_obj = gl.select_dtypes(include=['object']).copy()
gl_obj.describe()

Unnamed: 0,day_of_week,v_name,v_league,h_name,h_league,day_night,completion,forefeit,protest,park_id,v_line_score,h_line_score,h_triple_plays,hp_umpire_id,hp_umpire_name,1b_umpire_id,1b_umpire_name,2b_umpire_id,2b_umpire_name,3b_umpire_id,3b_umpire_name,lf_umpire_id,lf_umpire_name,rf_umpire_id,rf_umpire_name,v_manager_id,v_manager_name,h_manager_id,h_manager_name,winning_pitcher_id,...,v_player_5_id,v_player_5_name,v_player_6_id,v_player_6_name,v_player_7_id,v_player_7_name,v_player_8_id,v_player_8_name,v_player_9_id,v_player_9_name,h_player_1_id,h_player_1_name,h_player_2_id,h_player_2_name,h_player_3_id,h_player_3_name,h_player_4_id,h_player_4_name,h_player_5_id,h_player_5_name,h_player_6_id,h_player_6_name,h_player_7_id,h_player_7_name,h_player_8_id,h_player_8_name,h_player_9_id,h_player_9_name,additional_info,acquisition_info
count,171907,171907,171907,171907,171907,140150,116,145,180,171907,147271,147271,140838,171888,171891,147040,171891,88540,171127,116723,171135,203,171902,9,171902,171907,171907,171907,171907,140229,...,140838,140838,140838,140838,140838,140838,140838,140838,140835,140835,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,1456,140841
unique,7,148,7,148,7,2,116,3,5,245,36367,37859,2,1149,1146,678,678,324,325,362,363,31,32,8,9,648,648,659,659,5123,...,3757,3722,4794,4736,5301,5241,4812,4763,5643,5585,2802,2782,3648,3614,2881,2858,2533,2517,3696,3660,4774,4720,5253,5197,4760,4710,5193,5142,332,1
top,Sat,CHN,NL,CHN,NL,D,"19550830,,1,0,45",H,V,STL07,0,0,False,klemb901,Bill Klem,connt901,(none),westj901,(none),mcgob901,(none),sudoe901,(none),gormt101,(none),mackc101,Connie Mack,mackc101,Connie Mack,johnw102,...,heilh101,Harry Heilmann,grimc101,Charlie Grimm,grimc101,Charlie Grimm,lopea102,Al Lopez,grifa001,Alfredo Griffin,suzui001,Ichiro Suzuki,fox-n101,Nellie Fox,speat101,Tris Speaker,gehrl101,Lou Gehrig,heilh101,Harry Heilmann,grimc101,Charlie Grimm,grimc101,Charlie Grimm,lopea102,Al Lopez,spahw101,Warren Spahn,HTBF,Y
freq,28891,8870,88866,9024,88867,82724,1,69,90,7022,10102,8028,140603,3545,3545,2029,24851,815,82587,1129,54412,30,171699,2,171893,3901,3901,3848,3848,385,...,663,663,465,465,485,485,687,687,333,333,927,927,859,859,1165,1165,752,752,612,612,427,427,491,491,676,676,339,339,1112,140841


In [14]:
for dtype in ['float','int','object']:
    selected_dtype = gl.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for float columns: 1.29 MB
Average memory usage for int columns: 0.00 MB
Average memory usage for object columns: 9.46 MB


In [20]:
int_types = ["uint8", "int8", "int16", 'int64']
for it in int_types:
    print(np.iinfo(it))
    
float_types = ['float16', 'float32', 'float64']
for ft in float_types:
    print(np.finfo(ft))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

Machine parameters for float16
---------------------------------------------------------------
precision =   3   resolution = 1.00040e-03
machep =    -10   eps =        9.76562e-04
negep =     -11   epsneg =     4.88281e-04
minexp =    -14   tiny =    

In [22]:
gl_int = gl.select_dtypes(include=['int64'])
converted_int = gl_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(gl_int))
print(mem_usage(converted_int))

7.87 MB
1.48 MB


[['1', '4', '5', '6', '9', "'aaaa'"]]
