In [2]:
import pandas as pd
from pandas import DataFrame
from sklearn.impute import SimpleImputer

# Analyzing the Dataset

In [3]:
dataframe = pd.read_csv("aps_training_set_sample3.csv")
display(dataframe[0:5])
display("DataFrame rows: %s" % (dataframe.size))
display("DataFrame columns: %s" % (len(dataframe.columns)))

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60874,?,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
1,pos,153204,0,182,?,0,0,0,0,0,...,129862,26872,34044,22472,34362,0,0,0,0,0
2,neg,41212,0,2130706434,104,104,172,0,0,0,...,171390,95844,177206,173184,404690,736142,9268,6,0,0
3,pos,453236,?,2926,?,0,0,0,0,222,...,7908038,3026002,5025350,2025766,1160638,533834,493800,6914,0,0
4,neg,11572,0,62,?,0,0,0,0,0,...,246058,90772,85754,28014,27874,78510,0,0,0,0


'DataFrame rows: 528561'

'DataFrame columns: 171'

# Deal with Missing Data
- Token considered missing: ?
- Imputer Strategy: most frequent token

In [4]:
pd.set_option('display.expand_frame_repr', False)

imputer = SimpleImputer(missing_values='?', strategy = 'most_frequent')
result = imputer.fit_transform(dataframe)

preprocess_df = pd.DataFrame(result)
preprocess_df.columns = dataframe.columns
preprocess_df.index = dataframe.index
preprocess_df[:5]

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60874,0,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
1,pos,153204,0,182,0,0,0,0,0,0,...,129862,26872,34044,22472,34362,0,0,0,0,0
2,neg,41212,0,2130706434,104,104,172,0,0,0,...,171390,95844,177206,173184,404690,736142,9268,6,0,0
3,pos,453236,0,2926,0,0,0,0,0,222,...,7908038,3026002,5025350,2025766,1160638,533834,493800,6914,0,0
4,neg,11572,0,62,0,0,0,0,0,0,...,246058,90772,85754,28014,27874,78510,0,0,0,0


# Normalizing Data
- Select features that contains top-10 stddev

In [5]:
pd.options.display.max_columns = None
for c in preprocess_df.columns[1:].values:
    preprocess_df[c] = pd.to_numeric(preprocess_df[c])
std_df = preprocess_df.std(axis = 0, skipna=True)
top10_std = std_df.sort_values(ascending=[False])[:10]
display(top10_std)

ac_000    6.777956e+08
dq_000    2.276409e+08
eb_000    9.286260e+07
bb_000    2.860221e+07
bv_000    2.849327e+07
bu_000    2.849326e+07
cq_000    2.849326e+07
bx_000    2.702470e+07
cc_000    2.544896e+07
du_000    2.436939e+07
dtype: float64

In [16]:
from sklearn import preprocessing
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,5))
top10_std_columns = top10_std.index
for column in top10_std_columns:
    x = min_max_scaler.fit_transform(preprocess_df[[column]].values.astype(float))
    preprocess_df[column] = x
preprocess_df[0:5]

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,ag_004,ag_005,ag_006,ag_007,ag_008,ag_009,ah_000,ai_000,aj_000,ak_000,al_000,am_0,an_000,ao_000,ap_000,aq_000,ar_000,as_000,at_000,au_000,av_000,ax_000,ay_000,ay_001,ay_002,ay_003,ay_004,ay_005,ay_006,ay_007,ay_008,ay_009,az_000,az_001,az_002,az_003,az_004,az_005,az_006,az_007,az_008,az_009,ba_000,ba_001,ba_002,ba_003,ba_004,ba_005,ba_006,ba_007,ba_008,ba_009,bb_000,bc_000,bd_000,be_000,bf_000,bg_000,bh_000,bi_000,bj_000,bk_000,bl_000,bm_000,bn_000,bo_000,bp_000,bq_000,br_000,bs_000,bt_000,bu_000,bv_000,bx_000,by_000,bz_000,ca_000,cb_000,cc_000,cd_000,ce_000,cf_000,cg_000,ch_000,ci_000,cj_000,ck_000,cl_000,cm_000,cn_000,cn_001,cn_002,cn_003,cn_004,cn_005,cn_006,cn_007,cn_008,cn_009,co_000,cp_000,cq_000,cr_000,cs_000,cs_001,cs_002,cs_003,cs_004,cs_005,cs_006,cs_007,cs_008,cs_009,ct_000,cu_000,cv_000,cx_000,cy_000,cz_000,da_000,db_000,dc_000,dd_000,de_000,df_000,dg_000,dh_000,di_000,dj_000,dk_000,dl_000,dm_000,dn_000,do_000,dp_000,dq_000,dr_000,ds_000,dt_000,du_000,dv_000,dx_000,dy_000,dz_000,ea_000,eb_000,ec_00,ed_000,ee_000,ee_001,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60874,0,0.0,458,0,0,0,0,0,0,43752,1966618,1800340,131646,4588,0,1974038,0,226,0,0,0,3230626,2618878,1058136,551022,0,0,0,0,1788,642,0,0,0,0,42124,372236,2128914,819596,584074,0,1644,362,562,842,30194,3911734,1606,0,0,0,1348578,1035668,338762,236540,182278,151778,163248,470800,19292,0,0.11,448,556,642,2,1974038,86454,653692,399410,306780,282560,274180,1310700,1310700,1310700,1310700,1310700,189000,60874.03,0.11,0.11,0.13,24793.0,17052,61844,654700,0.14,1209600,135720,0,152,0,3565684.8,0.0,379111.68,0,746,0,0,356,378910,2497104,993000,64230,10482,2776,86,202,212,0.11,0,3942,520,80950,227322,186242,2288268,1137268,22228,204,0,1716,1664,3440288,215826,0,4262,0,0,3590004,2026,444,0,0,0,0,0,0,0,0,44946,62648,11506,0.0,0,149474,35154,0.01,80482,98334,27588,0,0,0.08,1116.06,1176,404740,904230,622012,229790,405298,347188,286954,311560,433954,1218,0,0
1,pos,153204,0,0.0,0,0,0,0,0,0,11804,684444,326536,31586,0,0,0,2658638,14346,0,0,29384,46356,5566182,4426834,1571480,1120644,2,0,0,0,1368,862,0,0,0,0,0,0,0,94188,960182,0,250,16,32,146,126,951126,91162,11512,0,0,702994,186172,69282,36640,20698,17720,18586,2278,0,0,0.19,692,152,1690,294,2658638,126990,355360,1215418,263620,263960,253220,260500,270840,1310700,1310700,1310700,255200,153203.49,0.19,0.19,0.03,2169.0,415352,108024,1114900,0.24,1209600,0,0,0,0,14779408.32,5602235.52,6587273.28,0,8,0,0,32854,113024,413450,341800,110462,40426,2354,0,0,0,0.19,0,556,62,11606,370336,119892,399936,149156,2826,0,0,0,0,0,0,0,0,0,0,0,16910,218,0,0,55760,1433920,0,0,0,0,51558,0,0,0.0,0,0,0,0.0,0,0,0,0,0,0.0,1.9,166,512878,293880,129862,26872,34044,22472,34362,0,0,0,0,0
2,neg,41212,0,5.0,104,104,172,0,0,0,0,9268,1343962,881214,81244,3876,0,1027430,0,32,0,0,0,2166316,1929602,679632,169490,0,0,3850,0,148,628,0,0,0,0,0,0,22990,796396,1500178,0,1602,1200,1992,5188,47486,175800,2086290,6,0,0,724890,783694,245928,138998,93056,74112,63832,96376,51000,47678,0.07,754,3128,6292,2,1027430,27030,499604,178442,301120,271600,1310700,1310700,1310700,1310700,1310700,1310700,100380,41212.87,0.07,0.07,0.09,14123.0,0,38044,419060,0.08,1209600,63758,2,78,0,2067268.8,0.0,250670.4,4,514,0,0,24,102928,754000,1027420,323420,99164,12510,98,288,288,0.07,0,3280,278,62158,170718,373006,1691314,18804,6,0,0,88,928,2084430,924590,0,1670,0,0,2085538,1988,1592,0,0,0,0,0,0,0,0,14900,8576,3004,0.01,628460,62386,11574,0.19,1216948,2745760,12524,0,0,0.0,1216.44,1244,265986,285848,171390,95844,177206,173184,404690,736142,9268,6,0,0
3,pos,453236,0,0.0,0,0,0,0,0,222,323436,2999280,20657518,12530224,913700,12670,0,18489312,19038,0,0,117902,203782,32633282,28658284,5348896,3667892,0,0,0,0,24514,1858,2273870,2608460,1424956,1607946,1409344,2051868,2227058,8113476,15720072,0,28828,33264,47402,627172,26128758,10570790,804,22,8,2,12630142,9040546,7110184,4970218,2616558,886208,160242,22100,842,10,0.99,632,1686,2018,684,18489312,447850,2507262,2827898,366980,314860,176220,232680,162300,189300,187600,189400,190440,453236.92,0.99,0.99,1.16,189372.0,8436120,0,565520,1.3,1209600,0,0,0,0,33480473.28,0.0,3935856.96,0,0,0,24598,523608,11350372,19808010,5225236,439104,59042,6950,130,0,42,0.99,0,16738,3392,1441482,1867216,1513156,21289878,11184234,120904,50,0,0,0,0,0,0,0,0,0,0,18132,2442,0,0,0,0,0,0,0,0,260830,169964,26740,0.1,16623244,1056682,155406,1.47,28423762,7966,382610,0,0,1.64,0.0,0,4079752,13176956,7908038,3026002,5025350,2025766,1160638,533834,493800,6914,0,0
4,neg,11572,0,0.0,0,0,0,0,0,0,0,272346,1208300,280120,20940,0,0,794230,0,0,0,3794,5686,1752498,1658112,159158,71502,0,0,0,0,0,0,0,0,0,0,0,0,0,105566,1676140,0,5922,1008,850,764,2044,1086250,684860,8,0,0,1052330,308284,137208,111202,56756,34530,23454,20076,10938,26928,0.05,40,76,392,410,794230,16618,66900,91606,113740,1310700,1310700,1310700,1310700,1310700,1310700,1310700,41240,11572.39,0.05,0.05,0.06,5995.0,171822,68724,701140,0.06,1209600,0,0,0,0,845237.76,650043.84,282062.4,10,152,0,0,12866,180186,798630,507812,136328,69732,50498,25654,0,0,0.05,0,6148,472,177768,171584,102876,673338,620524,27938,1056,2,0,0,0,0,0,0,0,0,0,1720,284,0,0,15120,655834,0,0,0,0,8624,2316,270,0.0,0,48872,9340,0.02,507524,0,0,0,0,0.0,1055.52,1080,951334,273390,246058,90772,85754,28014,27874,78510,0,0,0,0
