In [1]:
import pandas as pd
from utils import check_nas, unique_values, values_only_in_set

### 1. Load dataset

In [2]:
# Load both splits with identical parser settings: the first CSV column becomes
# the index, and low_memory=False asks pandas to parse each file in one pass
# rather than in chunks (avoids mixed-type inference within a column).
train, test = (
    pd.read_csv(csv_path, index_col=0, low_memory=False)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

### 2. Check for NAs

Number of missing values in `FIELD_39` for Train (count, fraction)

In [3]:
# Missing-value summary for FIELD_39 in the training split. The displayed result
# reads as (NA count, NA fraction) -- TODO confirm against utils.check_nas.
check_nas(train, 'FIELD_39')

(9678, 0.3226)

Number of missing values in `FIELD_39` for Test (count, fraction)

In [4]:
# Missing-value summary for FIELD_39 in the test split. The displayed result
# reads as (NA count, NA fraction) -- TODO confirm against utils.check_nas.
check_nas(test, 'FIELD_39')

(6504, 0.2168)

### 3. Check for Unique Values

#### Unique values common to Train & Test

In [5]:
# FIELD_39 values that occur in both splits. The sets are built from the raw
# .unique() arrays, so the missing-value marker (nan) is kept and shows up in
# the result alongside the string codes.
set(train['FIELD_39'].unique()) & set(test['FIELD_39'].unique())

{'1',
 'AU',
 'CN',
 'CZ',
 'DE',
 'DK',
 'DL',
 'DM',
 'DT',
 'ES',
 'FR',
 'HK',
 'HQ',
 'IL',
 'IN',
 'JP',
 'KP',
 'KR',
 'MY',
 'NL',
 'NN',
 'NU',
 'None',
 'PH',
 'SG',
 'TH',
 'TQ',
 'TW',
 'UK',
 'US',
 'VN',
 'VU',
 nan}

#### Unique values in Train

In [6]:
# Frequency table of FIELD_39 in the training split. The displayed result is a
# pair of Series: per-value counts, then per-value proportions (presumably
# relative to non-missing rows -- TODO confirm against utils.unique_values).
unique_values(train, 'FIELD_39')

(VN      10569
 None     8997
 TW        158
 KR         97
 JP         95
 CN         85
 TQ         46
 CZ         37
 1          28
 HQ         27
 UK         23
 DL         23
 HK         20
 SG         17
 KP         14
 DE         12
 US         10
 TH          9
 MY          6
 NU          5
 IN          4
 NL          4
 FR          4
 NN          4
 DT          2
 N           2
 TS          2
 AD          2
 GB          2
 BE          2
 DK          2
 PH          1
 TL          1
 SE          1
 SC          1
 TK          1
 AU          1
 AE          1
 DM          1
 IL          1
 VU          1
 TR          1
 IT          1
 CA          1
 ES          1
 Name: FIELD_39, dtype: int64, VN      0.520077
 None    0.442722
 TW      0.007775
 KR      0.004773
 JP      0.004675
 CN      0.004183
 TQ      0.002264
 CZ      0.001821
 1       0.001378
 HQ      0.001329
 UK      0.001132
 DL      0.001132
 HK      0.000984
 SG      0.000837
 KP      0.000689
 DE      0.000590
 US    

#### Unique values in Test

In [7]:
# Frequency table of FIELD_39 in the test split. The displayed result is a
# pair of Series: per-value counts, then per-value proportions (presumably
# relative to non-missing rows -- TODO confirm against utils.unique_values).
unique_values(test, 'FIELD_39')

(VN      7016
 None    5972
 TW        92
 KR        82
 JP        63
 CN        48
 CZ        38
 TQ        30
 DL        19
 HQ        17
 1         15
 HK        11
 SG        10
 US         9
 DE         9
 UK         8
 DT         7
 KP         5
 TH         5
 PH         5
 AU         5
 FR         4
 ID         3
 CH         3
 MY         3
 NL         3
 IN         2
 NN         2
 AT         1
 DK         1
 VU         1
 AN         1
 WS         1
 DM         1
 IL         1
 NU         1
 AO         1
 ES         1
 Name: FIELD_39, dtype: int64, VN      0.519858
 None    0.442501
 TW      0.006817
 KR      0.006076
 JP      0.004668
 CN      0.003557
 CZ      0.002816
 TQ      0.002223
 DL      0.001408
 HQ      0.001260
 1       0.001111
 HK      0.000815
 SG      0.000741
 US      0.000667
 DE      0.000667
 UK      0.000593
 DT      0.000519
 KP      0.000370
 TH      0.000370
 PH      0.000370
 AU      0.000370
 FR      0.000296
 ID      0.000222
 CH      0.000222
 MY   

### 4. Compare Train and Test

Unique values that appear only in Train (not in Test)

In [8]:
# FIELD_39 values seen in train but not in test. The displayed result is a
# (set of values, size of that set) pair; argument order decides the direction
# of the comparison (first frame minus second).
values_only_in_set(train, test, 'FIELD_39')

({'AD', 'AE', 'BE', 'CA', 'GB', 'IT', 'N', 'SC', 'SE', 'TK', 'TL', 'TR', 'TS'},
 13)

Unique values that appear only in Test (not in Train)

In [9]:
# FIELD_39 values seen in test but not in train. The displayed result is a
# (set of values, size of that set) pair; argument order decides the direction
# of the comparison (first frame minus second).
values_only_in_set(test, train, 'FIELD_39')

({'AN', 'AO', 'AT', 'CH', 'ID', 'WS'}, 6)