In [1]:
import pandas as pd
from utils import check_nas, unique_values, values_only_in_set

### 1. Load dataset

In [2]:
# Read both competition splits from the local data/ folder.
# index_col=0 uses the first CSV column as the row index; low_memory=False
# parses each file in a single pass so column dtypes are inferred consistently.
train = pd.read_csv(
    'data/train.csv',
    index_col=0,
    low_memory=False,
)
test = pd.read_csv(
    'data/test.csv',
    index_col=0,
    low_memory=False,
)

### 2. Check for NAs

Check the number of missing values in Train

In [3]:
# Missing-value summary for FIELD_13 in the training split.
# The recorded output is a 2-tuple, presumably (NaN count, NaN share of rows)
# — confirm against utils.check_nas.
check_nas(train, 'FIELD_13')

(9700, 0.3233333333333333)

Check the number of missing values in Test

In [4]:
# Missing-value summary for FIELD_13 in the test split.
# The recorded output is a 2-tuple, presumably (NaN count, NaN share of rows)
# — confirm against utils.check_nas.
check_nas(test, 'FIELD_13')

(6518, 0.21726666666666666)

### 3. Check for Unique Values

#### Unique values shared by Train & Test

In [10]:
# FIELD_13 categories that occur in both splits (set intersection).
# NOTE(review): Series.unique() keeps NaN as a value — confirm whether missing
# values should be allowed to count as "shared" here.
set(train['FIELD_13'].unique()) & set(test['FIELD_13'].unique())

{'0',
 '4',
 'A2',
 'A6',
 'AA',
 'AB',
 'AC',
 'AD',
 'AH',
 'AI',
 'AL',
 'AO',
 'AP',
 'AQ',
 'AR',
 'AS',
 'B2',
 'BA',
 'BB',
 'BD',
 'BE',
 'BG',
 'BH',
 'BI',
 'BK',
 'BM',
 'BN',
 'BO',
 'BP',
 'BQ',
 'BT',
 'BX',
 'CA',
 'CB',
 'CE',
 'CK',
 'DA',
 'DD',
 'DE',
 'DG',
 'DH',
 'DT',
 'DZ',
 'EA',
 'EB',
 'ED',
 'EF',
 'EJ',
 'EP',
 'F4',
 'FA',
 'FB',
 'FC',
 'FD',
 'FE',
 'FF',
 'FG',
 'FH',
 'FI',
 'FJ',
 'FK',
 'FL',
 'FM',
 'FO',
 'FP',
 'FQ',
 'FR',
 'FX',
 'FY',
 'H3',
 'HA',
 'HB',
 'HC',
 'HD',
 'HE',
 'HF',
 'HG',
 'HH',
 'HI',
 'HJ',
 'HK',
 'HL',
 'HM',
 'HN',
 'HO',
 'HP',
 'HQ',
 'HR',
 'HS',
 'HT',
 'HU',
 'HV',
 'HW',
 'HX',
 'HZ',
 'IA',
 'KC',
 'KT',
 'NB',
 'NC',
 'ND',
 'NE',
 'NF',
 'NG',
 'NH',
 'NI',
 'NJ',
 'NK',
 'NL',
 'NN',
 'NO',
 'NQ',
 'NU',
 'NV',
 'NX',
 'NZ',
 'QA',
 'QB',
 'QC',
 'QD',
 'QE',
 'QF',
 'QG',
 'QH',
 'QI',
 'QJ',
 'QK',
 'QL',
 'QM',
 'QO',
 'QP',
 'QW',
 'QZ',
 'SA',
 'SC',
 'SD',
 'SG',
 'SH',
 'SJ',
 'SK',
 'SL',
 'SM',
 'SO',
 

#### Unique values in Train

In [6]:
# Frequency table of FIELD_13 categories in the training split.
# The recorded output is a pair of Series that look like (absolute counts,
# relative frequencies) — confirm against utils.unique_values.
unique_values(train, 'FIELD_13')

(YN    4976
 BI    4857
 TA    1194
 BO    1071
 TZ     610
       ... 
 CC       1
 NQ       1
 DQ       1
 EP       1
 EF       1
 Name: FIELD_13, Length: 227, dtype: int64, YN    0.245123
 BI    0.239261
 TA    0.058818
 BO    0.052759
 TZ    0.030049
         ...   
 CC    0.000049
 NQ    0.000049
 DQ    0.000049
 EP    0.000049
 EF    0.000049
 Name: FIELD_13, Length: 227, dtype: float64)

#### Unique values in Test

In [7]:
# Frequency table of FIELD_13 categories in the test split.
# The recorded output is a pair of Series that look like (absolute counts,
# relative frequencies) — confirm against utils.unique_values.
unique_values(test, 'FIELD_13')

(YN    3309
 BI    3116
 TA     800
 BO     771
 TZ     441
       ... 
 IS       1
 HO       1
 SJ       1
 HS       1
 CK       1
 Name: FIELD_13, Length: 210, dtype: int64, YN    0.245438
 BI    0.231123
 TA    0.059338
 BO    0.057187
 TZ    0.032710
         ...   
 IS    0.000074
 HO    0.000074
 SJ    0.000074
 HS    0.000074
 CK    0.000074
 Name: FIELD_13, Length: 210, dtype: float64)

### 4. Compare Train and Test

Unique values that appear only in Train

In [8]:
# FIELD_13 categories seen in train but never in test.
# The recorded output is a (set of values, size of that set) pair; the first
# frame passed appears to be the one whose exclusive values are reported
# — confirm against utils.values_only_in_set.
values_only_in_set(train, test, 'FIELD_13')

({'8',
  'AY',
  'BU',
  'CC',
  'CD',
  'CH',
  'CJ',
  'CR',
  'CZ',
  'DB',
  'DC',
  'DF',
  'DI',
  'DQ',
  'DW',
  'E2',
  'EE',
  'EG',
  'EH',
  'EI',
  'EL',
  'EN',
  'F1',
  'FN',
  'FS',
  'FU',
  'FV',
  'GB',
  'H1',
  'HY',
  'IC',
  'NT',
  'NW',
  'QU',
  'QX',
  'SE',
  'SF',
  'SI',
  'SN',
  'SP',
  'SR',
  'SV',
  'YF'},
 43)

Unique values that appear only in Test

In [9]:
# FIELD_13 categories seen in test but never in train (arguments swapped
# relative to the train-only check).
# The recorded output is a (set of values, size of that set) pair
# — confirm against utils.values_only_in_set.
values_only_in_set(test, train, 'FIELD_13')

({'12',
  'A1',
  'AE',
  'BJ',
  'BW',
  'CG',
  'CL',
  'DL',
  'DM',
  'EK',
  'EU',
  'FT',
  'H2',
  'H5',
  'H7',
  'IS',
  'KX',
  'N7',
  'NM',
  'NP',
  'NY',
  'QQ',
  'QS',
  'SB',
  'SZ',
  'ZA'},
 26)