In [10]:
import pandas as pd
from utils import check_nas, unique_values, values_only_in_set

### 1. Load dataset

In [11]:
# Read both splits with identical options: the first CSV column becomes the
# index, and low_memory=False makes pandas parse each file in a single pass.
train, test = (
    pd.read_csv(csv_path, index_col=0, low_memory=False)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

### 2. Check for NAs

Check the number of missing values in Train

In [12]:
# Missing-value check for FIELD_3 in the train split. The displayed result is a
# 2-tuple; presumably (missing count, missing fraction) - check_nas is defined
# in utils, so confirm there.
check_nas(train, 'FIELD_3')

(463, 0.015433333333333334)

Check the number of missing values in Test

In [13]:
# Missing-value check for FIELD_3 in the test split. The displayed result is a
# 2-tuple; presumably (missing count, missing fraction) - check_nas is defined
# in utils, so confirm there.
check_nas(test, 'FIELD_3')

(325, 0.010833333333333334)

### 3. Check for Unique Values

#### Unique values common to Train & Test

In [20]:
# Distinct FIELD_3 values that occur in both splits (set intersection).
set(train['FIELD_3'].unique()) & set(test['FIELD_3'].unique())

{-1.0,
 337.0,
 338.0,
 339.0,
 340.0,
 341.0,
 342.0,
 343.0,
 344.0,
 345.0,
 346.0,
 347.0,
 348.0,
 349.0,
 350.0,
 351.0,
 352.0,
 353.0,
 354.0,
 355.0,
 356.0,
 357.0,
 358.0,
 359.0,
 360.0,
 361.0,
 362.0,
 363.0,
 364.0,
 365.0,
 366.0,
 367.0,
 702.0,
 703.0,
 704.0,
 705.0,
 706.0,
 707.0,
 708.0,
 709.0,
 710.0,
 711.0,
 712.0,
 713.0,
 714.0,
 715.0,
 716.0,
 717.0,
 718.0,
 719.0,
 720.0,
 721.0,
 722.0,
 723.0,
 724.0,
 725.0,
 726.0,
 727.0,
 728.0,
 729.0,
 730.0,
 731.0,
 732.0,
 1067.0,
 1068.0,
 1069.0,
 1070.0,
 1071.0,
 1072.0,
 1073.0,
 1074.0,
 1075.0,
 1076.0,
 1077.0,
 1078.0,
 1079.0,
 1080.0,
 1081.0,
 1082.0,
 1083.0,
 1084.0,
 1085.0,
 1086.0,
 1087.0,
 1088.0,
 1089.0,
 1090.0,
 1091.0,
 1092.0,
 1093.0,
 1094.0,
 1095.0,
 1096.0,
 1097.0,
 1433.0,
 1434.0,
 1435.0,
 1436.0,
 1437.0,
 1438.0,
 1439.0,
 1440.0,
 1441.0,
 1442.0,
 1443.0,
 1444.0,
 1445.0,
 1446.0,
 1447.0,
 1448.0,
 1449.0,
 1450.0,
 1451.0,
 1452.0,
 1453.0,
 1454.0,
 1455.0,
 1456.0,
 1

#### Unique values in Train

In [15]:
# Distribution of FIELD_3 in the train split. The displayed result is a pair of
# Series: per-value counts (int64) and per-value proportions (float64).
unique_values(train, 'FIELD_3')

(-1.0       6996
  4019.0     199
  3993.0     180
  3997.0     153
  2901.0     148
            ... 
  6932.0       1
  4743.0       1
  4750.0       1
  8019.0       1
  6553.0       1
 Name: FIELD_3, Length: 513, dtype: int64, -1.0       0.236855
  4019.0    0.006737
  3993.0    0.006094
  3997.0    0.005180
  2901.0    0.005011
              ...   
  6932.0    0.000034
  4743.0    0.000034
  4750.0    0.000034
  8019.0    0.000034
  6553.0    0.000034
 Name: FIELD_3, Length: 513, dtype: float64)

#### Unique values in Test

In [16]:
# Distribution of FIELD_3 in the test split. The displayed result is a pair of
# Series: per-value counts (int64) and per-value proportions (float64).
unique_values(test, 'FIELD_3')

(-1.0       4716
  3997.0     117
  4019.0     108
  3993.0     106
  2901.0      96
            ... 
  5829.0       1
  5479.0       1
  5104.0       1
  6187.0       1
  6561.0       1
 Name: FIELD_3, Length: 472, dtype: int64, -1.0       0.239695
  3997.0    0.005947
  4019.0    0.005489
  3993.0    0.005388
  2901.0    0.004879
              ...   
  5829.0    0.000051
  5479.0    0.000051
  5104.0    0.000051
  6187.0    0.000051
  6561.0    0.000051
 Name: FIELD_3, Length: 472, dtype: float64)

### 4. Compare Train and Test

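Unique values that appear only in Train. Note: `nan` is listed here and in the Test-only set below even though both datasets contain missing values (see section 2). This is most likely because NaN never compares equal to itself, so it should not be read as a genuine difference between the two sets.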

In [17]:
# FIELD_3 values seen in train but not in test. The displayed result is a
# (set of values, integer) pair; the integer is presumably the size of the set -
# values_only_in_set is defined in utils, so confirm there.
# NOTE(review): nan appears in the result although test also has missing
# values; likely a NaN != NaN artefact rather than a real difference.
values_only_in_set(train, test, 'FIELD_3')

({nan,
  4369.0,
  4724.0,
  4727.0,
  4731.0,
  4732.0,
  4734.0,
  4736.0,
  4737.0,
  4738.0,
  4741.0,
  4743.0,
  4747.0,
  4749.0,
  4750.0,
  5086.0,
  5087.0,
  5091.0,
  5095.0,
  5106.0,
  5107.0,
  5111.0,
  5112.0,
  5450.0,
  5451.0,
  5453.0,
  5454.0,
  5461.0,
  5465.0,
  5469.0,
  5474.0,
  5819.0,
  5823.0,
  5826.0,
  5827.0,
  5828.0,
  5832.0,
  5836.0,
  5839.0,
  5840.0,
  5846.0,
  6183.0,
  6184.0,
  6196.0,
  6197.0,
  6201.0,
  6203.0,
  6205.0,
  6206.0,
  6210.0,
  6548.0,
  6549.0,
  6553.0,
  6555.0,
  6559.0,
  6563.0,
  6916.0,
  6932.0,
  6935.0,
  6936.0,
  6937.0,
  6940.0,
  7279.0,
  7281.0,
  7284.0,
  7291.0,
  7299.0,
  7307.0,
  7643.0,
  7645.0,
  7648.0,
  7649.0,
  7652.0,
  7657.0,
  7658.0,
  7662.0,
  7665.0,
  7672.0,
  8011.0,
  8018.0,
  8019.0,
  8020.0,
  8029.0,
  8030.0,
  8035.0},
 85)

Unique values that appear only in Test

In [18]:
# FIELD_3 values seen in test but not in train (arguments swapped relative to
# the train-only check). The displayed result is a (set of values, integer)
# pair; the integer is presumably the size of the set - confirm in utils.
# NOTE(review): nan appears in the result although train also has missing
# values; likely a NaN != NaN artefact rather than a real difference.
values_only_in_set(test, train, 'FIELD_3')

({nan,
  4744.0,
  4748.0,
  5088.0,
  5093.0,
  5097.0,
  5455.0,
  5466.0,
  5470.0,
  5472.0,
  5478.0,
  5820.0,
  5824.0,
  5829.0,
  5833.0,
  5837.0,
  5838.0,
  5845.0,
  6187.0,
  6195.0,
  6202.0,
  6552.0,
  6561.0,
  6572.0,
  6573.0,
  6575.0,
  6917.0,
  6925.0,
  6931.0,
  6933.0,
  6934.0,
  7283.0,
  7289.0,
  7298.0,
  7300.0,
  7301.0,
  7644.0,
  7663.0,
  7670.0,
  8015.0,
  8023.0,
  8024.0,
  8031.0,
  8033.0},
 44)