In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold

In [3]:
# Read the training set from the project-relative CSV file
data_train = pd.read_csv(filepath_or_buffer="data/train.csv")

# Preview the first five rows
data_train.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [4]:
# Show the dtype of every column of the loaded training frame
data_train.dtypes

id          int64
cat0       object
cat1       object
cat2       object
cat3       object
cat4       object
cat5       object
cat6       object
cat7       object
cat8       object
cat9       object
cont0     float64
cont1     float64
cont2     float64
cont3     float64
cont4     float64
cont5     float64
cont6     float64
cont7     float64
cont8     float64
cont9     float64
cont10    float64
cont11    float64
cont12    float64
cont13    float64
target    float64
dtype: object

In [5]:
# (number of rows, number of columns) of the training frame
data_train.shape

(300000, 26)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data_train.describe()

Unnamed: 0,id,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,250018.576947,0.524634,0.506649,0.444115,0.446214,0.455471,0.508337,0.478345,0.455904,0.459321,0.526899,0.504943,0.529938,0.524549,0.503349,7.45626
std,144450.15001,0.204875,0.235269,0.200089,0.238669,0.200695,0.231612,0.192432,0.204493,0.220642,0.204025,0.201549,0.23086,0.220892,0.225218,0.887295
min,1.0,-0.093505,-0.055105,-0.060274,0.13476,0.189216,-0.087247,0.043953,0.208703,0.004041,0.07304,0.059644,0.064161,-0.0056,0.158121,0.0
25%,124772.5,0.370451,0.352307,0.314121,0.214572,0.279853,0.338747,0.339896,0.278041,0.308655,0.361957,0.338898,0.316662,0.332143,0.291289,6.798341
50%,250002.5,0.492208,0.615156,0.457271,0.377823,0.411351,0.441384,0.41009,0.360736,0.425801,0.488867,0.519855,0.558827,0.407365,0.433909,7.496503
75%,375226.5,0.654793,0.68815,0.554835,0.719758,0.621808,0.709515,0.604246,0.639388,0.541525,0.752765,0.672809,0.720381,0.732431,0.73087,8.161166
max,499999.0,1.052666,0.851746,1.017689,1.006469,0.99405,1.044433,1.093312,1.036541,1.014156,0.972091,1.029773,1.038049,0.96137,0.873579,10.309208


In [7]:
# Separate the label column from the feature columns.
# NOTE(review): only "target" is removed, so the `id` column remains in X and is
# later picked up as a numeric feature — confirm whether that is intended.
y = data_train["target"]
X = data_train.drop(columns=["target"])

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Keep only the numeric columns of each split; the variance filters below
# cannot handle the string-typed `cat*` columns
X_train_numeric = X_train.select_dtypes(include="number")
X_test_numeric = X_test.select_dtypes(include="number")

In [10]:
# Fit a zero-threshold variance filter on the training split only, so that it
# learns which numeric columns are constant without looking at the test rows
constant_filter = VarianceThreshold(threshold=0).fit(X_train_numeric)
constant_filter

VarianceThreshold(threshold=0)

In [11]:
# Number of columns the fitted constant filter keeps
# (count of True entries in its boolean support mask)
constant_filter.get_support().sum()

15

In [12]:
# Negate the support mask so that True marks a column the filter would drop
constant_list = np.logical_not(constant_filter.get_support()).tolist()
print(constant_list)

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]


In [13]:
# Names of the columns flagged by the inverted mask, i.e. the constant columns
X_train_numeric.columns[constant_list]

Index([], dtype='object')

In [14]:
# Transform both splits into the non-constant feature space, i.e. keep only
# the columns the fitted constant filter selected. A constant column has the
# same value in every training row, so it cannot help a model tell rows apart.
X_train_filter, X_test_filter = (
    constant_filter.transform(split)
    for split in (X_train_numeric, X_test_numeric)
)

In [19]:
# Quasi-constant filter: flags columns that are almost constant, here defined
# as having a training variance that does not exceed 0.02
quasi_constant_filter = VarianceThreshold(threshold=0.02).fit(X_train_numeric)
quasi_constant_filter

VarianceThreshold(threshold=0.02)

In [20]:
# Number of columns that survive the quasi-constant filter
quasi_constant_filter.get_support().sum()

15

In [21]:
# Boolean mask over the fitted columns: True where the column is kept
quasi_constant_filter.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [22]:
# Negate the support mask so that True marks a quasi-constant column to drop
quasi_constant_list = np.logical_not(quasi_constant_filter.get_support()).tolist()
print(quasi_constant_list)

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]


In [23]:
# Names of the columns flagged by the inverted mask, i.e. the quasi-constant columns
X_train_numeric.columns[quasi_constant_list]

Index([], dtype='object')

In [24]:
# Apply the quasi-constant filter to the same numeric frames it was fitted on.
# It was fitted on X_train_numeric, so feeding it the output of the constant
# filter only works while that first filter removes no column; as soon as it
# drops one, the feature count no longer matches the fit and transform raises.
# Nothing is lost by skipping the intermediate arrays: a 0.02 variance
# threshold also removes every zero-variance column.
X_train_quasi_filter = quasi_constant_filter.transform(X_train_numeric)
X_test_quasi_filter = quasi_constant_filter.transform(X_test_numeric)

In [25]:
# Shapes of the filtered train and test arrays
X_train_quasi_filter.shape,X_test_quasi_filter.shape

((240000, 15), (60000, 15))

In [27]:
# Transpose so that every feature becomes a row (features x samples)
X_train_numeric_T = np.transpose(X_train_quasi_filter)
X_test_numeric_T = np.transpose(X_test_quasi_filter)

In [28]:
# Wrap the transposed arrays in DataFrames (one row per feature) so that
# pandas row-wise helpers can be used on them
X_train_numeric_T = pd.DataFrame(data=X_train_numeric_T)
X_test_numeric_T = pd.DataFrame(data=X_test_numeric_T)

tuple(frame.shape for frame in (X_train_numeric_T, X_test_numeric_T))

((15, 240000), (15, 60000))

In [30]:
# Each row of the transposed frame is one feature, so this counts features
# whose training values exactly repeat an earlier feature
X_train_numeric_T.duplicated().sum()

0