In [1]:
#import packages to do vectorization
import pandas as pd
import numpy as np

import os
import gc

from sklearn.model_selection import train_test_split
from tqdm import tqdm
from scipy.sparse import coo_matrix,hstack,vstack
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

#### Read preprocessed file

In [2]:
# Load the preprocessed training table. The file is tab-separated (sep='\t')
# even though it carries a .csv extension.
df_train = pd.read_csv("./data/pre_processed_df_train_1.csv",sep='\t', encoding='utf-8')

In [3]:
# Every object-dtype column is treated as a categorical feature.
categorical_columns=df_train.select_dtypes(include=object).columns.to_list()
print('# of categorical columns~~> {}'.format(len(categorical_columns)))
# Show the first categorical column name as a quick sanity check.
categorical_columns[0]

# of categorical columns~~> 36


'NAME_CONTRACT_TYPE'

In [4]:
# Independent (deep) copy of the categorical columns, so it stays usable after
# df_train itself is deleted further down to free memory.
df_cat=df_train[categorical_columns].copy(deep=True)

In [5]:
def convert_cat_to_vector(cat_columns,x_tr,x_te,y_tr):
    """Encode categorical columns as sparse bag-of-words count vectors.

    For every column a ``CountVectorizer`` vocabulary is learned from the
    training rows only and then applied to both splits, so values that occur
    only in the test split never create new features.

    Parameters
    ----------
    cat_columns : iterable of str
        Names of the categorical columns to encode, in output order.
    x_tr, x_te : pandas.DataFrame
        Training and test frames; both must contain every column in
        ``cat_columns``.
    y_tr : any
        Unused. Kept only so existing callers that pass it keep working.

    Returns
    -------
    tr_cat_vector, te_cat_vector : scipy.sparse matrix (COO format)
        Column-wise concatenation of the per-column count matrices.
    cat_feature_names : list of str
        Vocabulary tokens, aligned with the columns of the returned matrices.

    Raises
    ------
    ValueError
        If ``cat_columns`` is empty.
    """
    # Token pattern: runs of word characters and hyphens, so hyphenated values
    # stay a single token and one-character tokens are kept as well.
    pattern = "(?u)\\b[\\w-]+\\b"
    cat_feature_names=list()
    train_blocks = []
    test_blocks = []

    for feature in tqdm(cat_columns):
        # A fresh vectorizer per column keeps the vocabularies independent.
        vect_bow = CountVectorizer(token_pattern=pattern)
        # Use the frames passed in as arguments, not whatever X_train / X_test
        # happen to be bound to in the notebook's global namespace.
        train_blocks.append(vect_bow.fit_transform(x_tr[feature].values))
        test_blocks.append(vect_bow.transform(x_te[feature].values))

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back on older installations.
        names_fn = getattr(vect_bow, "get_feature_names_out", None) or vect_bow.get_feature_names
        cat_feature_names.extend(str(name) for name in names_fn())

    if not train_blocks:
        raise ValueError("cat_columns must contain at least one column name")

    # One concatenation at the end instead of growing the matrix per column.
    tr_cat_vector = hstack(train_blocks)
    te_cat_vector = hstack(test_blocks)
    return tr_cat_vector,te_cat_vector,cat_feature_names

In [7]:
# Class labels.
y = df_train['TARGET'].values
# Categorical features only (df_cat).


# Positional, unshuffled split: the first 67% of rows form the training set and
# the remaining rows the test set. (train_test_split is imported above but is
# not used in the cells shown here.)
train_pct_index = int(0.67 * len(df_cat))
X_train, X_test = df_cat[:train_pct_index], df_cat[train_pct_index:]
y_train, y_test =y[:train_pct_index], y[train_pct_index:]


# Encode the categorical columns as CountVectorizer token counts.

# y_tr is passed for signature compatibility; convert_cat_to_vector never reads it.
tr_cat_vect,te_cat_vect,cat_feature_names= convert_cat_to_vector(cat_columns=categorical_columns,x_tr=X_train,x_te=X_test,y_tr=y_train)

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:46<00:00,  1.29s/it]


#### Remove unnecessary columns

In [8]:
# Every non-object column is a candidate numeric feature.
numerical_columns=df_train.select_dtypes(exclude=object).columns.to_list()
# Drop the label and the columns that are not model features. list.remove
# raises ValueError if a name is missing, so this also checks they exist.
# NOTE(review): 'Unnamed: 0' looks like a saved index column and '1' like a
# preprocessing artifact - confirm against the upstream notebook.
numerical_columns.remove('TARGET')
numerical_columns.remove('Unnamed: 0')
numerical_columns.remove('SK_ID_CURR')
numerical_columns.remove('1')
print('# of Numerical columns~~> {}'.format(len(numerical_columns)))
# Show the first numeric column name as a quick sanity check.
numerical_columns[0]

# of Numerical columns~~> 977


'CNT_CHILDREN'

In [9]:
# Independent (deep) copy of the numeric feature columns, so it stays usable
# after df_train is deleted in the next cell.
df_num=df_train[numerical_columns].copy(deep=True)

In [10]:
# Release the full training frame; df_cat, df_num and y hold everything that is
# still needed from it. The trailing gc.collect() value is displayed as the
# cell output (number of unreachable objects found).
gc.enable()
del df_train
gc.collect()

14

In [11]:
# Replace +inf, -inf and NaN with 0, in place. No imputer runs afterwards in
# this notebook, so missing values are effectively zero-filled (not
# median-imputed) before scaling.
df_num.replace([np.inf, -np.inf,np.nan], 0, inplace=True)

In [12]:
# Same positional 67/33 boundary as the categorical split, now applied to the
# numeric frame. This rebinds X_train / X_test, which previously held the
# categorical slices.
train_pct_index = int(0.67 * len(df_num))
X_train, X_test = df_num[:train_pct_index], df_num[train_pct_index:]
y_train, y_test =y[:train_pct_index], y[train_pct_index:]

In [13]:
def convert_num_to_vector(num_columns,x_tr,x_te):
    """Standardize numeric columns with statistics learned from the training split.

    Each column is scaled with its own ``StandardScaler`` (mean removal and
    unit-variance scaling). The scaler is fitted on the training rows only and
    then applied to the test rows.

    Parameters
    ----------
    num_columns : iterable of str
        Names of the numeric columns to scale, in output order.
    x_tr, x_te : pandas.DataFrame
        Training and test frames; both must contain every column in
        ``num_columns``.

    Returns
    -------
    tr_num_vector, te_num_vector : numpy.ndarray
        2-D arrays with one column per entry of ``num_columns``.
    feature_names : list of str
        The column names, aligned with the columns of the returned arrays.

    Raises
    ------
    ValueError
        If ``num_columns`` is empty.
    """
    feature_names = list(num_columns)
    if not feature_names:
        raise ValueError("num_columns must contain at least one column name")

    train_cols = []
    test_cols = []
    for feature in tqdm(feature_names):
        stdscaler = StandardScaler(with_mean=True, with_std=True)
        # StandardScaler expects 2-D input, so each 1-D column is reshaped to
        # (n_samples, 1) first.
        # Fit on the training column only ...
        train_cols.append(stdscaler.fit_transform(x_tr[feature].values.reshape(-1,1)))
        # ... and reuse those training statistics for the test column.
        test_cols.append(stdscaler.transform(x_te[feature].values.reshape(-1,1)))

    # Stack once at the end: calling np.hstack inside the loop recopies the
    # whole growing matrix for every column.
    tr_num_vector = np.hstack(train_cols)
    te_num_vector = np.hstack(test_cols)
    return  tr_num_vector,  te_num_vector,feature_names 
# Convert the numerical features to standardized (StandardScaler) vectors.
tr_num_vect,te_num_vect,num_feature_names= convert_num_to_vector(num_columns=numerical_columns,
                                                                 x_tr=X_train,x_te=X_test)

100%|████████████████████████████████████████████████████████████████████████████████| 977/977 [04:59<00:00,  3.26it/s]


In [14]:
# The dataset is large, so drop the intermediate frames and run the garbage
# collector manually to release memory before building the final matrix.
gc.enable()
del X_train,X_test,df_cat,df_num
gc.collect()

0

In [15]:
# Append the categorical feature names after the numeric ones, in place, so
# num_feature_names ends up naming every column of the combined matrix.
# Re-running this cell appends the categorical names again.
num_feature_names.extend(cat_feature_names)

In [16]:
# Column-wise concatenation: the numeric block first, then the categorical
# block - the same order in which the feature names were merged. The dense
# numeric arrays are wrapped in coo_matrix before the sparse hstack.
X_train_vector = hstack((coo_matrix(tr_num_vect),tr_cat_vect))
X_test_vector = hstack((coo_matrix(te_num_vect),te_cat_vect))

In [17]:
# Sanity check: row counts must agree between features and labels in each
# split, and the number of feature names must equal the number of columns.
print("Final Data matrix ...")
print("="*25)
print(X_train_vector.shape, y_train.shape)
print(X_test_vector.shape, y_test.shape)
print(len(num_feature_names))

Final Data matrix ...
(206032, 1384) (206032,)
(101479, 1384) (101479,)
1384


In [18]:
# Put the train and test partitions back together (train rows first), so the
# saved matrix and label vector follow the original row order.
X = vstack([X_train_vector, X_test_vector])
y = np.concatenate((y_train, y_test))
print(X.shape, y.shape)

(307511, 1384) (307511,)


In [19]:
# The per-split blocks are now redundant with X; drop them to free memory.
gc.enable()
del tr_num_vect, tr_cat_vect, te_num_vect, te_cat_vect,X_train_vector,X_test_vector
gc.collect()

0

#### Save vectors in pickle file.

In [20]:
# Dense DataFrame view of the full feature matrix, labelled with feature names.
# NOTE(review): X.toarray() materializes every cell of the sparse matrix, and
# df_train is not read again in the cells shown here (the sparse X is what gets
# pickled) - confirm whether this cell is still needed.
df_train = pd.DataFrame(X.toarray(), columns =num_feature_names)

In [21]:
# NOTE(review): this import sits mid-notebook; the other imports are in the
# first cell.
import pickle
# Persist the sparse feature matrix X (not the dense DataFrame).
with open('./data/train_vector_2.pkl', 'wb') as f:
    pickle.dump(X, f)

In [22]:
# Persist the label vector, row-aligned with the saved feature matrix.
with open('./data/yvalues_2.pkl', 'wb') as yvalues:
    pickle.dump(y, yvalues)

#### Save Feature names

In [23]:
# Persist the column names. By this point num_feature_names holds the numeric
# names followed by the categorical names.
with open('./data/feature_names_2.pkl', 'wb') as feature:
    pickle.dump(num_feature_names, feature)

#### verify the shapes

In [24]:
# Round-trip check: reload the matrix that was just written. pickle.load can
# execute arbitrary code, so only use it on files this notebook produced.
with open('./data/train_vector_2.pkl', 'rb') as f:
    X = pickle.load(f)

In [25]:
# Shape of the reloaded matrix, for comparison with the shape printed before saving.
print(X.shape)

(307511, 1384)


In [26]:
# Reload the labels; their length should equal the number of rows in X.
with open('./data/yvalues_2.pkl', 'rb') as f:
    y = pickle.load(f)
print(y.shape)

(307511,)


In [27]:
# Reload the feature names; their count should equal the number of columns in X.
with open('./data/feature_names_2.pkl', 'rb') as f:
    f_names = pickle.load(f)
len(f_names)

1384