In [57]:
# Importing main necessary libraries for the project
import numpy as np
import pandas as pd
from IPython.display import display # to use display() for DataFrames

# Load the anonymised training set. ID_code is treated as redundant with the
# DataFrame index, so it is discarded in the same expression (no in-place mutation).
data = pd.read_csv("train.csv").drop("ID_code", axis=1)

# Preview the first 10 observations
display(data.head(10))

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
5,0,11.4763,-2.3182,12.608,8.6264,10.9621,3.5609,4.5322,15.2255,3.5855,...,-6.3068,6.6025,5.2912,0.4403,14.9452,1.0314,-3.6241,9.767,12.5809,-4.7602
6,0,11.8091,-0.0832,9.3494,4.2916,11.1355,-8.0198,6.1961,12.0771,-4.3781,...,8.783,6.4521,3.5325,0.1777,18.3314,0.5845,9.1104,9.1143,10.8869,-3.2097
7,0,13.558,-7.9881,13.8776,7.5985,8.6543,0.831,5.689,22.3262,5.0647,...,13.17,6.5491,3.9906,5.8061,23.1407,-0.3776,4.2178,9.4237,8.6624,3.4806
8,0,16.1071,2.4426,13.9307,5.6327,8.8014,6.163,4.4514,10.1854,-3.1882,...,1.4298,14.751,1.6395,1.4181,14.837,-1.994,-1.0733,8.1975,19.5114,4.8453
9,0,12.5088,1.9743,8.896,5.4508,13.6043,-16.2859,6.0637,16.841,0.1287,...,0.5543,6.316,1.0371,3.6885,14.8344,0.4467,14.1287,7.9133,16.2375,14.2514


In [59]:
# Class balance of the target column.
target_col = data["target"]

# Number of customers
n_cust = data.shape[0]

# Count on the boolean mask instead of filtering the whole frame: this avoids
# building two throw-away copies of every feature column just to read a row count.
# int() keeps plain Python integers so the percentage arithmetic below is unchanged.

# Number of customers that will not make future transactions
n_no_fut_trans = int((target_col == 0).sum())

# Number of customers that will make future transactions
n_fut_trans = int((target_col == 1).sum())

print("Total number of customers: {}".format(n_cust))
print("Customers that will not make future transactions: {}".format(n_no_fut_trans))
print("Customers that will make future transactions: {}".format(n_fut_trans))
print("Percentage of customers that will make future transactions: {}%".format(n_fut_trans/n_cust*100))

Total number of customers: 200000
Customers that will not make future transactions: 179902
Customers that will make future transactions: 20098
Percentage of customers tha will make future transactions: 10.049%


In [61]:
# Outlier detection (Tukey's method)
#
# A value is flagged when it lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of its feature.
# Produces two lists of row labels:
#   outliers_t - rows flagged in at least one feature
#   repeated   - rows flagged in more than one feature

# Features
X = data.drop(['target'], axis = 1)

# Number of features in which each row is flagged. Accumulating counts in an array
# replaces the repeated `in list` membership scans, which were quadratic in the
# number of flagged rows.
outlier_hits = np.zeros(X.shape[0], dtype=int)

# For each feature
for feature in X.keys():
    values = X[feature].values

    # Calculate first quartile
    Q1 = np.percentile(values, 25)

    # Calculate third quartile
    Q3 = np.percentile(values, 75)

    # Calculate interquartile range * 1.5
    step = (Q3-Q1)*1.5

    # Same predicate as before: anything not inside the closed fence interval is flagged
    outlier_hits += ~((values >= Q1 - step) & (values <= Q3 + step))

# Both lists contain the same rows as the list-based version, now in row order
outliers_t = list(X.index[outlier_hits >= 1])
repeated = list(X.index[outlier_hits >= 2])


print("Number of points considered outliers in more than one feature: {}".format(len(repeated)))
print("Number of points considered outliers in some feature: {}".format(len(outliers_t)))

Number of points considered outliers in more than one feature: 1556
Number of points considered outliers in some feature: 24896


In [71]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split lives in model_selection
from sklearn.model_selection import train_test_split

# Dropping rows with outlier values in more than one feature.
# The target column is removed from the feature matrix first: keeping it would leak
# the label into every model (and any transformation) fitted on these features.
filtered_features = data.drop("target", axis=1).drop(repeated)
filtered_target = data[['target']].drop(repeated)

# Dividing train and test sets (70% / 30%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(filtered_features, filtered_target, test_size=0.3, random_state=42)

# Checking train and test sets size
print("X_train size: {} \t y_train size: {}".format(X_train.shape[0], y_train.shape[0]))
print("X_test size: {} \t y_test size: {}".format(X_test.shape[0], y_test.shape[0]))

X_train size: 138910 	 y_train size: 138910
X_test size: 59534 	 y_test size: 59534


In [54]:
# Outlier detection (Local Outlier Factor)
# Not viable here: its O(n²) complexity makes it take too long on a dataset of this size

# from sklearn.neighbors import LocalOutlierFactor

# lof = LocalOutlierFactor()

# outliers_l = lof.fit_predict(X)

# print("Number of points considered outliers: {}".format(len(outliers_l[outliers_l == -1])))

In [50]:
from sklearn.decomposition import PCA

# Fitting PCA on the training split only, keeping enough components to explain 90%
# of the variance. Fitting on train and test together would let test-set information
# leak into the projection used to evaluate the model.
pca = PCA(n_components=0.9, svd_solver='full').fit(X_train)

# Checking number of components
print("Number of components after pca: {}".format(pca.n_components_))

# Transforming both splits with the components learned from the training data
transf_X_train = pca.transform(X_train)
transf_X_test = pca.transform(X_test)


Number of components after pca: 90
