In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
# Read the training labels (CSV) and the training features (parquet) from the Kaggle inputs.
labels = pd.read_csv(
    '/kaggle/input/amex-default-prediction/train_labels.csv',
    low_memory=True,
)
df = pd.read_parquet(
    '/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet'
)

In [3]:
df.shape, labels.shape

In [4]:
labels['target'].value_counts()

In [5]:
def cleaning(data):
    """Reduce a multi-row-per-customer frame to one imputed row per customer.

    Steps, in order:
      1. Drop every column in which more than half of the rows are missing.
      2. Keep only the last row of each ``customer_ID`` (in the frame's current
         row order) and add ``numTx``, the number of rows that customer had.
      3. Drop the ``S_2`` and ``customer_ID`` columns.
      4. Fill remaining missing values in numeric columns with the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``customer_ID`` and ``S_2`` columns.
        NOTE(review): "last row" is only the most recent statement if the
        input is already sorted by date within each customer; confirm.

    Returns
    -------
    pandas.DataFrame
        One row per customer with a fresh 0..n-1 index. Rows keep the order in
        which each customer's last row appeared in ``data``.
    """
    # Data cleaning: remove >50% na (checked column by column to avoid building
    # a full boolean copy of a large frame).
    cols_with_50pc_missing = [col for col in data.columns if data[col].isna().mean() > 0.5]
    data = data.drop(columns=cols_with_50pc_missing)

    # Feature engineering: number of transactions.
    # The counts are looked up by customer_ID so each count lands on its own
    # customer. (value_counts() is ordered by frequency, not by row order, so
    # pasting its list onto the tail(1) rows mislabels customers.)
    per_customer = data.groupby('customer_ID')
    tx_counts = per_customer.size()
    data = per_customer.tail(1).reset_index(drop=True)
    numTx = tx_counts.reindex(data['customer_ID']).to_numpy()
    data.insert(2, "numTx", numTx)
    data = data.drop(columns=["S_2", "customer_ID"])

    # Data imputation: mean-fill numeric columns only; a mean is undefined for
    # text/categorical columns, so those are left untouched instead of raising.
    for col in data.select_dtypes(include="number").columns:
        data[col] = data[col].fillna(data[col].mean())
    return data


In [6]:
df = cleaning(df)

In [7]:
df.shape

In [8]:
#Joining the labels to main df

# cleaning() drops customer_ID, so the labels can only be attached by row position.
# A row-count mismatch would otherwise be NaN-padded silently by concat, so fail loudly.
# NOTE(review): equal length does not prove equal customer order; confirm both
# sources list customers in the same order.
assert len(df) == len(labels), (
    f"feature rows ({len(df)}) and label rows ({len(labels)}) differ; positional join is invalid"
)
train = pd.concat([df, labels['target']], axis=1)


In [9]:
train.head()

In [10]:
#deleting some data to keep memory

del df


In [11]:
# Separate the rows by target value: 1 goes to dflt, 0 goes to n_dflt.

n_dflt = train.loc[train["target"].eq(0)]
dflt = train.loc[train["target"].eq(1)]
print(dflt.shape)
print(n_dflt.shape)

# RESAMPLING

In [12]:
# Down-sample the more frequent class (n_dflt) to the size of the less frequent one (dflt).
# Sampling WITHOUT replacement keeps every retained row unique. With replace=True the
# result is a bootstrap that contains duplicated rows while discarding other rows.
# resample raises ValueError if n_samples exceeds len(n_dflt) when replace=False.

from sklearn.utils import resample
ndflt_downsample = resample(n_dflt,
             replace=False,
             n_samples=len(dflt),
             random_state=42)

print(ndflt_downsample.shape)

In [13]:
#Joining the datasets back together  #crashes

nt = pd.concat([ndflt_downsample, dflt])

In [14]:
nt.shape

In [15]:
#empty chcks
nt.isnull().sum().sum()

In [18]:
del train


In [16]:
nt.head()

## SYSTEMATIC SAMPLING

In [20]:
# # function that creates random sample using Systematic Sampling
# def systematic_sampling(df, step):

#     # these indices will increase with the step amount not 1
#     selected_index = np.arange(1,len(df),step)
#     # using iloc for getting thee data with selected indices
#     systematic_sampling = df.iloc[selected_index]
#     return(systematic_sampling)

In [21]:
# sys_df = systematic_sampling(nt, 30)

In [23]:
# sys_df.shape

Note that systematic sampling usually produces a random sample, but it does not address any bias in the resulting sample.

## CLUSTER SAMPLING

In [30]:
# def get_clustered_Sample(df, n_per_cluster, num_select_clusters):
#     N = len(df)          #total no. of observations
#     K = int(N/n_per_cluster)    #where K is the possible no.of clusters derivable from df based
#     data = None                    #on the value of n_per_cluster
    
#     #developing each cluster
#     for k in range(K):    #for k in the range of the derivable no.of clusters K
#         sample_k = df.sample(n_per_cluster, replace=True)  #creating a cluster from population(df)
#         sample_k["cluster"] = np.repeat(k,len(sample_k))
#         df = df.drop(index = sample_k.index)
#         data = pd.concat([data,sample_k],axis = 0)

#     random_chosen_clusters = np.random.randint(0,K,size = num_select_clusters)
#     samples = data[data.cluster.isin(random_chosen_clusters)]
#     return(samples)



In [32]:
# sample = get_clustered_Sample(df = nt, n_per_cluster = 10000, num_select_clusters = 20)
# sample

In [34]:
# sample.shape

## WEIGHTED SAMPLING

In some experiments, items need to be sampled with probabilities that follow the weights associated with each item; this is the case when the proportions of the different types of observations must be taken into account.

Weighted sampling is a data sampling method that uses weights to compensate for the selection of specific observations with unequal probabilities (oversampling), non-coverage, non-response, and other types of bias. If a biased data set is not adjusted and simple random sampling is used instead, the population descriptors (e.g., mean, median) will be skewed and will fail to represent the true population correctly.


#### Weighted Sampling addresses the bias in the sample, by creating a sample that takes into account the proportions of the type of observations in the population. Hence, Weighted Sampling usually produces a random and unbiased sample.


![](http://miro.medium.com/max/700/1*zNBVo75a4FGOLo-_ELyiyA.png)

In [35]:
# def get_weighted_sample(df,n):
#     def get_class_prob(x):
#         weight_x = int(np.rint(n * len(x[x.target != 0]) / len(df[df.target != 0])))
#         sampled_x = x.sample(weight_x).reset_index(drop=True)
#         return (sampled_x)
    
#         # we are grouping by the target class we use for the proportions

#     weighted_sample = df.groupby('target').apply(get_class_prob)
#     print(weighted_sample["target"].value_counts())
#     return (weighted_sample)



In [40]:
# sample = get_weighted_sample(nt,1000)
# sample

In [39]:
# sample['target'].value_counts()

In [19]:
# Separate the dependent variable (y) from the independent variables (X).
# Both 'target' and 'numTx' are left out of the feature matrix.

y = nt['target'].astype('int64')
X = nt.drop(columns=['target', 'numTx'])


In [24]:
y.head()

In [26]:
# One-column frame of targets; reset_index() moves the old row labels into an 'index' column.
cyo = nt[['target']].reset_index()

In [27]:
cyo.head()

In [28]:
del nt

# DIMENSIONALITY REDUCTION

## APPLYING PCA 

In [30]:
from sklearn.decomposition import PCA

# Initializing pca algo and Selecting number of components.
# random_state is fixed so that the projection is repeatable across kernel restarts
# even if scikit-learn picks a randomized SVD solver for this input size.
pca = PCA(n_components=2, random_state=42)


In [31]:
#transforming 191 features to another feature space with just 2 features

principalComponents = pca.fit_transform(X)

In [32]:
principalComponents

In [49]:
# Wrap the projected array in a frame with one named column per component.
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [50]:
X.shape

In [51]:
principalDf.head()

In [52]:
principalDf.describe()

In [53]:
X.describe()

In [54]:
#checking the variance of the pricipal components
pca.explained_variance_ratio_

In [57]:
#shuffling the dataframe
# Components and targets are joined side by side by row position, then the rows are
# shuffled. The shuffle is seeded so the frame (and any plot drawn from it) is
# identical on every run.

finalDf = pd.concat([principalDf, cyo['target']], axis = 1).sample(frac = 1, random_state = 42)

## 2d plot of both components

In [58]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(x="principal component 1", y="principal component 2",hue='target', data=finalDf);


In [None]:
# emp = []
# for i in finalDf.columns:
#     if finalDf[i] <1750 and >-500:
        

In [None]:
# pc1= -500 to 1750

# pc2 = 800



In [30]:
# Scatter plot of the two principal components, one colour per target class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

targets = [0, 1]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows of the current class.
    indicesToKeep = finalDf['target'] == target
    ax.scatter(
        finalDf.loc[indicesToKeep, 'principal component 1'],
        finalDf.loc[indicesToKeep, 'principal component 2'],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()

### Explained Variance
The explained variance tells you how much information (variance) can be attributed to each of the principal components. This is important because, while you can convert the 191-dimensional feature space to a 2-dimensional space, you lose some of the variance (information) when you do this. By using the attribute explained_variance_ratio_, you can see that the first principal component contains 13.78% of the variance and the second principal component contains 0.07% of the variance. Together, the two components contain 13.85% of the information (compare these figures with the output of the cell below, as they depend on the run).

In [None]:
#checking the variance of the pricipal components
pca.explained_variance_ratio_

### Alternative initializer for max variance factor

Notice that the code below passes .95 as the number-of-components parameter. This means that scikit-learn chooses the minimum number of principal components such that 95% of the variance is retained.

In [None]:
del principalComponents,finalDf

In [None]:
# Alternative initializer: a fractional value asks for a minimum share of variance
# to retain instead of a fixed number of components.
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(n_components=0.95)


In [None]:
# Fit the variance-target PCA on X and project X onto the components it keeps.
# The number of components is decided by the 0.95 variance threshold, not fixed at 2.

principalComponents = pca.fit_transform(X)

In [None]:
# #You can also check No. of components contrbuting to 95% of variance in the data
pca.n_components_

In [None]:
#checking the variance of the pricipal components
pca.explained_variance_ratio_

In [None]:
del principalComponents

## Applying Kernel PCA

In [None]:
# Applying Kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel = 'rbf')


In [None]:
# Fit the RBF KernelPCA on X, project X onto the 2 requested components, and wrap the
# result in a frame with one named column per component.
# NOTE(review): kernel methods generally compare every row with every other row, so
# memory and time may grow quadratically with len(X); confirm this is feasible at full size.

principalComponents = kpca.fit_transform(X)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

In [None]:
# Checking the variance of the kernel principal components.
# KernelPCA does not expose `explained_variance_ratio_` (that attribute belongs to PCA),
# so the original attribute access raised AttributeError. The fitted eigenvalues are used
# instead. Note the result is each component's share of the variance captured by the
# RETAINED components only, so it always sums to 1; it is not a share of total variance.
kpca_eigenvalues = getattr(kpca, "eigenvalues_", None)
if kpca_eigenvalues is None:
    # Older scikit-learn releases store the same values under `lambdas_`.
    kpca_eigenvalues = kpca.lambdas_
kpca_eigenvalues / kpca_eigenvalues.sum()

In [None]:
# Join components and targets by row position, then shuffle with a fixed seed so the
# resulting frame is the same on every run.
finalDf = pd.concat([principalDf, cyo['target']], axis = 1).sample(frac = 1, random_state = 42)

In [None]:
#plotting the PCA plot

sns.relplot(x="principal component 1", y="principal component 2",hue='target', data=finalDf);


In [None]:
del principalComponents,finalDf

## APPLYING LDA

In [25]:
# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components = 1)


In [26]:
# Fit LDA on the features and labels, then project X onto the linear discriminant(s)
# configured on `lda` (n_components = 1 where it is created in this notebook).

ldc = lda.fit_transform(X, y)


In [27]:
# Wrap the LDA projection in a single-column frame.
ldc = pd.DataFrame(ldc, columns=['linear component 1'])

In [28]:
# Join the LDA component and targets by row position, then shuffle with a fixed seed so
# the resulting frame is the same on every run.
finalDf = pd.concat([ldc, cyo['target']], axis = 1).sample(frac = 1, random_state = 42)

In [30]:
import seaborn as sns

In [31]:
sns.catplot(x='target', y="linear component 1", data=finalDf)

In [None]:
# NOTE(review): this cell held a bare `cx`, an undefined name that raises NameError.
# It was presumably a manual "stop here" marker for Run All. It is commented out so the
# notebook can execute top to bottom.
# cx

In [None]:
# #Defining X and y i.e seperating dependent from independent variables

# X = nt.drop(['target','numTx'], axis=1) 
# y = nt['target'].astype('int64')


In [None]:
# Free memory. `nt` is already deleted earlier in the notebook, so a plain
# `del nt, labels` raises NameError on a top-to-bottom run. Remove whichever of the
# two names still exists instead.
for _name in ("nt", "labels"):
    globals().pop(_name, None)
del _name

In [None]:
#cx

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  
from sklearn.metrics import precision_score                         
from sklearn.metrics import recall_score

In [34]:
# Hold out 25% of the LDA-projected rows for evaluation, with a fixed seed.
# The LDA projection was fitted on all rows before this split, so the held-out rows
# influenced the projection.
x_train, x_test, y_train, y_test = train_test_split(
    ldc,
    cyo['target'],
    random_state=26,
    test_size=0.25,
)

In [35]:
#deleting some data to keep memory

del X


In [None]:
# # Feature Scaling
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# x_train = sc.fit_transform(x_train)
# x_test = sc.transform(x_test)

In [36]:
# # iter = 100, 18 seconds, 3000+GPU = 2 mins
import catboost as cat
from catboost import CatBoostClassifier, CatBoostRegressor

# Build the classifier, then fit it on the training split while monitoring the
# held-out split passed as eval_set.
clf = CatBoostClassifier(iterations=1000, task_type="GPU", bagging_temperature=0.2)
clf.fit(x_train, y_train, eval_set=(x_test, y_test), verbose=True)

In [37]:
#deleting some data to keep memory

del x_train, x_test, y_train, y_test


In [None]:
# Read the sample submission (customer_ID column only) and the test features.

test_samp = pd.read_csv(
    '/kaggle/input/amex-default-prediction/sample_submission.csv',
    usecols=['customer_ID'],
    low_memory=True,
)
test_set = pd.read_parquet(
    '/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet'
)

In [None]:
#Working on the test set

test_set = cleaning(test_set)

In [None]:
#test_set = test_set.drop(['numTx'], axis=1) 

In [None]:
# The classifier was trained on the LDA projection (`ldc`), so the test features must go
# through the same fitted `lda`. Projecting with `pca` would feed the model a different
# feature space. Selecting `lda.feature_names_in_` restores the exact training columns
# and order: it drops `numTx`, which was excluded from X, and raises KeyError if cleaning
# the test set removed a column the model was trained on.
test_set = lda.transform(test_set[lda.feature_names_in_])

In [None]:
# #scaling the test set

# test_set = sc.transform(test_set)

In [None]:
#Predicting test set

prediction = clf.predict_proba(test_set)
final_predictions = prediction[:,1]


In [None]:
# Pair each customer_ID from the sample submission with its predicted probability
# (by row position) and write the result as the submission CSV.

output = pd.DataFrame(
    {
        'customer_ID': test_samp['customer_ID'],
        'prediction': final_predictions,
    }
)
output.to_csv('submission.csv', index=False)
#output.to_csv('AMX_file.gz', compression='gzip')
print("Your submission was successfully saved!")