In [1]:
#!pip install kaggle

In [2]:
#!kaggle competitions download -c ieee-fraud-detection

In [1]:
from typing import Tuple, Dict
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             mean_squared_error, mean_absolute_error, r2_score)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import resample

In [2]:
train_transactions = pd.read_csv('../ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv('../ieee-fraud-detection/train_identity.csv')
train=train_transactions.merge(train_identity,how='left',left_index=True,right_index=True)
y_train=train['isFraud'].astype('uint8')

# x_train, x_test, y_train, y_test = train_test_split(train.drop('isFraud',axis=1), y_train, test_size=0.25, random_state=1)

In [3]:
null_percent = train.isnull().sum()/train.shape[0]*100

cols_to_drop = np.array(null_percent[null_percent > 50].index)

train = train.drop(cols_to_drop, axis=1)
# train = train.drop(cols_to_drop, axis=1)

In [4]:
print(train[:10])

   TransactionID_x  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0          2987000        0          86400            68.5         W  13926   
1          2987001        0          86401            29.0         W   2755   
2          2987002        0          86469            59.0         W   4663   
3          2987003        0          86499            50.0         W  18132   
4          2987004        0          86506            50.0         H   4497   
5          2987005        0          86510            49.0         W   5937   
6          2987006        0          86522           159.0         W  12308   
7          2987007        0          86529           422.5         W  12695   
8          2987008        0          86535            15.0         H   2803   
9          2987009        0          86536           117.0         W  17399   

   card2  card3       card4  card5  ...   V312  V313   V314 V315  V316  \
0    NaN  150.0    discover  142.0  ...    0.0   0.0    

In [4]:
print(len(train.columns))

220


In [7]:
def missing_data(df) :
    count = df.isnull().sum()
    percent = (df.isnull().sum()) / (df.isnull().count()) * 100
    total = pd.concat([count, percent], axis=1, keys = ['Count', 'Percent'])
    types = []
    for col in df.columns :
        dtypes = str(df[col].dtype)
        types.append(dtypes)
    total['dtypes'] = types

    return np.transpose(total)

missing_data(train)

Unnamed: 0,TransactionID_x,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
Count,0,0,0,0,0,0,8933,1565,1577,4259,...,12,1269,1269,1269,12,12,12,12,12,12
Percent,0.0,0.0,0.0,0.0,0.0,0.0,1.512683,0.265012,0.267044,0.721204,...,0.002032,0.214888,0.214888,0.214888,0.002032,0.002032,0.002032,0.002032,0.002032,0.002032
dtypes,int64,int64,int64,float64,object,int64,float64,float64,object,float64,...,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64


In [8]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [9]:
not_fraud = train[train.isFraud == 0]
fraud = train[train.isFraud == 1]
not_fraud_downsampled = resample(not_fraud,
                                replace = False,
                                n_samples = len(fraud),
                                random_state = 52)

# combine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, fraud])

# checking counts
downsampled.isFraud.value_counts()

downsampled_y = downsampled['isFraud'].astype('uint8')

reduce_mem_usage(downsampled)

del train
del train_identity
del train_transactions

Mem. usage decreased to 20.85 Mb (70.1% reduction)


In [10]:
x_train, x_test, y_train, y_test = train_test_split(downsampled.drop('isFraud',axis=1), downsampled['isFraud'], test_size=0.25, random_state=1)

train_data = x_train
test_data = x_test

In [11]:
# Label Encoding for categorical variables.
# for f in train_data.columns:
#     if train_data[f].dtype=='object':
#         lbl = preprocessing.LabelEncoder()
#         lbl.fit(list(train_data[f].values))
#         train_data[f] = lbl.transform(list(train_data[f].values))
#         test_data[f] = lbl.transform(list(train_data[f].values))

# One-Hot Encoding for categorical variables

lbl = preprocessing.LabelEncoder()



for f in train_data.columns:
    if train_data[f].dtype == 'object':
        lbl.fit(train_data[f])
        test_data[f] = test_data[f].map(lambda s: 'unknown' if s not in lbl.classes_ else s)
        lbl.classes_ = np.append(lbl.classes_, 'unknown')
        train_data[f] = lbl.transform(train_data[f])
        test_data[f] = lbl.transform(test_data[f])

# oneh = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# for f in train_data.columns:
#     if train_data[f].dtype == 'object':
#         col_to_fit = train_data[f].values.reshape(-1, 1)
#         oneh.fit(col_to_fit)
#         encoded_train = oneh.transform(col_to_fit)
#         encoded_feature_names = [f"{f}_{category}" for category in oneh.categories_[0]]
#         encoded_df = pd.DataFrame(encoded_train, columns=encoded_feature_names, index=train_data.index)
        
#         train_data = train_data.drop(f, axis=1)
#         train_data = pd.concat([train_data, encoded_df], axis=1)
#         test_col = test_data[f].values.reshape(-1, 1)
#         encoded_test = oneh.transform(test_col)
#         encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_feature_names, index=test_data.index)
#         test_data = test_data.drop(f, axis=1)
#         test_data = pd.concat([test_data, encoded_test_df], axis=1)

In [12]:
# downsampled.drop('isFraud', axis=1, inplace=True)

train_data.fillna(-1000, inplace=True)
test_data.fillna(-1000, inplace=True)

y_train.fillna(-1000, inplace=True)
y_test.fillna(-1000, inplace=True)

In [13]:
scaler = StandardScaler()
train_data = pd.DataFrame(scaler.fit_transform(train_data))
test_data = pd.DataFrame(scaler.transform(test_data))

In [14]:
test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,209,210,211,212,213,214,215,216,217,218
0,1.075765,1.098385,0.326858,0.775921,-0.967245,-0.305302,-0.086261,-1.02547,0.323138,0.745806,...,4.460979,1.388431,4.077940,2.208120,0.875292,0.990988,0.904804,-0.11497,-0.140205,-0.134848
1,-1.082155,-1.136010,-0.328312,-1.515432,1.145441,0.138984,0.515154,-1.02547,0.323138,0.745806,...,-0.296239,-0.226305,-0.256370,-0.235179,-0.060217,-0.087935,-0.086724,-0.11497,-0.140205,-0.134848
2,-1.448524,-1.361042,-0.395352,0.775921,-1.561541,-0.967540,-0.086261,0.65406,0.339717,0.745806,...,-0.296239,-0.226305,-0.256370,-0.235179,-0.060217,-0.087935,-0.086724,-0.11497,-0.140205,-0.134848
3,-0.937529,-1.001572,-0.284242,-0.942594,0.616030,0.620993,-0.086261,0.65406,0.339717,0.745806,...,-0.296239,-0.226305,-0.256370,-0.235179,-0.060217,-0.087935,-0.086724,-0.11497,-0.140205,-0.134848
4,0.014315,-0.002249,0.121733,0.775921,0.323428,0.050965,-0.206544,0.65406,0.339717,0.745806,...,-0.296239,-0.226305,-0.256370,-0.235179,0.070209,-0.007165,0.006634,-0.11497,-0.140205,-0.134848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10327,1.351393,1.393832,-0.357158,0.775921,0.945621,0.713203,-0.086261,0.65406,0.339717,0.745806,...,-0.296239,-0.226305,-0.256370,-0.235179,-0.060217,-0.087935,-0.086724,-0.11497,-0.140205,-0.134848
10328,0.676492,0.645119,-0.242309,-1.515432,-0.016701,-0.887903,0.515154,0.65406,-0.389732,0.745806,...,-0.296239,-0.226305,-0.256370,-0.235179,-0.060217,-0.087935,-0.086724,-0.11497,-0.140205,-0.134848
10329,1.462784,1.517311,-0.361164,0.775921,0.070914,-0.489722,-0.086261,-1.02547,0.323138,0.745806,...,0.027494,-0.226305,-0.256370,-0.235179,-0.060217,-0.087935,-0.086724,-0.11497,-0.140205,-0.134848
10330,-0.520173,-0.559343,-0.490769,-1.515432,0.363929,-0.929817,0.824453,-1.02547,-9.822823,-1.314033,...,0.080649,0.304375,0.053425,0.185433,-0.060217,-0.087935,-0.086724,-0.11497,-0.140205,-0.134848


In [None]:
train_features_df = pd.DataFrame(train_data)
train_features_df.to_csv("train_features.csv", index=False)

test_features_df = pd.DataFrame(test_data)
test_features_df.to_csv("test_features.csv", index=False)

train_labels_df = pd.DataFrame(y_train)
train_labels_df.to_csv("train_labels.csv", index=False)

test_labels_df = pd.DataFrame(y_test)
test_labels_df.to_csv("test_labels.csv", index=False)

In [15]:
# logreg = LogisticRegression(C=0.01, solver="lbfgs", penalty="l2")
# logreg.fit(downsampled, downsampled_y)

# knn = KNeighborsClassifier(n_neighbors=10, weights='uniform', metric='euclidean', n_jobs=-1)
# knn.fit(downsampled, downsampled_y)


In [16]:

res = {}

param_grid = {'C': [0.01, 0.1, 1, 10],
                'penalty': ['l2'],
                'solver': [
                    # 'lbfgs',
                'newton-cg',
                'liblinear']
                }

grid_search_log = GridSearchCV(estimator=LogisticRegression(),
                            cv=5, scoring='f1',
                            param_grid=param_grid)

grid_search_log.fit(train_data, y_train)
y_pred = grid_search_log.predict(test_data)
res['Logistic Regression'] = {'accuracy': accuracy_score(y_pred=y_pred, y_true=y_test),
                            'precision': precision_score(y_pred=y_pred, y_true=y_test, average='weighted'),
                            'recall': recall_score(y_pred=y_pred, y_true=y_test, average='weighted'),
                            'f1_score': f1_score(y_pred=y_pred, y_true=y_test, average='weighted')}
res['Logistic Regression']





{'accuracy': 0.7491289198606271,
 'precision': 0.7496533834654437,
 'recall': 0.7491289198606271,
 'f1_score': 0.748908227221292}

In [17]:
print(grid_search_log.n_features_in_)

219


In [18]:
del downsampled
del downsampled_y

In [19]:
# def logistic(x_train, y_train, x_test, y_test):
#     #X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=1)
#     lr=LogisticRegression()
#     lr.fit(x_train,y_train)
#     prob=lr.predict_proba(x_test)
#     return (prob[:,1],y_test)

# probs,y_test=logistic(x_train, y_train, x_test, y_test)
# print(probs)

test_identity = pd.read_csv('ieee-fraud-detection/test_identity.csv')
test_transaction = pd.read_csv('ieee-fraud-detection/test_transaction.csv')

test = pd.merge(test_transaction,test_identity,how="left",on="TransactionID")
reduce_mem_usage(test)
del test_identity
del test_transaction

print(type(test))

Mem. usage decreased to 561.50 Mb (66.5% reduction)
<class 'pandas.core.frame.DataFrame'>


In [20]:
cols_to_drop

del train_data
del test_data

In [21]:
to_rename = {}
for i in test.columns:
    if i.find('id') != -1:
        to_rename[i] = i.replace('-','_')

print(type(test))
test.rename(columns=to_rename, inplace=True)
cols_to_drop = np.delete(cols_to_drop, np.where(cols_to_drop == 'TransactionID_y'))
print(type(test))
# test.fillna(-1000, inplace=True)
lbl = preprocessing.LabelEncoder()
# print(test)

for f in test.columns:
    if test[f].dtype == 'object':
        col_to_fit = test[f]
        lbl.fit(col_to_fit)
        col_to_fit = col_to_fit.map(lambda s: 'unknown' if s not in lbl.classes_ else s)
        lbl.classes_ = np.append(lbl.classes_, 'unknown')
        test[f] = lbl.transform(col_to_fit)
        
test = test.drop(cols_to_drop,axis=1)
test.fillna(-1000, inplace=True)
test = pd.DataFrame(scaler.fit_transform(test))
print(len(test.columns.values))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
219


In [22]:
print(len(test.columns.values))


219


In [23]:
submission = pd.read_csv('ieee-fraud-detection/sample_submission.csv',index_col='TransactionID')
submission['isFraud'] = grid_search_log.predict_proba(test)[:,1]
submission.to_csv('Logreg_submission.csv')
submission.head()

Unnamed: 0_level_0,isFraud
TransactionID,Unnamed: 1_level_1
3663549,0.003974
3663550,0.332796
3663551,0.000634
3663552,0.003921
3663553,0.999873
