In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Beautiful format for float type
pd.set_option('display.float_format', lambda x: '%.3f' % x)

credit_card_txn = pd.read_csv('tj_05_credit_card_transaction.csv')
data_training = pd.read_csv('tj_05_training.csv', header=None, names=["card_no", "gender"])
data_test = pd.read_csv('tj_05_test.csv', header=None, names=["card_no"])

In [38]:
print('credit_card_txn shape =', credit_card_txn.shape, '\n')
credit_card_txn.info()
credit_card_txn.describe()

credit_card_txn shape = (893761, 6) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893761 entries, 0 to 893760
Data columns (total 6 columns):
card_no         893761 non-null int64
txn_date        884013 non-null object
txn_hour        884013 non-null float64
txn_amount      893761 non-null float64
mer_cat_code    884013 non-null float64
mer_id          893761 non-null int64
dtypes: float64(3), int64(2), object(1)
memory usage: 40.9+ MB


Unnamed: 0,card_no,txn_hour,txn_amount,mer_cat_code,mer_id
count,893761.0,884013.0,893761.0,884013.0,893761.0
mean,1234000000030626.2,14.466,1694.535,5803.844,7269.036
std,18378.6,4.67,25873.996,877.35,10765.232
min,1234000000000001.0,0.0,50.0,742.0,0.0
25%,1234000000007268.0,11.0,250.0,5411.0,0.0
50%,1234000000014186.0,15.0,450.0,5631.0,0.0
75%,1234000000021106.0,18.0,1000.0,6011.0,14070.0
max,1234000000028172.0,23.0,19201000.0,9405.0,28728.0


In [39]:
credit_card_txn = credit_card_txn.dropna(axis=0, how='any')
print("Dropped na shape =", credit_card_txn.shape, '\n')

credit_card_txn.sample(5)

Dropped na shape = (884013, 6) 



Unnamed: 0,card_no,txn_date,txn_hour,txn_amount,mer_cat_code,mer_id
695234,1234000000021149,2016-02-11 00:00:00,6.0,200.0,4812.0,16783
205598,1234000000007688,2016-04-12 00:00:00,16.0,400.0,5541.0,0
94864,1234000000002269,2016-05-06 00:00:00,23.0,50.0,5735.0,0
270078,1234000000001319,2016-07-09 00:00:00,15.0,50.0,5735.0,0
250851,1234000000025961,2016-04-28 00:00:00,18.0,200.0,5541.0,0


In [40]:
mcc_codes = pd.read_csv('mcc_codes.csv')
credit_card_txn = credit_card_txn.join(mcc_codes[['mcc', 'irs_description', 'type_id']].set_index('mcc'), on='mer_cat_code')
credit_card_txn = credit_card_txn.dropna(axis=0, how='any')
credit_card_txn.type_id = credit_card_txn.type_id.astype(int)
credit_card_txn.head()

Unnamed: 0,card_no,txn_date,txn_hour,txn_amount,mer_cat_code,mer_id,irs_description,type_id
0,1234000000009154,2016-12-07 00:00:00,22.0,1550.0,4511.0,0,"Airlines, Air Carriers",30
1,1234000000017165,2016-12-07 00:00:00,22.0,250.0,6011.0,0,Automated Cash Disburse,170
2,1234000000000768,2016-12-07 00:00:00,22.0,250.0,8398.0,0,Charitable and Social Service Organizations - ...,268
3,1234000000018716,2016-12-07 00:00:00,22.0,50.0,5735.0,0,Record Stores,120
4,1234000000016652,2016-12-07 00:00:00,22.0,50.0,5735.0,0,Record Stores,120


In [41]:
# credit_card_txn.sample(500)

In [42]:
data_features = credit_card_txn
data_features = data_features.drop(['txn_date', 'txn_hour', 'mer_cat_code', 'mer_id', 'irs_description'], axis=1)
data_features.head()

Unnamed: 0,card_no,txn_amount,type_id
0,1234000000009154,1550.0,30
1,1234000000017165,250.0,170
2,1234000000000768,250.0,268
3,1234000000018716,50.0,120
4,1234000000016652,50.0,120


In [43]:
data_training.shape

data_training_cleaned = pd.read_csv('tj_05_training_cleaned.csv')
data_training_cleaned.head()

Unnamed: 0,card_no,gender,0,1
0,1234000000000000,1,3.0,4.0
1,1234000000000010,1,5.0,3.0
2,1234000000000020,1,3.0,2.0
3,1234000000000030,1,5.0,3.0
4,1234000000000040,1,4.0,4.0


In [44]:
data_for_model = data_training.join(data_features.set_index('card_no'), on='card_no')
data_for_model = data_for_model.dropna(axis=0, how='any')
data_for_model.head()

Unnamed: 0,card_no,gender,txn_amount,type_id
0,1234000000000792,1,600.0,91.0
0,1234000000000792,1,7750.0,257.0
0,1234000000000792,1,2000.0,36.0
0,1234000000000792,1,500.0,91.0
0,1234000000000792,1,500.0,91.0


In [61]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.cross_validation import  train_test_split
from matplotlib import pyplot
from sklearn import metrics
from sklearn.cross_validation import KFold, cross_val_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree
from time import time
from sklearn.preprocessing import StandardScaler

dataset = data_for_model
dataset = dataset.drop("card_no",1)
# dataset.head()
features = list(dataset.columns[1:])
print(features)

['txn_amount', 'type_id']


In [59]:
XGBmodel = XGBClassifier()
X_train, X_test, y_train, y_test = train_test_split(dataset[features], dataset['gender'].values, test_size=0.30, random_state=42)
XGBmodel.fit(X_train,y_train)
predictions = XGBmodel.predict(X_test)     

print ("accuracy" , metrics.accuracy_score(y_test, predictions))

accuracy 0.567601144407


In [76]:
model = Sequential()
# model.add(Dense(input_shape=2, activation='relu'))
model.add(Dense(12, input_dim=2, kernel_initializer ='uniform', activation='relu'))
model.add(Activation("softmax"))

model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [77]:
scaler = StandardScaler()
model.fit(scaler.fit_transform(X_train.values), y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1df979b83c8>

In [78]:
y_prediction = model.predict_classes(scaler.transform(X_test.values))
print ("\n\naccuracy" , np.sum(y_prediction == y_test) / float(len(y_test)))


accuracy 0.527243131974


In [107]:
model = Sequential()
model.add(Dense(3, input_dim=2))
# model.add(Dense(3, input_dim=2))
# model.add(Dense(3, input_dim=2))
model.add(Activation("softmax"))

model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [None]:
model.fit(scaler.fit_transform(X_train.values), y_train)

Epoch 1/10
Epoch 2/10

In [87]:
data_test.join()

TypeError: join() missing 1 required positional argument: 'other'

In [82]:

data_test_prediction = model.predict_classes(scaler.transform(data_test.values))



ValueError: non-broadcastable output operand with shape (4623,1) doesn't match the broadcast shape (4623,2)