# Data mining seminar project

In [112]:
from datetime import date

import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.metrics import balanced_accuracy_score, accuracy_score, get_scorer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from scipy.stats import uniform, loguniform, randint

## Data preparation

- load datasets
- handle MaritalStatus
	- only M (Married) and S (Single) occur; NaN is the most common value and is treated as a value of its own (not as missing data), standing for people who are in a relationship but not married, divorced, or widowed
- correct number of children (cannot be negative)
- add has children column
- add income per dependant column (1 + num of children)
- encode responded to boolean
- handle date format for transaction and get age
- aggregate transaction data
	- minimum age - number of days since last transaction
	- number of transactions (count): total
	- monetary value (amount) total, average
- combine data
- encode data to numeric for further tasks

In [119]:
# Load the three source tables; each one is indexed by CardID.
cust = pd.read_csv('data/customers.csv').set_index('CardID')
tran = pd.read_csv('data/transactions.csv').set_index('CardID')
camp = pd.read_csv('data/campaign.csv').set_index('CardID')

# Spell out the marital-status codes. A missing status is kept as its own
# 'Other' category rather than being handled as missing data.
cust['MaritalStatus'] = (
	cust['MaritalStatus']
	.replace({'M': 'Married', 'S': 'Single'})
	.fillna('Other')
)

# A negative number of children is not possible: floor the column at zero.
cust['NumChildren'] = cust['NumChildren'].clip(lower=0)
cust['HasChildren'] = cust['NumChildren'] > 0
# Household income divided by (1 + number of children).
# NOTE(review): 'HHIncode' looks like a typo for 'HHIncome'; the column name is
# left unchanged so that everything keyed on it keeps working.
cust['HHIncodePerDependant'] = cust['HHIncomeMed'] / (1 + cust['NumChildren'])

# Campaign response is stored as 'T' for a positive answer; turn it into a boolean.
camp['Responded'] = camp['Responded'] == 'T'

# Dates are stored as YYYYMMDD. Parse the whole column in one vectorized call
# instead of slicing every value in Python; 'Date' becomes a datetime64 column.
tran['Date'] = pd.to_datetime(tran['Date'].astype(str), format='%Y%m%d')

# Age of a transaction = whole days between it and the newest transaction
# in the data set.
tran['Age'] = (tran['Date'].max() - tran['Date']).dt.days

# Per-customer transaction features, computed with named aggregation rather
# than a Python-level apply per group.
# Windowed variants (count/amount over the last 180, 90 and 30 days, plus
# derived used_<days> flags) were tried earlier and are currently disabled.
tran_agg = (
	tran.groupby('CardID')
	.agg(
		age_min=('Age', 'min'),            # days since the customer's latest transaction
		count_total=('Age', 'size'),       # number of transactions
		amount_total=('Amount', 'sum'),    # total amount
		amount_avg=('Amount', 'mean'),     # average amount per transaction
	)
	# The previous apply-based version produced float columns throughout;
	# keep that dtype so downstream results are unchanged.
	.astype('float64')
)

# Customer attributes + prefixed transaction features + campaign response.
data = cust.join(tran_agg.add_prefix('Tran_')).join(camp)

data

Unnamed: 0_level_0,MaritalStatus,NumChildren,LoS,HHIncomeMed,HasChildren,HHIncodePerDependant,Tran_age_min,Tran_count_total,Tran_amount_total,Tran_amount_avg,Responded
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C0100000199,Single,4,1.156164,71079.744865,True,14215.948973,1.0,3.0,597.00,199.000000,False
C0100000343,Other,1,3.002740,79424.115726,True,39712.057863,114.0,6.0,700.94,116.823333,False
C0100000375,Single,0,0.068493,41878.414258,False,41878.414258,59.0,4.0,223.98,55.995000,False
C0100000482,Single,1,1.356164,62924.588763,True,31462.294381,20.0,4.0,197.98,49.495000,False
C0100000689,Married,3,2.484932,46616.718039,True,11654.179510,4.0,2.0,428.00,214.000000,False
...,...,...,...,...,...,...,...,...,...,...,...
C0106595162,Married,1,1.931507,49841.914121,True,24920.957061,92.0,2.0,388.99,194.495000,False
C0106596136,Married,3,1.400000,88015.076144,True,22003.769036,5.0,2.0,108.99,54.495000,False
C0106596422,Other,0,1.558904,46617.938780,False,46617.938780,134.0,2.0,948.00,474.000000,False
C0106596502,Married,2,-0.739726,52631.916257,True,17543.972086,16.0,1.0,279.00,279.000000,False


In [120]:
# One-hot encode MaritalStatus and discard the 'Other' indicator column.
encoded = pd.get_dummies(data, columns=['MaritalStatus']).drop(
	columns='MaritalStatus_Other'
)

# Put the two remaining indicator columns first; every other column keeps
# its relative order.
leading = ['MaritalStatus_Single', 'MaritalStatus_Married']
trailing = [col for col in encoded.columns if col not in leading]
data_enc = encoded[leading + trailing]

data_enc

Unnamed: 0_level_0,MaritalStatus_Single,MaritalStatus_Married,NumChildren,LoS,HHIncomeMed,HasChildren,HHIncodePerDependant,Tran_age_min,Tran_count_total,Tran_amount_total,Tran_amount_avg,Responded
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C0100000199,True,False,4,1.156164,71079.744865,True,14215.948973,1.0,3.0,597.00,199.000000,False
C0100000343,False,False,1,3.002740,79424.115726,True,39712.057863,114.0,6.0,700.94,116.823333,False
C0100000375,True,False,0,0.068493,41878.414258,False,41878.414258,59.0,4.0,223.98,55.995000,False
C0100000482,True,False,1,1.356164,62924.588763,True,31462.294381,20.0,4.0,197.98,49.495000,False
C0100000689,False,True,3,2.484932,46616.718039,True,11654.179510,4.0,2.0,428.00,214.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...
C0106595162,False,True,1,1.931507,49841.914121,True,24920.957061,92.0,2.0,388.99,194.495000,False
C0106596136,False,True,3,1.400000,88015.076144,True,22003.769036,5.0,2.0,108.99,54.495000,False
C0106596422,False,False,0,1.558904,46617.938780,False,46617.938780,134.0,2.0,948.00,474.000000,False
C0106596502,False,True,2,-0.739726,52631.916257,True,17543.972086,16.0,1.0,279.00,279.000000,False


## Classification

Object for data representation

- split to test and train
- preprocess data using train split
- get metrics

In [None]:
class Clf_dataset:
	"""Classification dataset with a stratified train/test split and fitted preprocessors.

	The features and the target are stored per split under the keys
	'all', 'train' and 'test'. The preprocessors in ``self.prep`` are fitted
	on the training features only:

	- 'stand': StandardScaler
	- 'quant': QuantileTransformer
	"""

	def __init__(self, data, target, random_state=None, test_size=0.2):
		"""Split ``data`` and fit the preprocessors on the training part.

		Args:
			data: DataFrame holding the feature columns and the target column.
			target: Name of the target column in ``data``.
			random_state: Seed passed to the train/test split and to the
				quantile transformer.
			test_size: Passed to ``train_test_split`` as ``test_size``.
		"""
		features = data.drop(target, axis=1)
		labels = data[target]

		self._X = { 'all' : features }
		self._y = { 'all' : labels }

		# Stratify on the target so both splits keep its class proportions.
		self._X['train'], self._X['test'], self._y['train'], self._y['test'] = \
			train_test_split(features, labels, stratify=labels, test_size=test_size, random_state=random_state)

		self.prep = {
			'stand' : StandardScaler(),
			'quant' : QuantileTransformer(
				# Never ask for more quantiles than there are training rows.
				n_quantiles=min(1000, len(self._X['train'])),
				random_state=random_state,
			),
		}

		# Fit on the training split only, so the test split does not
		# influence the preprocessing.
		for prep in self.prep.values():
			prep.fit(self.X('train'))

	def X(self, split='all', prep=None):
		"""Return the features of ``split``.

		Args:
			split: Key of the split ('all', 'train' or 'test').
			prep: Key into ``self.prep``; when given, the features are returned
				after that preprocessor's ``transform``. ``None`` returns the
				stored features unchanged.
		"""
		if prep is None:
			return self._X[split]
		return self.prep[prep].transform(self._X[split])

	def y(self, split='all'):
		"""Return the target values of ``split``."""
		return self._y[split]

	def metric(self, clf, name='balanced_accuracy', prep=None, split='test'):
		"""Score the fitted estimator ``clf`` on one split.

		Args:
			clf: Fitted estimator handed to the scorer.
			name: Scorer name resolved through ``get_scorer``.
			prep: Preprocessor key applied to the features (see ``X``); it should
				match the preprocessing ``clf`` was trained with.
			split: Key of the split to evaluate on.

		Returns:
			The value returned by the scorer.
		"""
		scorer = get_scorer(name)
		return scorer(clf, self.X(split, prep), self.y(split))

# Shared dataset wrapper: 'Responded' is the target column, and the seed is
# fixed so the split is the same on every run.
clf_data = Clf_dataset(data=data_enc, target='Responded', random_state=37)

Define classifiers and their hyperparameter space

- random forest
- logistic regression


In [None]:
# Candidate classifiers. Each entry holds the estimator ('clf') and the
# hyperparameter space ('param') sampled by the randomized search below.
# Disabled candidates are kept commented out so they can be switched back on.
classifiers = {
	# 'knn' : {
	# 	'clf' : KNeighborsClassifier(),
	# 	'param' : {
	# 		'n_neighbors' : randint(3, 10),
	# 		'weights' : ['uniform', 'distance'],
	# 		'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
	# 		'leaf_size' : randint(10, 50),
	# 		'p' : [1, 2],
	# 	}
	# },
	# 'rf' : {
	# 	'clf' : RandomForestClassifier(class_weight='balanced'),
	# 	'param' : {
	# 		'n_estimators' : randint(10, 100),
	# 		'criterion' : ['gini', 'entropy'],
	# 		'bootstrap' : [True, False],
	# 		'min_samples_leaf' : randint(1, 5),
	# 		'min_samples_split' : randint(2, 10)
	# 	}
	# },
	# 'svm': {
	# 	'clf' : SVC(class_weight='balanced'),
	# 	'param' : {
	# 		'C' : loguniform(1e-2, 1e1),
	# 		'kernel' : ['linear', 'poly', 'rbf'],
	# 		'gamma' : loguniform(1e-2, 1)
	# 	}
	# },
	'nn': {
		# Seeded like the data split and the search, so a rerun gives the same result.
		'clf': MLPClassifier(random_state=37),
		'param': {
			'hidden_layer_sizes' : [
				(128,),
				(64, 64),
				(32, 32, 32),
			],
			'solver' : ['adam', 'sgd'],
			'alpha' : loguniform(1e-4, 1e-2),
			'learning_rate' : ['constant', 'invscaling'],
			'learning_rate_init' : loguniform(1e-4, 1e-2),
		}
	}
}

# Scorer names reported for every fitted search.
metrics = [
	'balanced_accuracy'
]

# For every classifier and every preprocessing variant: run a randomized
# hyperparameter search on the training split, then report each metric on the
# train and test splits. Results are stored back into the classifier's entry
# under the preprocessing key as {'opt': <search>, 'metrics': {metric: {split: score}}}.
for n, c in classifiers.items():
	for prep in clf_data.prep:
		c[prep] = {}
		d = c[prep]

		# NOTE: 100 sampled configurations per preprocessing variant, each
		# cross-validated; this is the expensive part of the cell.
		d['opt'] = RandomizedSearchCV(c['clf'], c['param'], scoring='balanced_accuracy', n_iter=100, n_jobs=-1, random_state=37)
		d['opt'].fit(clf_data.X('train', prep), clf_data.y('train'))
		d['metrics'] = {}
		for m in metrics:
			d['metrics'][m] = {}
			for split in ['train', 'test']:
				score = clf_data.metric(d['opt'], m, prep, split)
				d['metrics'][m][split] = score

				# The score is bound to a local first: reusing the same quote
				# character inside an f-string field only parses on Python 3.12+.
				print(f'{n}, prep: {prep}, metric: {m}, split: {split}: {score:.3f}')



KeyboardInterrupt: 